diff --git a/.bazelrc b/.bazelrc index e765c302c28..396b84f70b3 100644 --- a/.bazelrc +++ b/.bazelrc @@ -5,6 +5,7 @@ # Android options: # android: # android_arm: +# android_arm64: # android_x86: # android_x86_64: # @@ -46,10 +47,6 @@ # using_cuda: CUDA is available to build system. # cuda: Build with full cuda support. # rocm: Build with AMD GPU support (rocm). -# sycl: Build with SYCL support. -# sycl_nodouble: -# sycl_asan: -# sycl_trisycl: # mkl: Enable full mkl support. # tensorrt: Enable Tensorrt support. # ngraph: Enable ngraph support. @@ -89,6 +86,7 @@ # release_cpu_linux: Toolchain and CUDA options for Linux CPU builds. # release_cpu_macos: Toolchain and CUDA options for MacOS CPU builds. # release_gpu_linux: Toolchain and CUDA options for Linux GPU builds. +# release_gpu_linux_cuda_10_1: Toolchain and CUDA options for CUDA 10.1 Linux GPU builds. # release_cpu_windows: Toolchain and CUDA options for Windows CPU builds. # release_gpu_windows: Toolchain and CUDA options for Windows GPU builds. @@ -161,13 +159,11 @@ build --host_java_toolchain=//third_party/toolchains/java:tf_java_toolchain # environment variable "TF_MKL_ROOT" every time before build. build:mkl --define=build_with_mkl=true --define=enable_mkl=true build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 -build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt # config to build OneDNN backend with a user specified threadpool. build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 -build:mkl_threadpool --define=build_with_mkl_dnn_v1_only=true build:mkl_threadpool --define=build_with_mkl_opensource=true build:mkl_threadpool --define=build_with_mkldnn_threadpool=true build:mkl_threadpool -c opt @@ -175,10 +171,15 @@ build:mkl_threadpool -c opt # Config setting to build with oneDNN and without the binary blob build:mkl_opensource_only --define=build_with_mkl=true --define=enable_mkl=true build:mkl_opensource_only --define=tensorflow_mkldnn_contraction_kernel=0 -build:mkl_opensource_only --define=build_with_mkl_dnn_v1_only=true build:mkl_opensource_only --define=build_with_mkl_opensource=true build:mkl_opensource_only -c opt +# Config setting to build with oneDNN for Arm. +build:mkl_aarch64 --define=build_with_mkl_aarch64=true --define=enable_mkl=true +build:mkl_aarch64 --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_aarch64 --define=build_with_mkl_opensource=true +build:mkl_aarch64 -c opt + # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. 
build:using_cuda --define=using_cuda=true @@ -216,19 +217,6 @@ build:rocm --crosstool_top=@local_config_rocm//crosstool:toolchain build:rocm --define=using_rocm=true --define=using_rocm_hipcc=true build:rocm --action_env TF_NEED_ROCM=1 -build:sycl --crosstool_top=@local_config_sycl//crosstool:toolchain -build:sycl --define=using_sycl=true -build:sycl --action_env TF_NEED_OPENCL_SYCL=1 - -build:sycl_nodouble --config=sycl -build:sycl_nodouble --cxxopt -DTENSORFLOW_SYCL_NO_DOUBLE - -build:sycl_nodouble --config=sycl -build:sycl_asan --copt -fno-omit-frame-pointer --copt -fsanitize-coverage=3 --copt -DGPR_NO_DIRECT_SYSCALLS --linkopt -fPIC --linkopt -fsanitize=address - -build:sycl_nodouble --config=sycl -build:sycl_trisycl --define=using_trisycl=true - # Options extracted from configure script build:ngraph --define=with_ngraph_support=true build:numa --define=with_numa_support=true @@ -293,6 +281,7 @@ build:ios --noenable_platform_specific_config build:android --copt=-w build:ios --copt=-w build:linux --copt=-w +build:linux --host_copt=-w build:macos --copt=-w build:windows --copt=/w @@ -334,6 +323,11 @@ build:windows --host_copt=-DWIN32_LEAN_AND_MEAN build:windows --copt=-DNOGDI build:windows --host_copt=-DNOGDI +# MSVC (Windows): Standards-conformant preprocessor mode +# See https://docs.microsoft.com/en-us/cpp/preprocessor/preprocessor-experimental-overview +build:windows --copt=/experimental:preprocessor +build:windows --host_copt=/experimental:preprocessor + # Misc build options we need for windows. build:windows --linkopt=/DEBUG build:windows --host_linkopt=/DEBUG @@ -358,6 +352,7 @@ build --config=short_logs # TODO(gunan): Create a feature in toolchains for avx/avx2 to # avoid having to define linux/win separately. build:avx_linux --copt=-mavx +build:avx_linux --host_copt=-mavx build:avx2_linux --copt=-mavx2 build:native_arch_linux --copt=-march=native build:avx_win --copt=/arch=AVX @@ -411,9 +406,12 @@ build:rbe_linux --config=avx_linux build:rbe_linux --config=short_logs # TODO(gunan): Check why we need this specified in rbe, but not in other builds. 
build:rbe_linux --linkopt=-lrt +build:rbe_linux --host_linkopt=-lrt build:rbe_linux --linkopt=-lm +build:rbe_linux --host_linkopt=-lm build:rbe_cpu_linux --config=rbe_linux +build:rbe_cpu_linux --host_crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:toolchain" build:rbe_cpu_linux --crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:toolchain" build:rbe_cpu_linux --extra_toolchains="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:cc-toolchain-k8" build:rbe_cpu_linux --extra_execution_platforms="@ubuntu16.04-manylinux2010-py3_config_platform//:platform" @@ -431,6 +429,7 @@ test:rbe_linux_cuda_base --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/ build:rbe_linux_cuda10.1_nvcc_base --config=rbe_linux_cuda_base build:rbe_linux_cuda10.1_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda10.1_nvcc_base --host_crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" build:rbe_linux_cuda10.1_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain" build:rbe_linux_cuda10.1_nvcc_base --extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_cuda//crosstool:toolchain-linux-x86_64" build:rbe_linux_cuda10.1_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0_config_platform//:platform" @@ -447,6 +446,7 @@ build:rbe_linux_cuda10.1_nvcc_py3.8 --config=rbe_linux_cuda10.1_nvcc_base --repo build:rbe_linux_cuda11.0_nvcc_base --config=rbe_linux_cuda_base build:rbe_linux_cuda11.0_nvcc_base --define=using_cuda_nvcc=true +build:rbe_linux_cuda11.0_nvcc_base --host_crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain" build:rbe_linux_cuda11.0_nvcc_base --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain" build:rbe_linux_cuda11.0_nvcc_base --extra_toolchains="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_cuda//crosstool:toolchain-linux-x86_64" build:rbe_linux_cuda11.0_nvcc_base --extra_execution_platforms="@ubuntu18.04-gcc7_manylinux2010-cuda11.0-cudnn8-tensorrt7.1_config_platform//:platform" @@ -587,7 +587,7 @@ build:release_gpu_common --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-11.0" build:release_gpu_common --action_env=TF_CUDA_VERSION="11" build:release_gpu_common --action_env=TF_CUDNN_VERSION="8" build:release_gpu_common --action_env=TF_NEED_TENSORRT="1" -build:release_gpu_common --action_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_37,sm_52,sm_60,sm_61,compute_70" +build:release_gpu_common --action_env=TF_CUDA_COMPUTE_CAPABILITIES="sm_35,sm_50,sm_60,sm_70,sm_75,compute_80" build:release_gpu_common --action_env=TENSORRT_INSTALL_PATH="/usr/local/tensorrt" build:release_gpu_common --action_env=LD_LIBRARY_PATH="/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/tensorrt/lib" build:release_gpu_common --action_env=GCC_HOST_COMPILER_PATH="/usr/bin/gcc-5" @@ -603,3 +603,8 @@ build:release_windows_common --announce_rc build:release_cpu_windows --config=release_windows_common build:release_gpu_windows --config=release_windows_common + +build:release_gpu_linux_cuda_10_1 --config=release_gpu_linux +build:release_gpu_linux_cuda_10_1 --action_env CUDA_TOOLKIT_PATH="/usr/local/cuda-10.1" +build:release_gpu_linux_cuda_10_1 --action_env=TF_CUDA_VERSION="10" 
+build:release_gpu_linux_cuda_10_1 --action_env=TF_CUDNN_VERSION="7" diff --git a/.github/bot_config.yml b/.github/bot_config.yml index d0e7256aec0..c6dc0ec9c85 100644 --- a/.github/bot_config.yml +++ b/.github/bot_config.yml @@ -12,12 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ -# -# THIS IS A GENERATED DOCKERFILE. -# -# This file was assembled from multiple pieces, whose use is documented -# throughout. Please refer to the TensorFlow dockerfiles documentation -# for more information. # A list of assignees assignees: @@ -40,6 +34,22 @@ segfault_memory: # assignees filesystem_security_assignee: - mihaimaruseac + +tflite_micro_path: + - tensorflow/lite/micro + +tflite_micro_comment: > + Thanks for contributing to TensorFlow Lite Micro. + + + To keep this process moving along, we'd like to make sure that you have completed the items on this list: + * Read the [contributing guidelines for TensorFlow Lite Micro](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/CONTRIBUTING.md) + * Created a [TF Lite Micro Github issue](https://github.com/tensorflow/tensorflow/issues/new?labels=comp%3Amicro&template=70-tflite-micro-issue.md) + * Linked to the issue from the PR description + + + We would like to have a discussion on the Github issue first to determine the best path forward, and then proceed to the PR review. + # Cuda Comment cuda_comment: > From the template it looks like you are installing **TensorFlow** (TF) prebuilt binaries: diff --git a/tensorflow/tools/ci_build/release/windows/cpu_py35_full/release_pip_rename.sh b/.github/workflows/update-nightly.yml similarity index 64% rename from tensorflow/tools/ci_build/release/windows/cpu_py35_full/release_pip_rename.sh rename to .github/workflows/update-nightly.yml index 43982623109..01b5147d053 100644 --- a/tensorflow/tools/ci_build/release/windows/cpu_py35_full/release_pip_rename.sh +++ b/.github/workflows/update-nightly.yml @@ -1,4 +1,3 @@ -#!/bin/bash # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -12,14 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================== -set -e -set -x +# ============================================================================ -source tensorflow/tools/ci_build/release/common.sh - -# Rename to tensorflow_cpu -for f in $(ls py_test_dir/tensorflow-*cp3*-cp3*m-win_amd64.whl); do - copy_to_new_project_name "${f}" tensorflow_cpu - rm "${f}" -done +on: + workflow_dispatch: # Allow manual triggers + schedule: + - cron: 0 4 * * * # 4am UTC is 9pm PDT and 8pm PST +name: Set nightly branch to master HEAD +jobs: + master-to-nightly: + runs-on: ubuntu-latest + steps: + - uses: zofrex/mirror-branch@v1 + name: Set nightly branch to master HEAD + with: + target-branch: 'nightly' diff --git a/ADOPTERS.md b/ADOPTERS.md deleted file mode 100644 index c0be567dc14..00000000000 --- a/ADOPTERS.md +++ /dev/null @@ -1,10 +0,0 @@ -# TensorFlow Adopters - -This page contains a list of people and organizations who are using TensorFlow. If you'd like to be included -here, please send a pull request which modifies this file. 
- -We intend to use this list to contact you for surveys, and to find good candidates for invite-only events. -We will also point to this list if we are asked who uses TensorFlow. - -We will not use any of the information here for promotions or to send other regular communications. You -should subscribe to discuss@tensorflow.org for such announcements. diff --git a/CODEOWNERS b/CODEOWNERS index 3ef02ffd68c..9de1922a262 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,16 +1,15 @@ # Where component owners are known, add them here. -/tensorflow/c/eager @jaingaurav @alextp -/tensorflow/core/common_runtime/eager @jaingaurav @alextp +/tensorflow/c/eager @qqfish @kkimdev +/tensorflow/core/common_runtime/eager @qqfish @kkimdev /tenosrflow/core/debug @caisq /tensorflow/core/nccl/ @azaks2 @chsigg -/tensorflow/core/platform/windows/ @mrry +/tensorflow/core/platform/windows/ @mihaimaruseac /tensorflow/lite/experimental/micro @petewarden @advaitjain /tensorflow/python/autograph/ @mdanatg @kkimdev /tensorflow/python/debug @caisq -/tensorflow/python/eager @jaingaurav @alextp +/tensorflow/python/eager @rohan100jain @kkimdev /tensorflow/python/tools/api/generator/ @annarev -/tensorflow/tensorboard/ @jart /tensorflow/tools/docs/ @markdaoust /third_party/systemlibs/ @perfinion diff --git a/README.md b/README.md index 6398e8e27a1..63d85ce2df4 100644 --- a/README.md +++ b/README.md @@ -103,23 +103,22 @@ open-source software development: ### Official Builds -Build Type | Status | Artifacts ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- -**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) -**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA -**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) -**Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) -**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) -**Raspberry Pi 0 and 1** | 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) -**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) -**Libtensorflow MacOS CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) -**Libtensorflow Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) -**Libtensorflow Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) -**Libtensorflow Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) -**Libtensorflow Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.html) | [GCS](https://storage.googleapis.com/libtensorflow-nightly) - +Build Type | Status | Artifacts +----------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------- +**Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-py3.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) +**Linux XLA** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-xla.html) | TBA +**macOS** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) +**Windows GPU** | 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) +**Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) +**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) +**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) +**Libtensorflow MacOS CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-mac-cpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/) +**Libtensorflow Linux CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-cpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/) +**Libtensorflow Linux GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-linux-gpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/) +**Libtensorflow Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-cpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/) +**Libtensorflow Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/libtensorflow-win-gpu.html) | [Nightly GCS](https://storage.googleapis.com/libtensorflow-nightly) [Official GCS](https://storage.googleapis.com/tensorflow/) ### Community Supported Builds @@ -145,19 +144,20 @@ Build Type * [TensorFlow Tutorials](https://www.tensorflow.org/tutorials/) * [TensorFlow Official Models](https://github.com/tensorflow/models/tree/master/official) * [TensorFlow Examples](https://github.com/tensorflow/examples) -* [TensorFlow in Practice from Coursera](https://www.coursera.org/specializations/tensorflow-in-practice) +* [DeepLearning.AI TensorFlow Developer Professional Certificate](https://www.coursera.org/specializations/tensorflow-in-practice) * [TensorFlow: Data and Deployment from 
Coursera](https://www.coursera.org/specializations/tensorflow-data-and-deployment) * [Getting Started with TensorFlow 2 from Coursera](https://www.coursera.org/learn/getting-started-with-tensor-flow2) * [Intro to TensorFlow for Deep Learning from Udacity](https://www.udacity.com/course/intro-to-tensorflow-for-deep-learning--ud187) * [Introduction to TensorFlow Lite from Udacity](https://www.udacity.com/course/intro-to-tensorflow-lite--ud190) * [Machine Learning with TensorFlow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp) +* [TensorFlow Codelabs](https://codelabs.developers.google.com/?cat=TensorFlow) * [TensorFlow Chat Room on StackOverflow (not actively monitored by the TensorFlow team)](https://chat.stackoverflow.com/rooms/216694/tensorflow) * [TensorFlow Blog](https://blog.tensorflow.org) * [Learn ML with TensorFlow](https://www.tensorflow.org/resources/learn-ml) * [TensorFlow Twitter](https://twitter.com/tensorflow) * [TensorFlow YouTube](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ) -* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap) +* [TensorFlow Roadmap](https://www.tensorflow.org/model_optimization/guide/roadmap) * [TensorFlow White Papers](https://www.tensorflow.org/about/bib) * [TensorBoard Visualization Toolkit](https://github.com/tensorflow/tensorboard) diff --git a/RELEASE.md b/RELEASE.md index 7057657c340..5aac986a135 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -34,9 +34,33 @@ shape assumptions (note that you can pass shapes with `None` entries for axes that are meant to be dynamic). You can also disable the input checking entirely by setting `model.input_spec = None`. +* TF pip packages now use CUDA11 and cuDNN 8.0.2. * XLA:CPU and XLA:GPU devices are no longer registered by default. Use `TF_XLA_FLAGS=--tf_xla_enable_xla_devices` if you really need them (to be removed). +* `tf.raw_ops.Max` and `tf.raw_ops.Min` no longer accept inputs of type + `tf.complex64` or `tf.complex128`, because the behavior of these ops is not + well defined for complex types. +* `tf.data.experimental.service.DispatchServer` now takes a config tuple + instead of individual arguments. Usages should be updated to + `tf.data.experimental.service.DispatchServer(dispatcher_config)`. +* `tf.data.experimental.service.WorkerServer` now takes a config tuple + instead of individual arguments. Usages should be updated to + `tf.data.experimental.service.WorkerServer(worker_config)`. +* `tf.quantization.quantize_and_dequantize_v2` has been introduced, which + updates the gradient definition for quantization which is outside the range + to be 0. To simulate the V1 the behavior of + tf.quantization.quantize_and_dequantize(...) use + tf.grad_pass_through(tf.quantization.quantize_and_dequantize_v2)(...). +* `tf.distribute.Strategy.experimental_make_numpy_dataset` is removed. Please + use `tf.data.Dataset.from_tensor_slices` instead. +* `experimental_hints` in `tf.distribute.StrategyExtended.reduce_to`, + `tf.distribute.StrategyExtended.batch_reduce_to`, + `tf.distribute.ReplicaContext.all_reduce` are renamed to `options`. + `tf.distribute.experimental.CollectiveHints` is renamed + `tf.distribute.experimental.CommunicationOptions`. + `tf.distribute.experimental.CollectiveCommunication` is renamed + `tf.distribute.experimental.CommunicationImplementation`. ## Known Caveats @@ -46,89 +70,180 @@ * * -* A new module named `tf.experimental.numpy` is added, which is a NumPy-compatible API for writing TF programs. 
This module provides class `ndarray`, which mimics the `ndarray` class in NumPy, and wraps an immutable `tf.Tensor` under the hood. A subset of NumPy functions (e.g. `numpy.add`) are provided. Their inter-operation with TF facilities is seamless in most cases. See tensorflow/python/ops/numpy_ops/README.md for details of what are supported and what are the differences with NumPy. +* A new module named `tf.experimental.numpy` is added, which is a NumPy-compatible API for writing TF programs. This module provides class `ndarray`, which mimics the `ndarray` class in NumPy, and wraps an immutable `tf.Tensor` under the hood. A subset of NumPy functions (e.g. `numpy.add`) are provided. Their inter-operation with TF facilities is seamless in most cases. See [tensorflow/python/ops/numpy_ops/README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/numpy_ops/README.md) for details of what operations are supported and what are the differences from NumPy. * A major refactoring of the internals of the Keras Functional API has been completed, that should improve the reliability, stability, and performance of constructing Functional models. +* `tf.distribute`: + * Deprecated `experimental_distribute_datasets_from_function` method and renamed it to `distribute_datasets_from_function` as it is no longer experimental. + ## Bug Fixes and Other Changes -* -* -* -* TF Core: - * `tf.types.experimental.TensorLike` is a new `Union` type that can be used as - type annotation for variables representing a Tensor or a value that can be - converted to Tensor by `tf.convert_to_tensor`. - * Calling ops with a python constants or numpy values is now consistent with - tf.convert_to_tensor behavior. This avoids operations like tf.reshape - truncating inputs such as from int64 to int32. - * Added `tf.sparse.map_values` to apply a function to the `.value`s of `SparseTensror` arguments. - * The Python bitwise operators for `Tensor` (`__and__`, `__or__`, `__xor__` - and `__invert__` now support non-`bool` arguments and apply the - corresponding bitwise ops. `bool` arguments continue to be supported and - dispatch to logical ops. This brings them more in line with Python and NumPy - benavior. - * Added `tf.SparseTensor.with_values`. This returns a new SparseTensor with - the same sparsity pattern, but with new provided values. It is similar to - the `with_values` function of `RaggedTensor`. - * Added `StatelessCase` op, and uses it if none of case branches has stateful ops. -* `tf.data`: - * Added new `tf.data.experimental.service.register_dataset` and - `tf.data.experimental.service.from_dataset_id` APIs to enable one process - to register a dataset with the tf.data service, and another process to - consume data from the dataset. - * Added support for tf.data service dispatcher fault tolerance. To enable - fault tolerance, configure a `work_dir` when running your dispatcher - server and set `dispatcher_fault_tolerance=True`. The dispatcher will - store its state to `work_dir`, so that on restart it can continue from its - previous state after restart. - * Added tf.data service support for sharing dataset graphs via shared - filesystem instead of over RPC. This reduces load on the dispatcher, - improving performance of distributing datasets. For this to work, the - dispatcher's `work_dir` must be accessible from workers. If the worker - fails to read from the `work_dir`, it falls back to using RPC for dataset - graph transfer. - * Added optional `exclude_cols` parameter to CsvDataset. 
This parameter is - the complement of `select_cols`; at most one of these should be specified. - * We have implemented an optimization which reorders data-discarding - transformations such as `take` and `shard` to happen earlier in the - dataset when it is safe to do so. The optimization can be disabled via - the `experimental_optimization.reorder_data_discarding_ops` dataset - option. - * `tf.data.Options` were previously immutable and can now be overriden. -* `tf.image`: - * Added deterministic `tf.image.stateless_random_*` functions for each - `tf.image.random_*` function. Added a new op - `stateless_sample_distorted_bounding_box` which is a determinstic - version of `sample_distorted_bounding_box` op. Given the same seed, these - stateless functions/ops produce the same results independent of how many - times the function is called, and independent of global seed settings. +* +* +* +* Security: + * Fixes an undefined behavior causing a segfault in `tf.raw_ops.Switch` + ([CVE-2020-15190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15190)) + * Fixes three vulnerabilities in conversion to DLPack format + ([CVE-2020-15191](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15191), + [CVE-2020-15192](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15192), + [CVE-2020-15193](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15193)) + * Fixes two vulnerabilities in `SparseFillEmptyRowsGrad` + ([CVE-2020-15194](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15194), + [CVE-2020-15195](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15195)) + * Fixes several vulnerabilities in `RaggedCountSparseOutput` and + `SparseCountSparseOutput` operations + ([CVE-2020-15196](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15196), + [CVE-2020-15197](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15197), + [CVE-2020-15198](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15198), + [CVE-2020-15199](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15199), + [CVE-2020-15200](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15200), + [CVE-2020-15201](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15201)) + * Fixes an integer truncation vulnerability in code using the work sharder + API + ([CVE-2020-15202](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15202)) + * Fixes a format string vulnerability in `tf.strings.as_string` + ([CVE-2020-15203](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15203)) + * Fixes segfault raised by calling session-only ops in eager mode + ([CVE-2020-15204](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15204)) + * Fixes data leak and potential ASLR violation from + `tf.raw_ops.StringNGrams` + ([CVE-2020-15205](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15205)) + * Fixes segfaults caused by incomplete `SavedModel` validation + ([CVE-2020-15206](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15206)) + * Fixes a data corruption due to a bug in negative indexing support in + TFLite + ([CVE-2020-15207](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15207)) + * Fixes a data corruption due to dimension mismatch in TFLite + ([CVE-2020-15208](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15208)) + * Fixes several vulnerabilities in TFLite saved model format + ([CVE-2020-15209](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15209), + [CVE-2020-15210](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15210), + 
[CVE-2020-15211](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15211)) + * Fixes several vulnerabilities in TFLite implementation of segment sum + ([CVE-2020-15212](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15212), + [CVE-2020-15213](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15213), + [CVE-2020-15214](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15214)) +* TF Core: + * `tf.types.experimental.TensorLike` is a new `Union` type that can be + used as type annotation for variables representing a Tensor or a value + that can be converted to Tensor by `tf.convert_to_tensor`. + * Calling ops with a python constants or numpy values is now consistent + with tf.convert_to_tensor behavior. This avoids operations like + tf.reshape truncating inputs such as from int64 to int32. + * Added `tf.sparse.map_values` to apply a function to the `.value`s of + `SparseTensor` arguments. + * The Python bitwise operators for `Tensor` (`__and__`, `__or__`, + `__xor__` and `__invert__` now support non-`bool` arguments and apply + the corresponding bitwise ops. `bool` arguments continue to be supported + and dispatch to logical ops. This brings them more in line with Python + and NumPy behavior. + * Added `tf.SparseTensor.with_values`. This returns a new SparseTensor + with the same sparsity pattern, but with new provided values. It is + similar to the `with_values` function of `RaggedTensor`. + * Added `StatelessCase` op, and uses it if none of case branches has + stateful ops. + * Added `tf.config.experimental.get_memory_usage` to return total memory + usage of the device. + * Added gradients for `RaggedTensorToVariant` and `RaggedTensorFromVariant`. +* `tf.data`: + * tf.data service: + * Added new `tf.data.experimental.service.register_dataset` and + `tf.data.experimental.service.from_dataset_id` APIs to enable one + process to register a dataset with the tf.data service, and another + process to consume data from the dataset. + * Added support for dispatcher fault tolerance. To enable fault tolerance, + configure a `work_dir` when running your dispatcher server and set + `dispatcher_fault_tolerance=True`. The dispatcher will store its state + to `work_dir`, so that on restart it can continue from its previous + state after restart. + * Added support for sharing dataset graphs via shared filesystem instead + of over RPC. This reduces load on the dispatcher, improving performance + of distributing datasets. For this to work, the dispatcher's `work_dir` + must be accessible from workers. If the worker fails to read from the + `work_dir`, it falls back to using RPC for dataset graph transfer. + * Added support for a new "distributed_epoch" processing mode. This + processing mode distributes a dataset across all tf.data workers, + instead of having each worker process the full dataset. See + [the tf.data service docs](https://www.tensorflow.org/api_docs/python/tf/data/experimental/service#understand_processing_mode) + to learn more. + * Added optional `exclude_cols` parameter to CsvDataset. This parameter is + the complement of `select_cols`; at most one of these should be + specified. + * We have implemented an optimization which reorders data-discarding + transformations such as `take` and `shard` to happen earlier in the + dataset when it is safe to do so. The optimization can be disabled via + the `experimental_optimization.reorder_data_discarding_ops` dataset + option. + * `tf.data.Options` were previously immutable and can now be overridden. 
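As an illustration of the `tf.data.Options` item above, a minimal sketch that overrides the `reorder_data_discarding_ops` optimization option named in these notes (assumes a nightly build in which that option is exposed):

```python
import tensorflow as tf

# Minimal sketch: override tf.data.Options to disable the data-discarding
# reordering optimization described in the release notes above.
dataset = tf.data.Dataset.range(100).map(lambda x: x * 2).take(10)

options = tf.data.Options()
options.experimental_optimization.reorder_data_discarding_ops = False
dataset = dataset.with_options(options)

for element in dataset.take(3):
    print(element.numpy())  # 0, 2, 4
```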
+ * `tf.data.Dataset.from_generator` now supports Ragged and Sparse tensors + with a new `output_signature` argument, which allows `from_generator` to + produce any type describable by a `tf.TypeSpec`. + * `tf.data.experimental.AUTOTUNE` is now available in the core API as + `tf.data.AUTOTUNE`. +* `tf.image`: + * Added deterministic `tf.image.stateless_random_*` functions for each + `tf.image.random_*` function. Added a new op + `stateless_sample_distorted_bounding_box` which is a deterministic + version of `sample_distorted_bounding_box` op. Given the same seed, + these stateless functions/ops produce the same results independent of + how many times the function is called, and independent of global seed + settings. * `tf.distribute`: - * -* `tf.keras`: - * Improvements from the functional API refactoring: - * Functional model construction does not need to maintain a global workspace graph, removing memory leaks especially when building many models or very large models. - * Functional model construction should be ~8-10% faster on average. - * Functional models can now contain non-symbolic values in their call inputs inside of the first positional argument. - * Several classes of TF ops that were not reliably converted to Keras layers during functional API construction should now work, e.g. `tf.image.ssim_multiscale` - * Error messages when Functional API construction goes wrong (and when ops cannot be converted to Keras layers automatically) should be clearer and easier to understand. - * `Optimizer.minimize` can now accept a loss `Tensor` and a `GradientTape` - as an alternative to accepting a `callable` loss. - * Added `beta` hyperparameter to FTRL optimizer classes (Keras and others) - to match FTRL paper (https://research.google.com/pubs/archive/41159.pdf). - * Added `mobilenet_v3` to keras application model. - * `Optimizer.__init__` now accepts a `gradient_aggregator` to allow for - customization of how gradients are aggregated across devices, as well as - `gradients_transformers` to allow for custom gradient transformations - (such as gradient clipping). -* `tf.function` / AutoGraph: - * Added `experimental_follow_type_hints` argument for `tf.function`. When - True, the function may use type annotations to optimize the tracing - performance. - * Added support for `iter(DistributedDataset)` in AutoGraph `for` loops. - * AutoGraph now allows creating new symbols inside a TensorFLow loop, if - the values of these symbols at an iteration does not depend on the previous - iteration. These types of loops must run at least one iteration, and will - raise a runtime error otherwise. + * +* `tf.keras`: + * Improvements from the functional API refactoring: + * Functional model construction does not need to maintain a global + workspace graph, removing memory leaks especially when building many + models or very large models. + * Functional model construction should be ~8-10% faster on average. + * Functional models can now contain non-symbolic values in their call + inputs inside of the first positional argument. + * Several classes of TF ops that were not reliably converted to Keras + layers during functional API construction should now work, e.g. + `tf.image.ssim_multiscale` + * Error messages when Functional API construction goes wrong (and when + ops cannot be converted to Keras layers automatically) should be + clearer and easier to understand. + * `Optimizer.minimize` can now accept a loss `Tensor` and a `GradientTape` + as an alternative to accepting a `callable` loss. 
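As an illustration of the `Optimizer.minimize` item above, a minimal sketch that passes a loss `Tensor` together with a `GradientTape` instead of a callable loss (the toy variable and learning rate are placeholders):

```python
import tensorflow as tf

# Minimal sketch: minimize a loss Tensor using an explicit GradientTape.
var = tf.Variable(2.0)
opt = tf.keras.optimizers.SGD(learning_rate=0.1)

with tf.GradientTape() as tape:
    loss = (var - 5.0) ** 2  # ordinary Tensor loss computed under the tape

# The tape supplies the gradients; no callable loss is needed.
opt.minimize(loss, var_list=[var], tape=tape)
print(var.numpy())  # nudged from 2.0 toward 5.0
```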
+ * Added `beta` hyperparameter to FTRL optimizer classes (Keras and others) + to match FTRL paper + (https://research.google.com/pubs/archive/41159.pdf). + * Added `mobilenet_v3` to keras application model. + * `Optimizer.__init__` now accepts a `gradient_aggregator` to allow for + customization of how gradients are aggregated across devices, as well as + `gradients_transformers` to allow for custom gradient transformations + (such as gradient clipping). + * The `steps_per_execution` argument in `compile()` is no longer + experimental; if you were passing `experimental_steps_per_execution`, + rename it to `steps_per_execution` in your code. This argument controls + the number of batches to run during each `tf.function` call when calling + `fit()`. Running multiple batches inside a single `tf.function` call can + greatly improve performance on TPUs or small models with a large Python + overhead. + * Improvements to Keras preprocessing layers: + * TextVectorization can now accept a vocabulary list or file as an + init arg. + * Normalization can now accept mean and variance values as init args. + * In `Attention` and `AdditiveAttention` layers, the `call()` method now + accepts a `return_attention_scores` argument. When set to + True, the layer returns the attention scores as an additional output + argument. + * Added `tf.metrics.log_cosh` and `tf.metrics.logcosh` API entrypoints + with the same implementation as their `tf.losses` equivalent. + * For Keras model, the individual call of `Model.evaluate` uses no cached + data for evaluation, while `Model.fit` uses cached data when + `validation_data` arg is provided for better performance. +* `tf.function` / AutoGraph: + * Added `experimental_follow_type_hints` argument for `tf.function`. When + True, the function may use type annotations to optimize the tracing + performance. + * Added support for `iter(DistributedDataset)` in AutoGraph `for` loops. + * AutoGraph now allows creating new symbols inside a TensorFLow loop, if + the values of these symbols at an iteration does not depend on the + previous iteration. These types of loops must run at least one + iteration, and will raise a runtime error otherwise. Example: @@ -137,45 +252,103 @@ outputs = train_step(batch) tf.print('final outputs', outputs) ``` + See tensorflow/python/autograph/g3doc/reference/limitations.md for more info. + * `tf.lite`: - * `DynamicBuffer::AddJoinedString()` will now add a separator if the first - string to be joined is empty. - * `TFLiteConverter`: - * Support optional flags `inference_input_type` and `inference_output_type` for full integer quantized models. This allows users to modify the model input and output type to integer types (`tf.int8`, `tf.uint8`) instead of defaulting to float type (`tf.float32`). - * Deprecate `Interpreter::UseNNAPI(bool)` C++ API - * Prefer using `NnApiDelegate()` and related delegate configuration methods directly. - * Add NNAPI Delegation support for requantization use cases by converting the operation into a dequantize-quantize pair. - * + + * `TFLiteConverter`: + * Support optional flags `inference_input_type` and + `inference_output_type` for full integer quantized models. This + allows users to modify the model input and output type to integer + types (`tf.int8`, `tf.uint8`) instead of defaulting to float type + (`tf.float32`). + * TFLite Profiler for Android is available. See the detailed + [guide](https://www.tensorflow.org/lite/performance/measurement#trace_tensorflow_lite_internals_in_android). 
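As an illustration of the `TFLiteConverter` item above, a minimal sketch of a full-integer quantized conversion using the optional `inference_input_type` and `inference_output_type` flags (the toy model and representative dataset are placeholders):

```python
import tensorflow as tf

# Minimal sketch: full-integer quantization with integer model input/output types.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(10),
])

def representative_dataset():
    # Placeholder calibration data for the quantizer.
    for _ in range(100):
        yield [tf.random.normal([1, 28, 28])]

converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8   # new optional flag (default: tf.float32)
converter.inference_output_type = tf.int8  # new optional flag (default: tf.float32)
tflite_model = converter.convert()
```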
+ * NNAPI + * Added NNAPI Delegation support for requantization use cases by + converting the operation into a dequantize-quantize pair. + * Removed deprecated `Interpreter.setUseNNAPI(boolean)` Java API. + * Use `Interpreter.Options.setUseNNAPI` instead. + * Deprecate `Interpreter::UseNNAPI(bool)` C++ API. + * Use `NnApiDelegate()` and related delegate configuration methods + directly. + * Deprecate `Interpreter::SetAllowFp16PrecisionForFp32(bool)` C++ API + * Prefer controlling this via delegate options, e.g. + `tflite::StatefulNnApiDelegate::Options::allow_fp16' or + `TfLiteGpuDelegateOptionsV2::is_precision_loss_allowed`. + * `DynamicBuffer::AddJoinedString()` will now add a separator if the first + string to be joined is empty. + * + * `tf.random`: - * + + * + * Math and Linear Algebra: - * + + * Add `tf.math.erfcinv`, the inverse to `tf.math.erfc`. + * TPU Enhancements: - * Added support for the `beta` parameter of the FTRL optimizer for TPU - embeddings. Users of other TensorFlow platforms can implement equivalent - behavior by adjusting the `l2` parameter. - * + + * Added support for the `beta` parameter of the FTRL optimizer for TPU + embeddings. Users of other TensorFlow platforms can implement equivalent + behavior by adjusting the `l2` parameter. + * + * XLA Support: - * xla.experimental.compile is deprecated, use - `tf.function(experimental_compile=True)` instead - * + + * xla.experimental.compile is deprecated, use + `tf.function(experimental_compile=True)` instead + * Added `tf.function.experimental_get_compiler_ir` which returns compiler + IR (currently 'hlo' and 'optimized_hlo') for given input for given + function. + * + * Tracing and Debugging: - * + + * + * `tf.train.Checkpoint`: - * Now accepts a `root` argument in the initialization, which generates a - checkpoint with a root object. This allows users to create a `Checkpoint` - object that is compatible with Keras `model.save_weights()` and - `model.load_weights`. The checkpoint is also compatible with the - checkpoint saved in the `variables/` folder in the SavedModel. - * When restoring, `save_path` can be a path to a SavedModel. The function - will automatically find the checkpoint in the SavedModel. + + * Now accepts a `root` argument in the initialization, which generates a + checkpoint with a root object. This allows users to create a + `Checkpoint` object that is compatible with Keras `model.save_weights()` + and `model.load_weights`. The checkpoint is also compatible with the + checkpoint saved in the `variables/` folder in the SavedModel. + * When restoring, `save_path` can be a path to a SavedModel. The function + will automatically find the checkpoint in the SavedModel. + +* `tf.nn`: + + * `tf.nn.max_pool2d` now supports explicit padding. + +* `tf.debugging`: + + * `tf.debugging.assert_shapes()` now works on `SparseTensor`s (#36268). + +* `tf.print`: + + * Bug fix in `tf.print()` with `OrderedDict` where if an `OrderedDict` + didn't have the keys sorted, the keys and values were not being printed + in accordance with their correct mapping. + +* `TensorRT` + + * We now issue a warning when the `session_config` parameter for the TF1 + converter is used or the `rewrite_config_template` field in the TF2 + converter parameter object is used. + * Other: - * We have replaced uses of "whitelist" and "blacklist" with "allowlist" - and "denylist" where possible. Please see - https://developers.google.com/style/word-list#blacklist for more context. 
- * + + * We have replaced uses of "whitelist" and "blacklist" with "allowlist" + and "denylist" where possible. Please see + https://developers.google.com/style/word-list#blacklist for more + context. + * Add `tf.config.experimental.mlir_bridge_rollout` which will help us + rollout the new MLIR TPU bridge. + * ## Thanks to our Contributors @@ -183,45 +356,327 @@ This release contains contributions from many people at Google, as well as: stjohnso98, , , , , + +# Release 2.3.1 + +## Bug Fixes and Other Changes +* Fixes an undefined behavior causing a segfault in `tf.raw_ops.Switch` + ([CVE-2020-15190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15190)) +* Fixes three vulnerabilities in conversion to DLPack format + ([CVE-2020-15191](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15191), + [CVE-2020-15192](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15192), + [CVE-2020-15193](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15193)) +* Fixes two vulnerabilities in `SparseFillEmptyRowsGrad` + ([CVE-2020-15194](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15194), + [CVE-2020-15195](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15195)) +* Fixes several vulnerabilities in `RaggedCountSparseOutput` and + `SparseCountSparseOutput` operations + ([CVE-2020-15196](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15196), + [CVE-2020-15197](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15197), + [CVE-2020-15198](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15198), + [CVE-2020-15199](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15199), + [CVE-2020-15200](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15200), + [CVE-2020-15201](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15201)) +* Fixes an integer truncation vulnerability in code using the work sharder API + ([CVE-2020-15202](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15202)) +* Fixes a format string vulnerability in `tf.strings.as_string` + ([CVE-2020-15203](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15203)) +* Fixes segfault raised by calling session-only ops in eager mode + ([CVE-2020-15204](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15204)) +* Fixes data leak and potential ASLR violation from `tf.raw_ops.StringNGrams` + ([CVE-2020-15205](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15205)) +* Fixes segfaults caused by incomplete `SavedModel` validation + ([CVE-2020-15206](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15206)) +* Fixes a data corruption due to a bug in negative indexing support in TFLite + ([CVE-2020-15207](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15207)) +* Fixes a data corruption due to dimension mismatch in TFLite + ([CVE-2020-15208](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15208)) +* Fixes several vulnerabilities in TFLite saved model format + ([CVE-2020-15209](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15209), + [CVE-2020-15210](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15210), + [CVE-2020-15211](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15211)) +* Fixes several vulnerabilities in TFLite implementation of segment sum + ([CVE-2020-15212](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15212), + [CVE-2020-15213](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15213), + [CVE-2020-15214](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15214)) +* Updates `sqlite3` 
to `3.33.00` to handle + [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358). +* Fixes deprecated usage of `collections` API +* Removes `scipy` dependency from `setup.py` since TensorFlow does not need it + to install the pip package + + +# Release 2.2.1 + +## Bug Fixes and Other Changes +* Fixes an undefined behavior causing a segfault in `tf.raw_ops.Switch` + ([CVE-2020-15190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15190)) +* Fixes three vulnerabilities in conversion to DLPack format + ([CVE-2020-15191](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15191), + [CVE-2020-15192](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15192), + [CVE-2020-15193](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15193)) +* Fixes two vulnerabilities in `SparseFillEmptyRowsGrad` + ([CVE-2020-15194](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15194), + [CVE-2020-15195](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15195)) +* Fixes an integer truncation vulnerability in code using the work sharder API + ([CVE-2020-15202](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15202)) +* Fixes a format string vulnerability in `tf.strings.as_string` + ([CVE-2020-15203](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15203)) +* Fixes segfault raised by calling session-only ops in eager mode + ([CVE-2020-15204](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15204)) +* Fixes data leak and potential ASLR violation from `tf.raw_ops.StringNGrams` + ([CVE-2020-15205](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15205)) +* Fixes segfaults caused by incomplete `SavedModel` validation + ([CVE-2020-15206](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15206)) +* Fixes a data corruption due to a bug in negative indexing support in TFLite + ([CVE-2020-15207](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15207)) +* Fixes a data corruption due to dimension mismatch in TFLite + ([CVE-2020-15208](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15208)) +* Fixes several vulnerabilities in TFLite saved model format + ([CVE-2020-15209](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15209), + [CVE-2020-15210](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15210), + [CVE-2020-15211](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15211)) +* Fixes several vulnerabilities in TFLite implementation of segment sum + ([CVE-2020-15212](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15212), + [CVE-2020-15213](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15213), + [CVE-2020-15214](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15214)) +* Updates `sqlite3` to `3.33.00` to handle + [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), + [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), + [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), + [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), + [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), + [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), + [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), + [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), + and + [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358). 
+* Fixes deprecated usage of `collections` API +* Removes `scipy` dependency from `setup.py` since TensorFlow does not need it + to install the pip package + + +# Release 2.1.2 + +## Bug Fixes and Other Changes +* Fixes an undefined behavior causing a segfault in `tf.raw_ops.Switch` + ([CVE-2020-15190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15190)) +* Fixes three vulnerabilities in conversion to DLPack format + ([CVE-2020-15191](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15191), + [CVE-2020-15192](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15192), + [CVE-2020-15193](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15193)) +* Fixes two vulnerabilities in `SparseFillEmptyRowsGrad` + ([CVE-2020-15194](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15194), + [CVE-2020-15195](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15195)) +* Fixes an integer truncation vulnerability in code using the work sharder API + ([CVE-2020-15202](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15202)) +* Fixes a format string vulnerability in `tf.strings.as_string` + ([CVE-2020-15203](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15203)) +* Fixes segfault raised by calling session-only ops in eager mode + ([CVE-2020-15204](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15204)) +* Fixes data leak and potential ASLR violation from `tf.raw_ops.StringNGrams` + ([CVE-2020-15205](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15205)) +* Fixes segfaults caused by incomplete `SavedModel` validation + ([CVE-2020-15206](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15206)) +* Fixes a data corruption due to a bug in negative indexing support in TFLite + ([CVE-2020-15207](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15207)) +* Fixes a data corruption due to dimension mismatch in TFLite + ([CVE-2020-15208](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15208)) +* Fixes several vulnerabilities in TFLite saved model format + ([CVE-2020-15209](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15209), + [CVE-2020-15210](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15210), + [CVE-2020-15211](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15211)) +* Updates `sqlite3` to `3.33.00` to handle + [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), + [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), + [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), + [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), + [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), + [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), + [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), + [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), + and + [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358). 
+* Removes `scipy` dependency from `setup.py` since TensorFlow does not need it + to install the pip package +* Switches ROCM builds to use ROCM 3.7 + + +# Release 2.0.3 + +## Bug Fixes and Other Changes +* Fixes an undefined behavior causing a segfault in `tf.raw_ops.Switch` + ([CVE-2020-15190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15190)) +* Fixes three vulnerabilities in conversion to DLPack format + ([CVE-2020-15191](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15191), + [CVE-2020-15192](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15192), + [CVE-2020-15193](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15193)) +* Fixes two vulnerabilities in `SparseFillEmptyRowsGrad` + ([CVE-2020-15194](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15194), + [CVE-2020-15195](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15195)) +* Fixes an integer truncation vulnerability in code using the work sharder API + ([CVE-2020-15202](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15202)) +* Fixes a format string vulnerability in `tf.strings.as_string` + ([CVE-2020-15203](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15203)) +* Fixes segfault raised by calling session-only ops in eager mode + ([CVE-2020-15204](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15204)) +* Fixes data leak and potential ASLR violation from `tf.raw_ops.StringNGrams` + ([CVE-2020-15205](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15205)) +* Fixes segfaults caused by incomplete `SavedModel` validation + ([CVE-2020-15206](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15206)) +* Fixes a data corruption due to a bug in negative indexing support in TFLite + ([CVE-2020-15207](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15207)) +* Fixes a data corruption due to dimension mismatch in TFLite + ([CVE-2020-15208](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15208)) +* Fixes several vulnerabilities in TFLite saved model format + ([CVE-2020-15209](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15209), + [CVE-2020-15210](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15210), + [CVE-2020-15211](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15211)) +* Updates `sqlite3` to `3.33.00` to handle + [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), + [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), + [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), + [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), + [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), + [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), + [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), + [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), + and + [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358). +* Pins `numpy` to 1.18.5 to prevent ABI breakage when compiling code that uses + both NumPy and TensorFlow headers. 
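For projects that compile custom ops against both sets of headers, a minimal sketch of checking the pin before building (the assertion and printouts are illustrative; only the `1.18.5` pin comes from the note above):

```python
# Minimal sketch: verify the NumPy pin before compiling code that includes
# both NumPy and TensorFlow headers (other versions risk ABI mismatches
# with these patch releases).
import numpy as np
import tensorflow as tf

assert np.__version__ == "1.18.5", f"expected NumPy 1.18.5, got {np.__version__}"

# Include paths and link flags a custom-op build would pass to the compiler.
print("TensorFlow headers:", tf.sysconfig.get_include())
print("NumPy headers:", np.get_include())
print("TF link flags:", tf.sysconfig.get_link_flags())
```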
+ + +# Release 1.15.4 + +## Bug Fixes and Other Changes +* Fixes an undefined behavior causing a segfault in `tf.raw_ops.Switch` + ([CVE-2020-15190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15190)) +* Fixes three vulnerabilities in conversion to DLPack format + ([CVE-2020-15191](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15191), + [CVE-2020-15192](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15192), + [CVE-2020-15193](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15193)) +* Fixes two vulnerabilities in `SparseFillEmptyRowsGrad` + ([CVE-2020-15194](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15194), + [CVE-2020-15195](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15195)) +* Fixes an integer truncation vulnerability in code using the work sharder API + ([CVE-2020-15202](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15202)) +* Fixes a format string vulnerability in `tf.strings.as_string` + ([CVE-2020-15203](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15203)) +* Fixes segfault raised by calling session-only ops in eager mode + ([CVE-2020-15204](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15204)) +* Fixes data leak and potential ASLR violation from `tf.raw_ops.StringNGrams` + ([CVE-2020-15205](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15205)) +* Fixes segfaults caused by incomplete `SavedModel` validation + ([CVE-2020-15206](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15206)) +* Fixes a data corruption due to a bug in negative indexing support in TFLite + ([CVE-2020-15207](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15207)) +* Fixes a data corruption due to dimension mismatch in TFLite + ([CVE-2020-15208](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15208)) +* Fixes several vulnerabilities in TFLite saved model format + ([CVE-2020-15209](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15209), + [CVE-2020-15210](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15210), + [CVE-2020-15211](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15211)) +* Updates `sqlite3` to `3.33.00` to handle + [CVE-2020-9327](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-9327), + [CVE-2020-11655](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11655), + [CVE-2020-11656](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-11656), + [CVE-2020-13434](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13434), + [CVE-2020-13435](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13435), + [CVE-2020-13630](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13630), + [CVE-2020-13631](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13631), + [CVE-2020-13871](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-13871), + and + [CVE-2020-15358](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2020-15358). +* Fixes #41630 by including `max_seq_length` in CuDNN descriptor cache key +* Pins `numpy` to 1.18.5 to prevent ABI breakage when compiling code that uses + both NumPy and TensorFlow headers. + + # Release 2.3.0 ## Major Features and Improvements - * `tf.data` adds two new mechanisms to solve input pipeline bottlenecks and save resources: - * [snapshot](https://www.tensorflow.org/api_docs/python/tf/data/experimental/snapshot) - * [tf.data service](https://www.tensorflow.org/api_docs/python/tf/data/experimental/service). 
- In addition checkout the detailed [guide](https://www.tensorflow.org/guide/data_performance_analysis) for analyzing input pipeline performance with TF Profiler. +* `tf.data` adds two new mechanisms to solve input pipeline bottlenecks and + save resources: - * [`tf.distribute.TPUStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) is now a stable API and no longer considered experimental for TensorFlow. (earlier `tf.distribute.experimental.TPUStrategy`). + * [snapshot](https://www.tensorflow.org/api_docs/python/tf/data/experimental/snapshot) + * [tf.data service](https://www.tensorflow.org/api_docs/python/tf/data/experimental/service). - * [TF Profiler](https://www.tensorflow.org/guide/profiler) introduces two new tools: a memory profiler to visualize your model’s memory usage over time and a [python tracer](https://www.tensorflow.org/guide/profiler#events) which allows you to trace python function calls in your model. Usability improvements include better diagnostic messages and [profile options](https://tensorflow.org/guide/profiler#collect_performance_data) to customize the host and device trace verbosity level. + In addition checkout the detailed + [guide](https://www.tensorflow.org/guide/data_performance_analysis) for + analyzing input pipeline performance with TF Profiler. - * Introduces experimental support for Keras Preprocessing Layers API ([`tf.keras.layers.experimental.preprocessing.*`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing?version=nightly)) to handle data preprocessing operations, with support for composite tensor inputs. Please see below for additional details on these layers. +* [`tf.distribute.TPUStrategy`](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) + is now a stable API and no longer considered experimental for TensorFlow. + (earlier `tf.distribute.experimental.TPUStrategy`). - * TFLite now properly supports dynamic shapes during conversion and inference. We’ve also added opt-in support on Android and iOS for [XNNPACK](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/xnnpack), a highly optimized set of CPU kernels, as well as opt-in support for [executing quantized models on the GPU](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/gpu_advanced.md#running-quantized-models-experimental). +* [TF Profiler](https://www.tensorflow.org/guide/profiler) introduces two new + tools: a memory profiler to visualize your model’s memory usage over time + and a [python tracer](https://www.tensorflow.org/guide/profiler#events) + which allows you to trace python function calls in your model. Usability + improvements include better diagnostic messages and + [profile options](https://tensorflow.org/guide/profiler#collect_performance_data) + to customize the host and device trace verbosity level. - * Libtensorflow packages are available in GCS starting this release. We have also started to [release a nightly version of these packages](https://github.com/tensorflow/tensorflow#official-builds). +* Introduces experimental support for Keras Preprocessing Layers API + ([`tf.keras.layers.experimental.preprocessing.*`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/experimental/preprocessing?version=nightly)) + to handle data preprocessing operations, with support for composite tensor + inputs. Please see below for additional details on these layers. 
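As a small illustration of these preprocessing layers (a minimal sketch, not taken from this patch; the sample data and the `Dense` head are arbitrary):

```python
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

# Learn per-feature mean/variance from sample data, then normalize inputs
# as part of the model itself.
data = np.array([[0.1, 0.2], [0.8, 0.9], [1.5, 1.6]], dtype="float32")
normalizer = preprocessing.Normalization()
normalizer.adapt(data)  # computes the statistics from the data

model = tf.keras.Sequential([
    normalizer,                 # preprocessing runs inside the model
    tf.keras.layers.Dense(1),
])
print(model(data))              # first call builds the model and applies normalization
```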
- * The experimental Python API [`tf.debugging.experimental.enable_dump_debug_info()`](https://www.tensorflow.org/api_docs/python/tf/debugging/experimental/enable_dump_debug_info) now allows you to instrument a TensorFlow program and dump debugging information to a directory on the file system. The directory can be read and visualized by a new interactive dashboard in TensorBoard 2.3 called [Debugger V2](https://www.tensorflow.org/tensorboard/debugger_v2), which reveals the details of the TensorFlow program including graph structures, history of op executions at the Python (eager) and intra-graph levels, the runtime dtype, shape, and numerical composistion of tensors, as well as their code locations. +* TFLite now properly supports dynamic shapes during conversion and inference. + We’ve also added opt-in support on Android and iOS for + [XNNPACK](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/delegates/xnnpack), + a highly optimized set of CPU kernels, as well as opt-in support for + [executing quantized models on the GPU](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/gpu_advanced.md#running-quantized-models-experimental). + +* Libtensorflow packages are available in GCS starting this release. We have + also started to + [release a nightly version of these packages](https://github.com/tensorflow/tensorflow#official-builds). + +* The experimental Python API + [`tf.debugging.experimental.enable_dump_debug_info()`](https://www.tensorflow.org/api_docs/python/tf/debugging/experimental/enable_dump_debug_info) + now allows you to instrument a TensorFlow program and dump debugging + information to a directory on the file system. The directory can be read and + visualized by a new interactive dashboard in TensorBoard 2.3 called + [Debugger V2](https://www.tensorflow.org/tensorboard/debugger_v2), which + reveals the details of the TensorFlow program including graph structures, + history of op executions at the Python (eager) and intra-graph levels, the + runtime dtype, shape, and numerical composition of tensors, as well as their + code locations. ## Breaking Changes -* Increases the **minimum bazel version** required to build TF to **3.1.0**. -* `tf.data` - * Makes the following (breaking) changes to the `tf.data`. - * C++ API: - `IteratorBase::RestoreInternal`, `IteratorBase::SaveInternal`, and `DatasetBase::CheckExternalState` become pure-virtual and subclasses are now expected to provide an implementation. - * The deprecated `DatasetBase::IsStateful` method is removed in favor of `DatasetBase::CheckExternalState`. - * Deprecated overrides of `DatasetBase::MakeIterator` and `MakeIteratorFromInputElement` are removed. - * The signature of `tensorflow::data::IteratorBase::SaveInternal` and `tensorflow::data::IteratorBase::SaveInput` has been extended with `SerializationContext` argument to enable overriding the default policy for the handling external state during iterator checkpointing. This is not a backwards compatible change and all subclasses of `IteratorBase` *need to be updated* accordingly. -* `tf.keras` - * Add a new `BackupAndRestore` callback for handling distributed training failures & restarts. Please take a look at this [tutorial](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras) for details on how to use the callback. -* `tf.image.extract_glimpse` has been updated to correctly process the case - where `centered=False` and `normalized=False`. 
This is a breaking change as - the output is different from (incorrect) previous versions. Note this - breaking change only impacts `tf.image.extract_glimpse` and - `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of - `tf.compat.v1.image.extract_glimpse` does not change. The behavior of - exsiting C++ kernel `ExtractGlimpse` does not change either, so saved - models using `tf.raw_ops.ExtractGlimpse` will not be impacted. + +* Increases the **minimum bazel version** required to build TF to **3.1.0**. +* `tf.data` + * Makes the following (breaking) changes to the `tf.data`. + * C++ API: - `IteratorBase::RestoreInternal`, + `IteratorBase::SaveInternal`, and `DatasetBase::CheckExternalState` + become pure-virtual and subclasses are now expected to provide an + implementation. + * The deprecated `DatasetBase::IsStateful` method is removed in favor of + `DatasetBase::CheckExternalState`. + * Deprecated overrides of `DatasetBase::MakeIterator` and + `MakeIteratorFromInputElement` are removed. + * The signature of `tensorflow::data::IteratorBase::SaveInternal` and + `tensorflow::data::IteratorBase::SaveInput` has been extended with + `SerializationContext` argument to enable overriding the default policy + for the handling external state during iterator checkpointing. This is + not a backwards compatible change and all subclasses of `IteratorBase` + *need to be updated* accordingly. +* `tf.keras` + * Add a new `BackupAndRestore` callback for handling distributed training + failures & restarts. Please take a look at this + [tutorial](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras) + for details on how to use the callback. +* `tf.image.extract_glimpse` has been updated to correctly process the case + where `centered=False` and `normalized=False`. This is a breaking change as + the output is different from (incorrect) previous versions. Note this + breaking change only impacts `tf.image.extract_glimpse` and + `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of + `tf.compat.v1.image.extract_glimpse` does not change. The behavior of + existing C++ kernel `ExtractGlimpse` does not change either, so saved models + using `tf.raw_ops.ExtractGlimpse` will not be impacted. ## Known Caveats * `tf.lite` @@ -791,7 +1246,7 @@ This release contains contributions from many people at Google, as well as: 8bitmp3, Aaron Ma, AbdüLhamit Yilmaz, Abhai Kollara, aflc, Ag Ramesh, Albert Z. 
Guo, Alex Torres, amoitra, Andrii Prymostka, angeliand, Anshuman Tripathy, Anthony Barbier, Anton Kachatkou, Anubh-V, Anuja Jakhade, Artem Ryabov, autoih, Bairen Yi, Bas Aarts, Basit Ayantunde, Ben Barsdell, Bhavani Subramanian, Brett Koonce, candy.dc, Captain-Pool, caster, cathy, Chong Yan, Choong Yin Thong, Clayne Robison, Colle, Dan Ganea, David Norman, David Refaeli, dengziming, Diego Caballero, Divyanshu, djshen, Douman, Duncan Riach, EFanZh, Elena Zhelezina, Eric Schweitz, Evgenii Zheltonozhskii, Fei Hu, fo40225, Fred Reiss, Frederic Bastien, Fredrik Knutsson, fsx950223, fwcore, George Grzegorz Pawelczak, George Sterpu, Gian Marco Iodice, Giorgio Arena, giuros01, Gomathi Ramamurthy, Guozhong Zhuang, Haifeng Jin, Haoyu Wu, HarikrishnanBalagopal, HJYOO, Huang Chen-Yi, Ilham Firdausi Putra, Imran Salam, Jared Nielsen, Jason Zaman, Jasper Vicenti, Jeff Daily, Jeff Poznanovic, Jens Elofsson, Jerry Shih, jerryyin, Jesper Dramsch, jim.meyer, Jongwon Lee, Jun Wan, Junyuan Xie, Kaixi Hou, kamalkraj, Kan Chen, Karthik Muthuraman, Keiji Ariyama, Kevin Rose, Kevin Wang, Koan-Sin Tan, kstuedem, Kwabena W. Agyeman, Lakshay Tokas, latyas, Leslie-Fang-Intel, Li, Guizi, Luciano Resende, Lukas Folle, Lukas Geiger, Mahmoud Abuzaina, Manuel Freiberger, Mark Ryan, Martin Mlostek, Masaki Kozuki, Matthew Bentham, Matthew Denton, mbhuiyan, mdfaijul, Muhwan Kim, Nagy Mostafa, nammbash, Nathan Luehr, Nathan Wells, Niranjan Hasabnis, Oleksii Volkovskyi, Olivier Moindrot, olramde, Ouyang Jin, OverLordGoldDragon, Pallavi G, Paul Andrey, Paul Wais, pkanwar23, Pooya Davoodi, Prabindh Sundareson, Rajeshwar Reddy T, Ralovich, Kristof, Refraction-Ray, Richard Barnes, richardbrks, Robert Herbig, Romeo Kienzler, Ryan Mccormick, saishruthi, Saket Khandelwal, Sami Kama, Sana Damani, Satoshi Tanaka, Sergey Mironov, Sergii Khomenko, Shahid, Shawn Presser, ShengYang1, Siddhartha Bagaria, Simon Plovyt, skeydan, srinivasan.narayanamoorthy, Stephen Mugisha, sunway513, Takeshi Watanabe, Taylor Jakobson, TengLu, TheMindVirus, ThisIsIsaac, Tim Gates, Timothy Liu, Tomer Gafner, Trent Lo, Trevor Hickey, Trevor Morris, vcarpani, Wei Wang, Wen-Heng (Jack) Chung, wenshuai, Wenshuai-Xiaomi, wenxizhu, william, William D. Irons, Xinan Jiang, Yannic, Yasir Modak, Yasuhiro Matsumoto, Yong Tang, Yongfeng Gu, Youwei Song, Zaccharie Ramzi, Zhang, Zhenyu Guo, 王振华 (Zhenhua Wang), 韩董, 이중건 Isaac Lee # Release 1.15.0 -This is the last 1.x release for TensorFlow. We do not expect to update the 1.x branch with features, although we will issue patch releases to fix vulnerabilities for at least one year. +This is the last 1.x release for TensorFlow. We do not expect to update the 1.x branch with features, although we will issue patch releases to fix vulnerabilities for at least one year. ## Major Features and Improvements * As [announced](https://groups.google.com/a/tensorflow.org/forum/#!topic/developers/iRCt5m4qUz0), `tensorflow` pip package will by default include GPU support (same as `tensorflow-gpu` now) for the platforms we currently have GPU support (Linux and Windows). It will work on machines with and without Nvidia GPUs. `tensorflow-gpu` will still be available, and CPU-only packages can be downloaded at `tensorflow-cpu` for users who are concerned about package size. @@ -801,7 +1256,7 @@ This enables writing forward compatible code: by explicitly importing either `te * Add toggles `tf.enable_control_flow_v2()` and `tf.disable_control_flow_v2()` for enabling/disabling v2 control flow. 
* Enable v2 control flow as part of `tf.enable_v2_behavior()` and `TF2_BEHAVIOR=1`. * AutoGraph translates Python control flow into TensorFlow expressions, allowing users to write regular Python inside `tf.function`-decorated functions. AutoGraph is also applied in functions used with `tf.data`, `tf.distribute` and `tf.keras` APIS. -* Adds `enable_tensor_equality()`, which switches the behavior such that: +* Adds `enable_tensor_equality()`, which switches the behavior such that: * Tensors are no longer hashable. * Tensors can be compared with `==` and `!=`, yielding a Boolean Tensor with element-wise comparison results. This will be the default behavior in 2.0. @@ -957,12 +1412,12 @@ For information on upgrading your existing TensorFlow 1.x models, please refer t * TensorFlow 2.0.0 is built using devtoolset7 (GCC7) on Ubuntu 16. This may lead to ABI incompatibilities with extensions built against earlier versions of TensorFlow. * Tensorflow code now produces 2 different pip packages: tensorflow_core containing all the code (in the future it will contain only the private implementation) and tensorflow which is a virtual pip package doing forwarding to tensorflow_core (and in the future will contain only the public API of tensorflow). We don't expect this to be breaking, unless you were importing directly from the implementation. Removed the `freeze_graph` command line tool; `SavedModel` should be used in place of frozen graphs. - + * `tf.contrib`: * `tf.contrib` has been deprecated, and functionality has been either migrated to the core TensorFlow API, to an ecosystem project such as [tensorflow/addons](https://www.github.com/tensorflow/addons) or [tensorflow/io](https://www.github.com/tensorflow/io), or removed entirely. * Remove `tf.contrib.timeseries` dependency on TF distributions. * Replace contrib references with `tf.estimator.experimental.*` for apis in `early_stopping.py`. - + * `tf.estimator`: * Premade estimators in the tf.estimator.DNN/Linear/DNNLinearCombined family have been updated to use `tf.keras.optimizers` instead of the `tf.compat.v1.train.Optimizer`s. If you do not pass in an `optimizer=` arg or if you use a string, the premade estimator will use the Keras optimizer. This is checkpoint breaking, as the optimizers have separate variables. A checkpoint converter tool for converting optimizers is included with the release, but if you want to avoid any change, switch to the v1 version of the estimator: `tf.compat.v1.estimator.DNN/Linear/DNNLinearCombined*`. * Default aggregation for canned Estimators is now `SUM_OVER_BATCH_SIZE`. To maintain previous default behavior, please pass `SUM` as the loss aggregation method. @@ -970,13 +1425,13 @@ For information on upgrading your existing TensorFlow 1.x models, please refer t * `Estimator.export_savedmodel` has been renamed to `export_saved_model`. * When saving to SavedModel, Estimators will strip default op attributes. This is almost always the correct behavior, as it is more forwards compatible, but if you require that default attributes to be saved with the model, please use `tf.compat.v1.Estimator`. * Feature Columns have been upgraded to be more Eager-friendly and to work with Keras. As a result, `tf.feature_column.input_layer` has been deprecated in favor of `tf.keras.layers.DenseFeatures`. v1 feature columns have direct analogues in v2 except for `shared_embedding_columns`, which are not cross-compatible with v1 and v2. Use `tf.feature_column.shared_embeddings` instead. 
- + * `tf.keras`: * `OMP_NUM_THREADS` is no longer used by the default Keras config. To configure the number of threads, use `tf.config.threading` APIs. * `tf.keras.model.save_model` and `model.save` now defaults to saving a TensorFlow SavedModel. HDF5 files are still supported. * Deprecated `tf.keras.experimental.export_saved_model` and `tf.keras.experimental.function`. Please use `tf.keras.models.save_model(..., save_format='tf')` and `tf.keras.models.load_model` instead. * Layers now default to float32, and automatically cast their inputs to the layer's dtype. If you had a model that used float64, it will probably silently use float32 in TensorFlow 2, and a warning will be issued that starts with `Layer ` is casting an input tensor from dtype float64 to the layer's dtype of float32. To fix, either set the default dtype to float64 with `tf.keras.backend.set_floatx('float64')`, or pass `dtype='float64'` to each of the Layer constructors. See `tf.keras.layers.Layer` for more information. - + * `tf.lite`: * Removed `lite.OpHint`, `lite.experimental`, and `lite.constant` from 2.0 API. * Tensors are no longer hashable, but instead compare element-wise with `==` and `!=`. Use `tf.compat.v1.disable_tensor_equality()` to return to the previous behavior. @@ -1211,8 +1666,8 @@ If you experience any snags when using TF 2.0, please let us know at the [TF 2.0 conversion. TensorRT initialization arguments are now passed wrapped in a named-tuple, `TrtConversionParams`, rather than as separate arguments as in `TrtGraphConverter`. - * Changed API to optimize TensorRT enginges during graph optimization. - This is now done by calling `converter.build()` where previously + * Changed API to optimize TensorRT engines during graph optimization. This + is now done by calling `converter.build()` where previously `is_dynamic_op=False` would be set. * `converter.convert()` no longer returns a `tf.function`. Now the function must be accessed from the saved model. @@ -2222,7 +2677,7 @@ Ag Ramesh, Alex Wiltschko, Alexander Pantyukhin, Amogh Mannekote, An Jiaoyang, A * [`tf.contrib.estimator.RNNEstimator`](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/estimator/RNNClassifier) * The [distributions.Bijector](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/contrib/distributions/bijectors/Bijector) API supports broadcasting for Bijectors with new API changes. - + ## Breaking Changes * If you're opening empty variable scopes; replace `variable_scope('', ...)` by `variable_scope(tf.get_variable_scope(), ...)`. @@ -2701,7 +3156,7 @@ Samuel He, Sandeep Dcunha, sandipmgiri, Sang Han, scott, Scott Mudge, Se-Won Kim Simone Cirillo, Steffen Schmitz, Suvojit Manna, Sylvus, Taehoon Lee, Ted Chang, Thomas Deegan, Till Hoffmann, Tim, Toni Kunic, Toon Verstraelen, Tristan Rice, Urs KöSter, Utkarsh Upadhyay, Vish (Ishaya) Abrams, Winnie Tsang, Yan Chen, Yan Facai (颜发才), Yi Yang, Yong Tang, -Youssef Hesham, Yuan (Terry) Tang, Zhengsheng Wei, zxcqwe4906, 张志豪, 田传武 +Youssef Hesham, Yuan (Terry) Tang, Zhengsheng Wei, zxcqwe4906, 张志豪, 田传武 We are also grateful to all who filed issues or helped resolve them, asked and answered questions, and were part of inspiring discussions. 
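As a quick illustration of the Debugger V2 workflow mentioned in the 2.3.0 notes above (a minimal sketch; the dump directory and the tiny traced function are arbitrary examples, not part of this change):

```python
import tensorflow as tf

# Instrument the program and dump debug info for TensorBoard's Debugger V2.
tf.debugging.experimental.enable_dump_debug_info(
    "/tmp/tfdbg2_logdir",             # arbitrary example path
    tensor_debug_mode="FULL_HEALTH",
    circular_buffer_size=-1)          # -1 keeps all debug events

@tf.function
def f(x):
  return tf.math.log(x) / (x - 1.0)   # yields inf/nan near x == 0 and x == 1

print(f(tf.constant([0.0, 1.0, 2.0])))
# Inspect the dump afterwards with:
#   tensorboard --logdir /tmp/tfdbg2_logdir
# and open the "Debugger V2" dashboard.
```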
diff --git a/configure.py b/configure.py index 9524eada3cd..e381c8c20db 100644 --- a/configure.py +++ b/configure.py @@ -38,9 +38,6 @@ _DEFAULT_CUDNN_VERSION = '7' _DEFAULT_TENSORRT_VERSION = '6' _DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,7.0' -_TF_OPENCL_VERSION = '1.2' -_DEFAULT_COMPUTECPP_TOOLKIT_PATH = '/usr/local/computecpp' -_DEFAULT_TRISYCL_INCLUDE_DIR = '/usr/local/triSYCL/include' _SUPPORTED_ANDROID_NDK_VERSIONS = [10, 11, 12, 13, 14, 15, 16, 17, 18] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -1114,62 +1111,6 @@ def set_host_c_compiler(environ_cp): write_action_env_to_bazelrc('HOST_C_COMPILER', host_c_compiler) -def set_computecpp_toolkit_path(environ_cp): - """Set COMPUTECPP_TOOLKIT_PATH.""" - - def toolkit_exists(toolkit_path): - """Check if a computecpp toolkit path is valid.""" - if is_linux(): - sycl_rt_lib_path = 'lib/libComputeCpp.so' - else: - sycl_rt_lib_path = '' - - sycl_rt_lib_path_full = os.path.join(toolkit_path, sycl_rt_lib_path) - exists = os.path.exists(sycl_rt_lib_path_full) - if not exists: - print('Invalid SYCL %s library path. %s cannot be found' % - (_TF_OPENCL_VERSION, sycl_rt_lib_path_full)) - return exists - - computecpp_toolkit_path = prompt_loop_or_load_from_env( - environ_cp, - var_name='COMPUTECPP_TOOLKIT_PATH', - var_default=_DEFAULT_COMPUTECPP_TOOLKIT_PATH, - ask_for_var=( - 'Please specify the location where ComputeCpp for SYCL %s is ' - 'installed.' % _TF_OPENCL_VERSION), - check_success=toolkit_exists, - error_msg='Invalid SYCL compiler path. %s cannot be found.', - suppress_default_error=True) - - write_action_env_to_bazelrc('COMPUTECPP_TOOLKIT_PATH', - computecpp_toolkit_path) - - -def set_trisycl_include_dir(environ_cp): - """Set TRISYCL_INCLUDE_DIR.""" - - ask_trisycl_include_dir = ('Please specify the location of the triSYCL ' - 'include directory. 
(Use --config=sycl_trisycl ' - 'when building with Bazel) ' - '[Default is %s]: ') % ( - _DEFAULT_TRISYCL_INCLUDE_DIR) - - while True: - trisycl_include_dir = get_from_env_or_user_or_default( - environ_cp, 'TRISYCL_INCLUDE_DIR', ask_trisycl_include_dir, - _DEFAULT_TRISYCL_INCLUDE_DIR) - if os.path.exists(trisycl_include_dir): - break - - print('Invalid triSYCL include directory, %s cannot be found' % - (trisycl_include_dir)) - - # Set TRISYCL_INCLUDE_DIR - environ_cp['TRISYCL_INCLUDE_DIR'] = trisycl_include_dir - write_action_env_to_bazelrc('TRISYCL_INCLUDE_DIR', trisycl_include_dir) - - def system_specific_test_config(environ_cp): """Add default build and test flags required for TF tests to bazelrc.""" write_to_bazelrc('test --flaky_test_attempts=3') @@ -1397,8 +1338,6 @@ def main(): setup_python(environ_cp) if is_windows(): - environ_cp['TF_NEED_OPENCL_SYCL'] = '0' - environ_cp['TF_NEED_COMPUTECPP'] = '0' environ_cp['TF_NEED_OPENCL'] = '0' environ_cp['TF_CUDA_CLANG'] = '0' environ_cp['TF_NEED_TENSORRT'] = '0' @@ -1415,21 +1354,6 @@ def main(): if environ_cp.get('TF_ENABLE_XLA', '1') == '1': write_to_bazelrc('build --config=xla') - set_action_env_var( - environ_cp, - 'TF_NEED_OPENCL_SYCL', - 'OpenCL SYCL', - False, - bazel_config_name='sycl') - if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1': - set_host_cxx_compiler(environ_cp) - set_host_c_compiler(environ_cp) - set_action_env_var(environ_cp, 'TF_NEED_COMPUTECPP', 'ComputeCPP', True) - if environ_cp.get('TF_NEED_COMPUTECPP') == '1': - set_computecpp_toolkit_path(environ_cp) - else: - set_trisycl_include_dir(environ_cp) - set_action_env_var( environ_cp, 'TF_NEED_ROCM', 'ROCm', False, bazel_config_name='rocm') if (environ_cp.get('TF_NEED_ROCM') == '1' and @@ -1442,6 +1366,11 @@ def main(): write_action_env_to_bazelrc('ROCM_PATH', environ_cp.get('ROCM_PATH')) write_action_env_to_bazelrc('ROCM_ROOT', environ_cp.get('ROCM_PATH')) + if ((environ_cp.get('TF_NEED_ROCM') == '1') and + (environ_cp.get('TF_ENABLE_MLIR_GENERATED_GPU_KERNELS') == '1')): + write_to_bazelrc( + 'build:rocm --define tensorflow_enable_mlir_generated_gpu_kernels=1') + environ_cp['TF_NEED_CUDA'] = str( int(get_var(environ_cp, 'TF_NEED_CUDA', 'CUDA', False))) if (environ_cp.get('TF_NEED_CUDA') == '1' and @@ -1523,17 +1452,15 @@ def main(): # use it for the CPU build. set_tf_download_clang(environ_cp) - # SYCL / ROCm / CUDA are mutually exclusive. + # ROCm / CUDA are mutually exclusive. # At most 1 GPU platform can be configured. gpu_platform_count = 0 - if environ_cp.get('TF_NEED_OPENCL_SYCL') == '1': - gpu_platform_count += 1 if environ_cp.get('TF_NEED_ROCM') == '1': gpu_platform_count += 1 if environ_cp.get('TF_NEED_CUDA') == '1': gpu_platform_count += 1 if gpu_platform_count >= 2: - raise UserInputError('SYCL / CUDA / ROCm are mututally exclusive. ' + raise UserInputError('CUDA / ROCm are mututally exclusive. ' 'At most 1 GPU platform can be configured.') set_cc_opt_flags(environ_cp) @@ -1558,6 +1485,7 @@ def main(): 'adding "--config=<>" to your build command. 
See .bazelrc for more ' 'details.') config_info_line('mkl', 'Build with MKL support.') + config_info_line('mkl_aarch64', 'Build with oneDNN support for Aarch64.') config_info_line('monolithic', 'Config for mostly static monolithic build.') config_info_line('ngraph', 'Build with Intel nGraph support.') config_info_line('numa', 'Build with NUMA support.') diff --git a/tensorflow/BUILD b/tensorflow/BUILD index d1c1d7dcdef..8946b45cacb 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -497,13 +497,20 @@ config_setting( visibility = ["//visibility:public"], ) -# This flag enables experimental MLIR bridge support. +# This flag forcibly enables experimental MLIR bridge support. config_setting( name = "enable_mlir_bridge", values = {"define": "enable_mlir_bridge=true"}, visibility = ["//visibility:public"], ) +# This flag forcibly disables experimental MLIR bridge support. +config_setting( + name = "disable_mlir_bridge", + values = {"define": "enable_mlir_bridge=false"}, + visibility = ["//visibility:public"], +) + # This flag enables experimental TPU support config_setting( name = "with_tpu_support", @@ -562,33 +569,17 @@ selects.config_setting_group( package_group( name = "internal", packages = [ - "//learning/brain/swift/x10/...", - "//perftools/accelerators/xprof/api/...", + "//learning/lib/ami/simple_ml/...", "//tensorflow/...", - "//tensorflow_estimator/python/estimator/...", - "//tensorflow_models/official/...", - "//third_party/py/autograph/...", - "//third_party/swift/tensorflow/x10/...", - "//third_party/swift/tensorflow_apis/...", ], ) -package_group( - name = "ndarray_tensor_allow_list", - packages = ["//learning/pathways/..."], -) - -# Packages that use composite tensors or dispatch. -# TODO(b/154762408) Remove this package group once it's no longer needed. -# If this is modified, then copy.bara.sky must also be modified. -package_group(name = "composite_tensor_whitelist") +package_group(name = "ndarray_tensor_allow_list") # Packages that use private types symbols, until they are exported. # TODO(b/154650521) Remove. -package_group( - name = "types_whitelist", - packages = ["//learning/deepmind/tensorflow/replicator/..."], -) +# If this is modified, then copy.bara.sky must also be modified. +package_group(name = "types_whitelist") # Packages that use StructuredTensors. # TODO(b/159007891) Remove this package once StructuredTensor is exported. @@ -714,8 +705,12 @@ tf_cc_shared_object( soversion = VERSION, visibility = ["//visibility:public"], deps = [ + "//tensorflow/c/experimental/filesystem:filesystem_interface", + "//tensorflow/c/experimental/stream_executor:stream_executor_hdrs", + "//tensorflow/c:kernels_hdrs", + "//tensorflow/c:ops_hdrs", "//tensorflow/cc/saved_model:loader_lite_impl", - "//tensorflow/core:core_cpu_impl", + "//tensorflow/core/common_runtime:core_cpu_impl", "//tensorflow/core:framework_internal_impl", "//tensorflow/core/common_runtime/gpu:gpu_runtime_impl", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry_impl", diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py index 5932dda514d..99a278a14a4 100644 --- a/tensorflow/api_template.__init__.py +++ b/tensorflow/api_template.__init__.py @@ -138,12 +138,12 @@ if _running_from_pip_package(): for _s in _site_packages_dirs: # Load first party dynamic kernels. _main_dir = _os.path.join(_s, 'tensorflow/core/kernels') - if _fi.file_exists(_main_dir): + if _os.path.exists(_main_dir): _ll.load_library(_main_dir) # Load third party dynamic kernels. 
_plugin_dir = _os.path.join(_s, 'tensorflow-plugins') - if _fi.file_exists(_plugin_dir): + if _os.path.exists(_plugin_dir): _ll.load_library(_plugin_dir) # Add module aliases diff --git a/tensorflow/api_template_v1.__init__.py b/tensorflow/api_template_v1.__init__.py index 0d1d2e56fae..ae82f7b4792 100644 --- a/tensorflow/api_template_v1.__init__.py +++ b/tensorflow/api_template_v1.__init__.py @@ -148,12 +148,12 @@ if _running_from_pip_package(): for _s in _site_packages_dirs: # Load first party dynamic kernels. _main_dir = _os.path.join(_s, 'tensorflow/core/kernels') - if _fi.file_exists(_main_dir): + if _os.path.exists(_main_dir): _ll.load_library(_main_dir) # Load third party dynamic kernels. _plugin_dir = _os.path.join(_s, 'tensorflow-plugins') - if _fi.file_exists(_plugin_dir): + if _os.path.exists(_plugin_dir): _ll.load_library(_plugin_dir) # Delete modules that should be hidden from dir(). diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 9d8032aca52..1628bf05fd6 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -1,6 +1,7 @@ # Description: # C API for TensorFlow, for use by client language bindings. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", @@ -9,6 +10,11 @@ load( "tf_custom_op_library", "tf_kernel_library", ) + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") package( @@ -211,6 +217,8 @@ tf_cuda_library( "//tensorflow/core:lib_internal", "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/kernels:logging_ops", + "//tensorflow/compiler/mlir/tfr:node_expansion_pass", + "//tensorflow/compiler/mlir/tfr:graph_decompose_pass", ], }), alwayslink = 1, @@ -248,6 +256,30 @@ tf_cuda_library( }), ) +cc_library( + name = "tf_shape", + srcs = ["tf_shape.cc"], + hdrs = ["tf_shape.h"], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + ":c_api_macros", + ":tf_shape_internal", + "//tensorflow/core:framework", + ], +) + +cc_library( + name = "tf_shape_internal", + hdrs = ["tf_shape_internal.h"], + copts = tf_copts(), + visibility = ["//tensorflow:internal"], + deps = [ + ":conversion_macros", + "//tensorflow/core:framework", + ], +) + cc_library( name = "tf_status", srcs = ["tf_status.cc"], @@ -377,6 +409,7 @@ tf_cuda_library( "//tensorflow/c/eager:tfe_op_internal", "//tensorflow/c/eager:tfe_tensorhandle_internal", "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/jit:get_compiler_ir", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", @@ -387,6 +420,7 @@ tf_cuda_library( "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "//tensorflow/core/platform", + "//tensorflow/core/platform:blocking_counter", "@com_google_absl//absl/strings", ], alwayslink = 1, @@ -477,6 +511,18 @@ tf_cuda_library( ], ) +cc_library( + name = "kernels_hdrs", + hdrs = ["kernels.h"], + visibility = ["//tensorflow:internal"], + deps = [ + ":c_api_internal", + ":tf_datatype", + ":tf_status", + ":tf_tensor", + ], +) + tf_cuda_library( name = "kernels", srcs = [ @@ -530,6 +576,16 @@ tf_cuda_library( alwayslink = 1, ) +cc_library( + name = "ops_hdrs", + hdrs = ["ops.h"], + visibility = ["//tensorflow:internal"], + deps = [ + ":tf_datatype", + ":tf_status", + ], +) + # ----------------------------------------------------------------------------- # Tests diff 
--git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 2e1759ecea0..a03e9227a75 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2488,6 +2488,48 @@ TF_Buffer* TF_GetRegisteredKernelsForOp(const char* name, TF_Status* status) { return ret; } +void TF_UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst, + TF_Status* status) { + using tensorflow::RecordMutation; + mutex_lock l(graph->mu); + tensorflow::shape_inference::InferenceContext* ic = + graph->refiner.GetContext(&new_src.oper->node); + + if (ic->num_outputs() <= new_src.index) { + status->status = tensorflow::errors::OutOfRange( + "Cannot update edge. Output index [", new_src.index, + "] is greater than the number of total outputs [", ic->num_outputs(), + "]."); + return; + } + tensorflow::shape_inference::ShapeHandle shape = ic->output(new_src.index); + + tensorflow::shape_inference::InferenceContext* ic_dst = + graph->refiner.GetContext(&dst.oper->node); + if (ic_dst->num_inputs() <= dst.index) { + status->status = tensorflow::errors::OutOfRange( + "Cannot update edge. Input index [", dst.index, + "] is greater than the number of total inputs [", ic_dst->num_inputs(), + "]."); + return; + } + if (!ic_dst->MergeInput(dst.index, shape)) { + status->status = tensorflow::errors::InvalidArgument( + "Cannot update edge, incompatible shapes: ", ic_dst->DebugString(shape), + " and ", ic_dst->DebugString(ic_dst->input(dst.index)), "."); + return; + } + status->status = graph->graph.UpdateEdge(&new_src.oper->node, new_src.index, + &dst.oper->node, dst.index); + + if (TF_GetCode(status) == TF_OK) { + // This modification only updates the destination node for + // the purposes of running this graph in a session. Thus, we don't + // record the source node as being modified. + RecordMutation(graph, *dst.oper, "updating input tensor"); + } +} + // TF_Server functions ---------------------------------------------- #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index 0b4d9993e4d..db5f8fd68f8 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -1524,6 +1524,10 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllRegisteredKernels(TF_Status* status); TF_CAPI_EXPORT extern TF_Buffer* TF_GetRegisteredKernelsForOp( const char* name, TF_Status* status); +// Update edge, switch input/ output in a node +TF_CAPI_EXPORT extern void TF_UpdateEdge(TF_Graph* graph, TF_Output new_src, + TF_Input dst, TF_Status* status); + // -------------------------------------------------------------------------- // In-process TensorFlow server functionality, for use in distributed training. // A Server instance encapsulates a set of devices and a Session target that diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index b4297033b6d..81fb9d1a2b8 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/blocking_counter.h" #include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/net.h" @@ -560,6 +561,21 @@ TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, collective_executor_handle->get()->StartAbort(status->status); } +TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(TFE_Context* ctx, + const char* task, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + auto collective_executor_handle = context->GetCollectiveExecutorHandle(); + tensorflow::Notification done; + collective_executor_handle->get()->remote_access()->CheckPeerHealth( + task, [&done, status](const Status& s) { + status->status = s; + done.Notify(); + }); + done.WaitForNotification(); +} + TF_ShapeAndTypeList* TF_NewShapeAndTypeList(int num_items) { TF_ShapeAndTypeList* result = new TF_ShapeAndTypeList; result->num_items = num_items; diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index ebd14b4b571..c9c74f4e874 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -231,13 +231,20 @@ TF_CAPI_EXPORT extern void TFE_EnableCollectiveOps(TFE_Context* ctx, TF_Status* status); // Aborts all ongoing collectives with the specified status. After abortion, -// subsequent collectives will error with this status immediately. +// subsequent collectives will error with this status immediately. To reset the +// collectives, create a new EagerContext. // -// This is intended to be used when a peer failure is detected. There's yet no -// way to reset the collectives other than restarting the program. +// This is intended to be used when a peer failure is detected. TF_CAPI_EXPORT extern void TFE_AbortCollectiveOps(TFE_Context* ctx, TF_Status* status); +// Checks the health of collective ops peers. Explicit health check is needed in +// multi worker collective ops to detect failures in the cluster. If a peer is +// down, collective ops may hang. +TF_CAPI_EXPORT extern void TFE_CollectiveOpsCheckPeerHealth(TFE_Context* ctx, + const char* task, + TF_Status* status); + // Information about the shape of a Tensor and its type. struct TF_ShapeAndType { // Number of dimensions. -1 indicates unknown rank. diff --git a/tensorflow/c/c_api_function_test.cc b/tensorflow/c/c_api_function_test.cc index 3fff9bcd371..ec8cfe4a31a 100644 --- a/tensorflow/c/c_api_function_test.cc +++ b/tensorflow/c/c_api_function_test.cc @@ -1704,66 +1704,5 @@ TEST_F(CApiFunctionTest, GetFunctionsFromGraph) { TF_DeleteFunction(func1); } -// This test only works when the TF build includes XLA compiler. One way to set -// this up is via bazel build option "--define with_xla_support=true". -// -// FIXME: generalize the macro name TENSORFLOW_EAGER_USE_XLA to -// something like TENSORFLOW_CAPI_USE_XLA. 
-#ifdef TENSORFLOW_EAGER_USE_XLA -TEST_F(CApiFunctionTest, StatelessIf_XLA) { - TF_Function* func; - const std::string funcName = "BranchFunc"; - DefineFunction(funcName.c_str(), &func); - TF_GraphCopyFunction(host_graph_, func, nullptr, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_Operation* feed = Placeholder(host_graph_, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_Operation* true_cond = ScalarConst(true, host_graph_, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_OperationDescription* desc = - TF_NewOperation(host_graph_, "StatelessIf", "IfNode"); - TF_AddInput(desc, {true_cond, 0}); - TF_Output inputs[] = {{feed, 0}}; - TF_AddInputList(desc, inputs, TF_ARRAYSIZE(inputs)); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - TF_SetAttrType(desc, "Tcond", TF_BOOL); - TF_DataType inputType = TF_INT32; - TF_SetAttrTypeList(desc, "Tin", &inputType, 1); - TF_SetAttrTypeList(desc, "Tout", &inputType, 1); - TF_SetAttrFuncName(desc, "then_branch", funcName.data(), funcName.size()); - TF_SetAttrFuncName(desc, "else_branch", funcName.data(), funcName.size()); - TF_SetDevice(desc, "/device:XLA_CPU:0"); - auto op = TF_FinishOperation(desc, s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - ASSERT_NE(op, nullptr); - - // Create a session for this graph. - CSession csession(host_graph_, s_, /*use_XLA*/ true); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - // Run the graph. - csession.SetInputs({{feed, Int32Tensor(17)}}); - csession.SetOutputs({op}); - csession.Run(s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - TF_Tensor* out = csession.output_tensor(0); - ASSERT_TRUE(out != nullptr); - EXPECT_EQ(TF_INT32, TF_TensorType(out)); - EXPECT_EQ(0, TF_NumDims(out)); // scalar - ASSERT_EQ(sizeof(int32), TF_TensorByteSize(out)); - int32* output_contents = static_cast(TF_TensorData(out)); - EXPECT_EQ(-17, *output_contents); - - // Clean up - csession.CloseAndDelete(s_); - ASSERT_EQ(TF_OK, TF_GetCode(s_)) << TF_Message(s_); - - TF_DeleteFunction(func); -} -#endif // TENSORFLOW_EAGER_USE_XLA - } // namespace } // namespace tensorflow diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index bbbbb8f7d56..fc1fdccee16 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -634,6 +634,40 @@ TEST(CAPI, Graph) { TF_DeleteStatus(s); } +TEST(CAPI, UpdateEdge) { + TF_Status* s = TF_NewStatus(); + TF_Graph* graph = TF_NewGraph(); + + // Make two scalar constants. + TF_Operation* one = ScalarConst(1, graph, s, "one"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + TF_Operation* two = ScalarConst(2, graph, s, "two"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Add oper. + TF_Operation* add = Add(one, two, graph, s, "add"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Add another oper to the graph. 
+ TF_Operation* neg = Neg(add, graph, s, "neg"); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + NodeDef node_def_neg; + ASSERT_TRUE(GetNodeDef(neg, &node_def_neg)); + EXPECT_EQ(string("add"), node_def_neg.input(0)); + + // update edge of neg + TF_UpdateEdge(graph, TF_Output{one, 0}, TF_Input{neg, 0}, s); + + ASSERT_TRUE(GetNodeDef(neg, &node_def_neg)); + EXPECT_EQ(string("one:0"), node_def_neg.input(0)); + + // Clean up + TF_DeleteGraph(graph); + TF_DeleteStatus(s); +} + /* TODO(skyewm): this test currently DCHECKs, change to bad status diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index cc02d83fe01..08b3c73ed02 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -1,13 +1,23 @@ # Experimental extensions to the C API for eager execution of kernels. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", + "if_libtpu", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_cuda_library", - "tfe_xla_copts", ) + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "cc_header_only_library") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "internal_tfrt_deps") load( "//tensorflow/core/platform:build_config.bzl", "tf_kernel_tests_linkstatic", @@ -31,7 +41,7 @@ tf_cuda_library( "c_api_unified_experimental.h", ], hdrs = ["c_api.h"], - copts = tf_copts() + tfe_xla_copts(), + copts = tf_copts(), visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ @@ -72,13 +82,6 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler/lib:traceme", ], - }) + select({ - "//tensorflow:with_xla_support": [ - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/jit", - "//tensorflow/compiler/jit:xla_device", - ], - "//conditions:default": [], }) + [ "@com_google_absl//absl/memory", "//tensorflow/core/common_runtime/eager:eager_operation", @@ -95,7 +98,7 @@ tf_cuda_library( "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/distributed_runtime:worker_env", "//tensorflow/core:gpu_runtime", - ], + ] + internal_tfrt_deps(), alwayslink = 1, ) @@ -109,11 +112,16 @@ filegroup( "c_api_experimental.h", "c_api_internal.h", "c_api_unified_experimental.h", + "c_api_unified_experimental_internal.h", "dlpack.h", + "gradients.h", + "gradients_internal.h", "immediate_execution_context.h", "immediate_execution_operation.h", "immediate_execution_tensor_handle.h", + "tape.h", "tfe_cancellation_manager_internal.h", + "tfe_context_internal.h", "tfe_executor_internal.h", "tfe_monitoring_internal.h", "tfe_op_attrs_internal.h", @@ -172,27 +180,20 @@ cc_library( ) cc_library( - name = "gradients", - srcs = [ - "gradients.cc", - "gradients_internal.h", - ], + name = "tracing_utils", + srcs = ["tracing_utils.cc"], hdrs = [ - "gradients.h", + "tracing_utils.h", ], visibility = [ "//tensorflow:internal", ], deps = [ - ":abstract_context", ":abstract_operation", - ":abstract_tensor_handle", ":c_api_unified_internal", - ":tape", - "//tensorflow/core/common_runtime/eager:attr_builder", + "//tensorflow/c/experimental/gradients/tape:tape_operation", "//tensorflow/core/lib/llvm_rtti", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", + "//tensorflow/core/platform:errors", ], ) @@ -228,10 +229,10 @@ tf_cuda_cc_test( "gradients_test.cc", ], args = ["--heap_check=local"], - extra_copts = 
tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ + ":abstract_context", ":abstract_tensor_handle", ":c_api_experimental", ":c_api_test_util", @@ -242,7 +243,8 @@ tf_cuda_cc_test( "//tensorflow/c:tf_status_helper", "//tensorflow/c/experimental/gradients:array_grad", "//tensorflow/c/experimental/gradients:math_grad", - "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/gradients/tape:tape_context", + "//tensorflow/c/experimental/ops", "//tensorflow/cc/profiler", "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/core:lib", @@ -256,6 +258,46 @@ tf_cuda_cc_test( ], ) +cc_library( + name = "gradients_util", + srcs = [ + "gradients_util.cc", + ], + hdrs = [ + "gradients_util.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_context", + ":abstract_operation", + ":abstract_tensor_handle", + ":c_api", + ":c_api_experimental", + ":c_api_unified_internal", + ":gradients_internal", + ":tape", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/lib/llvm_rtti", + ] + if_libtpu( + if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], + if_true = [], + ), +) + cc_library( name = "mnist_gradients_testutil", srcs = [ @@ -272,17 +314,93 @@ cc_library( ":c_api_experimental", ":c_api_unified_internal", ":gradients_internal", - "//tensorflow/c:tf_status_helper", - "//tensorflow/c:tf_tensor", + ":gradients_util", + ":tape", + "//tensorflow/c/experimental/gradients/tape:tape_context", "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/c/experimental/ops:math_ops", "//tensorflow/c/experimental/ops:nn_ops", "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:status", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/types:span", ], ) +cc_library( + name = "gradient_checker", + srcs = [ + "gradient_checker.cc", + ], + hdrs = [ + "gradient_checker.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":abstract_tensor_handle", + ":c_api_experimental", + ":c_api_unified_internal", + ":gradients_internal", + ":gradients_util", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/gradients:nn_grad", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/lib/llvm_rtti", + ] + if_libtpu( + if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], + if_true = [], + ), +) + +tf_cuda_cc_test( + name = "gradient_checker_test", + size = "small", + srcs = [ + "gradient_checker_test.cc", + ], + args = ["--heap_check=local"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_cuda_tests_tags() + ["nomac"], + deps = [ + ":abstract_tensor_handle", 
+ ":c_api_experimental", + ":c_api_test_util", + ":c_api_unified_internal", + ":gradient_checker", + ":gradients_internal", + ":gradients_util", + ":mnist_gradients_testutil", + "//tensorflow/c:c_api", + "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c/experimental/gradients:math_grad", + "//tensorflow/c/experimental/gradients:nn_grad", + "//tensorflow/c/experimental/ops:array_ops", + "//tensorflow/c/experimental/ops:math_ops", + "//tensorflow/c/experimental/ops:nn_ops", + "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + tf_cuda_cc_test( name = "mnist_gradients_test", size = "small", @@ -290,19 +408,16 @@ tf_cuda_cc_test( "mnist_gradients_test.cc", ], args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + [ "nomac", - "notap", # TODO(b/166150182): Enable - "no_oss", # TODO(b/166150182): Enable ], deps = [ ":abstract_tensor_handle", ":c_api_experimental", - ":c_api_test_util", ":c_api_unified_internal", ":gradients_internal", + ":gradients_util", ":mnist_gradients_testutil", "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", @@ -526,6 +641,19 @@ cc_library( ], ) +cc_header_only_library( + name = "tfe_tensorhandle_internal_hdrs_only", + extra_deps = [ + "@com_google_absl//absl/strings", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":tfe_tensorhandle_internal", + ], +) + tf_cuda_library( name = "c_api_test_util", testonly = 1, @@ -539,6 +667,8 @@ tf_cuda_library( ":c_api", ":c_api_experimental", "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/c:tf_tensor", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -553,7 +683,6 @@ tf_cuda_cc_test( "c_api_debug_test.cc", "c_api_test.cc", ], - extra_copts = tfe_xla_copts(), tags = [ "noguitar", # TODO(b/155445984): flaky #"guitar", @@ -608,7 +737,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", ], @@ -641,7 +769,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", ], @@ -660,7 +787,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", "noasan", # leaks gRPC server instances @@ -694,7 +820,6 @@ tf_cuda_cc_test( ], # TODO(b/136478427): Figure out how to correctly shut the server down args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), tags = [ "no_windows", ], @@ -729,7 +854,7 @@ tf_cuda_library( "c_api_experimental.h", "c_api_unified_experimental.h", ], - copts = tf_copts() + tfe_xla_copts(), + copts = tf_copts(), visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ @@ -801,7 +926,6 @@ tf_cuda_cc_test( "c_api_experimental_test.cc", ], args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ @@ -814,6 +938,7 @@ tf_cuda_cc_test( 
"//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:status", "@com_google_absl//absl/strings", ], ) @@ -825,7 +950,6 @@ tf_cuda_cc_test( "c_api_unified_experimental_test.cc", ], args = ["--heap_check=local"], - extra_copts = tfe_xla_copts(), linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + ["nomac"], deps = [ @@ -834,6 +958,7 @@ tf_cuda_cc_test( ":c_api_test_util", "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", + "//tensorflow/c:tf_status_helper", "//tensorflow/cc/profiler", "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/core:lib", @@ -943,7 +1068,13 @@ filegroup( "c_api_unified_experimental_eager.cc", "c_api_unified_experimental_graph.cc", "c_api_unified_experimental_internal.h", + "gradient_checker.cc", + "gradient_checker.h", "gradients.cc", # Uses RTTI. + "gradients_util.cc", + "gradients_util.h", + "tracing_utils.h", + "tracing_utils.cc", "*test*", "*dlpack*", ], diff --git a/tensorflow/c/eager/abstract_context.h b/tensorflow/c/eager/abstract_context.h index b488255d150..d31b1e13611 100644 --- a/tensorflow/c/eager/abstract_context.h +++ b/tensorflow/c/eager/abstract_context.h @@ -32,7 +32,7 @@ namespace tensorflow { // environment, a traced representation etc. class AbstractContext { protected: - enum AbstractContextKind { kGraph, kMlir, kEager, kTfrt }; + enum AbstractContextKind { kGraph, kMlir, kEager, kTfrt, kTape }; explicit AbstractContext(AbstractContextKind kind) : kind_(kind) {} virtual ~AbstractContext() {} diff --git a/tensorflow/c/eager/abstract_operation.h b/tensorflow/c/eager/abstract_operation.h index b332679cc7c..4c630528f5d 100644 --- a/tensorflow/c/eager/abstract_operation.h +++ b/tensorflow/c/eager/abstract_operation.h @@ -30,7 +30,7 @@ namespace tensorflow { // tracing or immediate execution mode. class AbstractOperation { protected: - enum AbstractOperationKind { kGraph, kMlir, kEager, kTfrt }; + enum AbstractOperationKind { kGraph, kMlir, kEager, kTfrt, kTape }; explicit AbstractOperation(AbstractOperationKind kind) : kind_(kind) {} virtual ~AbstractOperation() {} diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index fefa753c608..5f388bfe0cd 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -39,7 +39,7 @@ limitations under the License. #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_tensor_internal.h" -#ifdef PLATFORM_GOOGLE +#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) #include "tensorflow/core/tfrt/eager/c_api_tfrt.h" #endif #include "tensorflow/core/common_runtime/device.h" @@ -51,9 +51,6 @@ limitations under the License. #include "tensorflow/core/protobuf/device_filters.pb.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/util/device_name_utils.h" -#ifdef TENSORFLOW_EAGER_USE_XLA -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#endif // TENSORFLOW_EAGER_USE_XLA #include "tensorflow/core/common_runtime/copy_tensor.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" @@ -629,21 +626,30 @@ tensorflow::Status UpdateTFE_ContextWithServerDef( "targets will fail."; } } else { - // The master's context_view_id will be incremented by one - // the UpdateRemoteMaster call later. 
We want all new workers and - existing workers to also have the updated context_view_id, so - we must set their context_view_id to the existing master's - context_view_id + 1. - sg.Update(CreateRemoteContexts( - ctx, added_workers, context_id, context_view_id + 1, keep_alive_secs, - server_def, remote_eager_workers.get(), context->Executor().Async(), - context->LazyCopyFunctionRemoteInputs(), base_request)); + if (sg.ok()) { + // Create remote contexts on the newly added workers only if the master + // has collected all device information from them (i.e., the + // GetAllRemoteDevices call returns successfully). Note that in rare cases + // GetAllRemoteDevices can still fail even with RPCs configured to wait + // until the remote workers become alive. If the master creates remote + // contexts on the workers whose devices are still not collected, those + // workers will be treated as existing workers subsequently, so the master + // will never get devices from them even with retrying UpdateServerDef. + sg.Update(CreateRemoteContexts( + ctx, added_workers, context_id, context_view_id + 1, keep_alive_secs, + server_def, remote_eager_workers.get(), context->Executor().Async(), + context->LazyCopyFunctionRemoteInputs(), base_request)); + } if (!existing_workers.empty()) { if (VLOG_IS_ON(1)) { for (const string& w : existing_workers) { VLOG(1) << "Updating cluster with existing worker " << w; } } + // The master's context_view_id will be incremented by one in the + // UpdateRemoteMaster call later. We want existing workers to also have + // the updated context_view_id, so we must set their context_view_id to + // the master's current context_view_id + 1. sg.Update(UpdateRemoteContexts(ctx, existing_workers, added_workers, removed_workers, context_id, context_view_id + 1, server_def, @@ -723,7 +729,7 @@ void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { if (opts->use_tfrt) { -#ifdef PLATFORM_GOOGLE +#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) return tensorflow::wrap(new tfrt::tf::ContextInterface(opts->async)); #else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); @@ -745,10 +751,8 @@ TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { opts->session_options.options, static_cast( opts->device_placement_policy), - static_cast(opts->mirroring_policy), opts->async, opts->lazy_remote_inputs_copy, device_mgr.release(), - /*device_mgr_owned*/ true, r, - tensorflow::GetDefaultCustomKernelCreator())); + /*device_mgr_owned*/ true, r)); } void TFE_DeleteContext(TFE_Context* ctx) { @@ -851,20 +855,9 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, #else // !defined(IS_MOBILE_PLATFORM) tensorflow::EagerContext* context = tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - tensorflow::GrpcServer* grpc_server = - static_cast(context->GetServer()); - - std::unique_ptr remote_eager_workers; - status->status = grpc_server->master_env()->worker_cache->GetEagerClientCache( - &remote_eager_workers); - if (!status->status.ok()) { - LOG(ERROR) << "Failed to get client cache for remote workers."; - return false; - } - // TODO(yuefengz): support partially specified `worker_name`.
tensorflow::core::RefCountPtr eager_client; - status->status = remote_eager_workers->GetClient(worker_name, &eager_client); + status->status = context->GetClient(worker_name, &eager_client); if (!status->status.ok()) { return false; } @@ -911,9 +904,7 @@ TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, void TFE_ContextSetThreadLocalDevicePlacementPolicy( TFE_Context* ctx, TFE_ContextDevicePlacementPolicy policy) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetThreadLocalDevicePlacementPolicy( + tensorflow::unwrap(ctx)->SetThreadLocalDevicePlacementPolicy( static_cast(policy)); } @@ -922,10 +913,8 @@ void TFE_ContextSetThreadLocalDevicePlacementPolicy( // safe to call this function from the async EagerExecutor threads. extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy( TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); return static_cast( - context->GetDevicePlacementPolicy()); + tensorflow::unwrap(ctx)->GetDevicePlacementPolicy()); } TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status) { @@ -1149,26 +1138,23 @@ void TFE_DeleteOp(TFE_Op* op) { tensorflow::unwrap(op)->Release(); } +const char* TFE_OpGetName(const TFE_Op* op, TF_Status* status) { + return tensorflow::unwrap(op)->Name().c_str(); +} + +TFE_Context* TFE_OpGetContext(const TFE_Op* op, TF_Status* status) { + return tensorflow::wrap( + &(OperationFromInterface(tensorflow::unwrap(op))->EagerContext())); +} + void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status) { status->status = tensorflow::unwrap(op)->SetDeviceName(device_name); } -const char* TFE_OpGetDevice(TFE_Op* op, TF_Status* status) { +const char* TFE_OpGetDevice(const TFE_Op* op, TF_Status* status) { return tensorflow::unwrap(op)->DeviceName().c_str(); } -void TFE_OpSetXLACompilation(TFE_Op* op, unsigned char enable) { -#ifdef TENSORFLOW_EAGER_USE_XLA - tensorflow::Status s = tensorflow::unwrap(op)->SetUseXla(enable); - if (!s.ok()) { - LOG(ERROR) << "Could not enable XLA compilation for op: " << s; - } -#else - LOG(WARNING) << "This call is a no-op, as the TensorFlow library is not " - "built with XLA support."; -#endif // TENSORFLOW_EAGER_USE_XLA -} - void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status) { status->status = tensorflow::unwrap(op)->AddInput(tensorflow::unwrap(input)); } @@ -1181,6 +1167,15 @@ void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs, static_cast(num_inputs)}); } +extern int TFE_OpGetFlatInputCount(const TFE_Op* op, TF_Status* status) { + return tensorflow::unwrap(op)->GetInputs().size(); +} + +extern TFE_TensorHandle* TFE_OpGetFlatInput(const TFE_Op* op, int index, + TF_Status* status) { + return tensorflow::wrap(tensorflow::unwrap(op)->GetInputs()[index]); +} + TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, TF_Status* status) { TF_AttrType ret = TF_ATTR_INT; @@ -1430,21 +1425,15 @@ void TFE_ContextRemoveFunction(TFE_Context* ctx, const char* name, } unsigned char TFE_ContextHasFunction(TFE_Context* ctx, const char* name) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - return context->FindFunctionDef(name) != nullptr; + return tensorflow::unwrap(ctx)->FindFunctionDef(name) != nullptr; } void TFE_ContextEnableRunMetadata(TFE_Context* ctx) { - tensorflow::EagerContext* context = - 
tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetShouldStoreGraphs(true); + tensorflow::unwrap(ctx)->SetShouldStoreGraphs(true); } void TFE_ContextDisableRunMetadata(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetShouldStoreGraphs(false); + tensorflow::unwrap(ctx)->SetShouldStoreGraphs(false); } } // extern "C" @@ -1486,7 +1475,7 @@ void TFE_ContextEndStep(TFE_Context* ctx) { tensorflow::unwrap(ctx)->EndStep(); } -const TFE_OpAttrs* TFE_OpGetAttrs(TFE_Op* op) { +const TFE_OpAttrs* TFE_OpGetAttrs(const TFE_Op* op) { return tensorflow::wrap( &OperationFromInterface(tensorflow::unwrap(op))->Attrs()); } @@ -1551,8 +1540,67 @@ void SetOpAttrValueScalar(TFE_Context* ctx, TFE_Op* op, TFE_OpSetAttrFunction(op, attr_name, func_op); TFE_DeleteOp(func_op); } break; - case tensorflow::AttrValue::kList: - TF_FALLTHROUGH_INTENDED; + case tensorflow::AttrValue::kList: { + // String + if (const int s_size = default_value.list().s_size()) { + absl::InlinedVector values_vector; + absl::InlinedVector lengths_vector; + for (int i = 0; i < s_size; ++i) { + const string& v = default_value.list().s(i); + values_vector.push_back(v.data()); + lengths_vector.push_back(v.size()); + } + TFE_OpSetAttrStringList(op, attr_name, values_vector.data(), + lengths_vector.data(), s_size); + } + + // Int + if (const int i_size = default_value.list().i_size()) { + absl::InlinedVector i_vector; + for (int i = 0; i < i_size; ++i) { + i_vector.push_back(default_value.list().i(i)); + } + TFE_OpSetAttrIntList(op, attr_name, i_vector.data(), i_size); + } + // Float + if (const int f_size = default_value.list().f_size()) { + absl::InlinedVector f_vector; + for (int i = 0; i < f_size; ++i) { + f_vector.push_back(default_value.list().f(i)); + } + TFE_OpSetAttrFloatList(op, attr_name, f_vector.data(), f_size); + } + // Bool + if (const int b_size = default_value.list().b_size()) { + absl::InlinedVector b_vector; + for (int i = 0; i < b_size; i++) { + b_vector.push_back(default_value.list().b(i)); + } + TFE_OpSetAttrBoolList(op, attr_name, b_vector.data(), b_size); + } + // Type + if (const int type_size = default_value.list().type_size()) { + absl::InlinedVector type_vector; + for (int i = 0; i < type_size; ++i) { + type_vector.push_back(default_value.list().type(i)); + } + TFE_OpSetAttrTypeList( + op, attr_name, + reinterpret_cast(type_vector.data()), + type_size); + } + + // Rest are not supported. 
+ if (default_value.list().shape_size() > 0 || + default_value.list().func_size() > 0 || + default_value.list().tensor_size() > 0) { + TF_SetStatus( + status, TF_UNIMPLEMENTED, + tensorflow::strings::StrCat("Unable to set attribute from default value: ", + default_value.DebugString()) + .data()); + } + } break; case tensorflow::AttrValue::kTensor: TF_FALLTHROUGH_INTENDED; case tensorflow::AttrValue::kPlaceholder: @@ -1612,19 +1660,12 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { return status.status; } - tensorflow::Status Execute(tensorflow::EagerOperation* op, + tensorflow::Status Execute(const tensorflow::EagerOperation* op, tensorflow::TensorHandle** retvals, int* num_retvals) override { - std::vector inputs; - inputs.reserve(op->Inputs().size()); - for (int i = 0; i < op->Inputs().size(); ++i) { - op->Inputs()[i]->Ref(); - inputs.push_back(tensorflow::wrap(op->Inputs()[i])); - } std::vector outputs(*num_retvals); TF_Status status; - device_.execute(context_, inputs.size(), inputs.data(), op->Name().c_str(), - wrap(&op->Attrs()), num_retvals, outputs.data(), &status, + device_.execute(tensorflow::wrap(op), num_retvals, outputs.data(), &status, info_); if (status.status.ok()) { for (int i = 0; i < *num_retvals; ++i) { @@ -1634,10 +1675,6 @@ class CustomDeviceAPI : public tensorflow::CustomDevice { TFE_DeleteTensorHandle(outputs[i]); } } - - for (auto inp : inputs) { - TFE_DeleteTensorHandle(inp); - } return status.status; } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 5afe3047dd7..0afb69bb82c 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -74,7 +74,7 @@ typedef enum TFE_ContextDevicePlacementPolicy { // Placement policy which silently copies int32 tensors but not other dtypes. TFE_DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3, } TFE_ContextDevicePlacementPolicy; -// LINT.ThenChange(//tensorflow/core/common_runtime/eager/context.h) +// LINT.ThenChange(//tensorflow/c/eager/immediate_execution_context.h) // Sets the default execution mode (sync/async). Note that this can be // overridden per thread using TFE_ContextSetExecutorForThread. @@ -248,22 +248,22 @@ typedef struct TFE_Op TFE_Op; TF_CAPI_EXPORT extern TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name, TF_Status* status); - TF_CAPI_EXPORT extern void TFE_DeleteOp(TFE_Op* op); +// Returns the op or function name `op` will execute. +// +// The returned string remains valid throughout the lifetime of 'op'. +TF_CAPI_EXPORT extern const char* TFE_OpGetName(const TFE_Op* op, + TF_Status* status); +TF_CAPI_EXPORT extern TFE_Context* TFE_OpGetContext(const TFE_Op* op, + TF_Status* status); + TF_CAPI_EXPORT extern void TFE_OpSetDevice(TFE_Op* op, const char* device_name, TF_Status* status); // The returned string remains valid throughout the lifetime of 'op'. -TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(TFE_Op* op, +TF_CAPI_EXPORT extern const char* TFE_OpGetDevice(const TFE_Op* op, TF_Status* status); -// When 'enable' is set to 1, and if TensorFlow library is built with XLA -// support, a subsequent TFE_Execute() call on `op` will run the op via XLA. -// -// If the library is not built with XLA support, this call would be a no-op.
-TF_CAPI_EXPORT extern void TFE_OpSetXLACompilation(TFE_Op* op, - unsigned char enable); - TF_CAPI_EXPORT extern void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* input, TF_Status* status); @@ -272,6 +272,23 @@ TF_CAPI_EXPORT extern void TFE_OpAddInputList(TFE_Op* op, int num_inputs, TF_Status* status); +// Fetches the current number of inputs attached to `op`. +// +// Does not use the operation's definition to determine how many inputs should +// be attached. It is intended for use with TFE_OpGetFlatInput to inspect an +// already-finalized operation. +// +// Note that TFE_OpGetFlatInputCount and TFE_OpGetFlatInput operate on a flat +// sequence of inputs, unlike TFE_OpGetInputLength (for getting the length of a +// particular named input list, which may only be part of the op's inputs). +TF_CAPI_EXPORT extern int TFE_OpGetFlatInputCount(const TFE_Op* op, + TF_Status* status); +// Returns a borrowed reference to one of `op`'s inputs. Use +// `TFE_TensorHandleCopySharingTensor` to make a new reference. +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_OpGetFlatInput(const TFE_Op* op, + int index, + TF_Status* status); + TF_CAPI_EXPORT extern TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name, unsigned char* is_list, diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc index dd55f05283b..b5721cdab0a 100644 --- a/tensorflow/c/eager/c_api_debug.cc +++ b/tensorflow/c/eager/c_api_debug.cc @@ -22,9 +22,6 @@ limitations under the License. #include "tensorflow/c/tf_status_internal.h" #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/platform/status.h" -#ifdef TENSORFLOW_EAGER_USE_XLA -#include "tensorflow/compiler/jit/xla_device.h" -#endif // TENSORFLOW_EAGER_USE_XLA using tensorflow::string; @@ -64,87 +61,6 @@ TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( return nullptr; } -#ifdef TENSORFLOW_EAGER_USE_XLA - auto* device = absl::get(handle->device()); - - // If tensor resides on an XLA device, use XLA device's PaddedShapeFn. - auto* xla_device = dynamic_cast(device); - if (xla_device != nullptr) { - tensorflow::XlaDevice::PaddedShapeFn shape_fn = - xla_device->metadata().padded_shape_fn(); - xla::Shape padded_shape; - status->status = shape_fn(*tensor, &padded_shape); - if (!status->status.ok()) { - return nullptr; - } - if (VLOG_IS_ON(3)) { - std::vector shape_to_log = - TensorShapeAsVector(*handle, &status->status); - if (!status->status.ok()) { - // Ignore the status here as we are simply logging. - status->status = tensorflow::Status::OK(); - } else { - VLOG(3) << "Fully padded shape of [" - << absl::StrJoin(shape_to_log, ", ") << "] is " - << padded_shape.DebugString(); - } - } - - if (padded_shape.IsTuple()) { - if (xla::ShapeUtil::TupleElementCount(padded_shape) != 2) { - // Currently, the only case of XlaTensor containing a tuple shape is to - // represent 64 bit ints, doubles, and complex numbers (we don't support - // 64bit complex numbers). - status->status = tensorflow::errors::InvalidArgument( - "XlaTensors should only contain tuples of size 2. Shape: ", - padded_shape.DebugString()); - return nullptr; - } - - // shape0 is not a const& because we will assign it to padded_shape below. - // It is illegal to assign a part of a message to itself. 
- xla::Shape shape0 = xla::ShapeUtil::GetTupleElementShape(padded_shape, 0); - const xla::Shape& shape1 = - xla::ShapeUtil::GetTupleElementShape(padded_shape, 1); - if (shape0.IsTuple() || shape1.IsTuple()) { - status->status = tensorflow::errors::InvalidArgument( - "XlaTensors should not contain nested tuples. Shape: ", - padded_shape.DebugString()); - return nullptr; - } - if (!xla::ShapeUtil::Equal(shape0, shape1)) { - status->status = tensorflow::errors::InvalidArgument( - "Subshapes of XlaTensors should be the same. Shape: ", - padded_shape.DebugString()); - return nullptr; - } - - // Since the only case we handle here are two equal subshapes, we - // simply return one of them. The caller will interpret it as this - // shape directly storing the 64bit types. This approximation is good - // enough for this API's debugging use case. - padded_shape = shape0; - } - - int rank = padded_shape.dimensions_size(); - std::vector dev_dims; - dev_dims.reserve(rank); - if (rank == 1) { - // Rank 1 tensors might not have padded_shape.layout.minor_to_major set, - dev_dims.push_back(padded_shape.dimensions(0)); - } else { - for (int i = rank - 1; i >= 0; --i) { - tensorflow::int64 dim_index = padded_shape.layout().minor_to_major(i); - dev_dims.push_back(padded_shape.dimensions(dim_index)); - } - } - status->status = tensorflow::Status::OK(); - return new TFE_TensorDebugInfo(dev_dims); - } -#endif // TENSORFLOW_EAGER_USE_XLA - - // If the tensor is not an XLA tensor, the device shape is - // the same as regular tensor shape. std::vector dev_dims = TensorShapeAsVector(*handle, &status->status); if (!status->status.ok()) { diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc index cf35c2d634d..d21cadfd0cb 100644 --- a/tensorflow/c/eager/c_api_distributed_test.cc +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -121,25 +121,6 @@ string AddVariablesFunction() { return def.SerializeAsString(); } -void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { - TF_Status* status = TF_NewStatus(); - TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(op, var_handle, status); - TFE_TensorHandle* is_initialized[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(op, &is_initialized[0], &num_retvals, status); - CHECK_EQ(1, num_retvals); - TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); - bool initialized = false; - memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); - EXPECT_EQ(initialized, true); - TF_DeleteTensor(t); - TFE_DeleteTensorHandle(is_initialized[0]); - TFE_DeleteOp(op); - delete status; -} - void TestFunctionWithPackedInput(const bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); @@ -182,9 +163,8 @@ void TestFunctionWithPackedInput(const bool remote) { // Add a sync point in order to make sure that variables have been initialized // before the function execution starts. - // TODO(b/155789951): Remove once b/155789951 is fixed. - VarIsInitialized(ctx, h1); - VarIsInitialized(ctx, h2); + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); // Pack 3 variable handles into one TFE_TensorHandle. // When remote is false, function device is placed on task0. 
Handle types are @@ -396,6 +376,8 @@ TEST(CAPI, DistributedFunctionGraphPassOnlyOnce) { TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); EXPECT_NE(var_handle, nullptr); + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); const string function_def = VariableAddFunction(); TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), @@ -517,6 +499,8 @@ void TestDistributedFunctionCancellation(bool inject_error) { TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); EXPECT_NE(var_handle, nullptr); + TFE_ContextAsyncWait(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); const string function_def = inject_error ? VariableAddFunctionWithGraphError() : VariableAddFunction(); @@ -561,7 +545,9 @@ TEST(CAPI, DistributedFunctionNoError) { TestDistributedFunctionCancellation(false); } -TEST(CAPI, DistributedFunctionCancelledOnError) { +// TODO(b/170399182): Update test once an alternative to using the function +// optimization hook is in place. +TEST(CAPI, DISABLED_DistributedFunctionCancelledOnError) { TestDistributedFunctionCancellation(true); } diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 7390cf243be..1ef536a66f6 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -49,15 +49,11 @@ void TFE_OpReset(TFE_Op* op_to_reset, const char* op_or_function_name, } void TFE_ContextEnableGraphCollection(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetShouldStoreGraphs(true); + tensorflow::unwrap(ctx)->SetShouldStoreGraphs(true); } void TFE_ContextDisableGraphCollection(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetShouldStoreGraphs(false); + tensorflow::unwrap(ctx)->SetShouldStoreGraphs(false); } uint64_t TFE_GetContextId(TFE_Context* ctx) { @@ -486,29 +482,6 @@ TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2( static_cast(sampler->sampler->GetCell(label1, label2))); } -void TFE_ContextOptionsSetMirroringPolicy(TFE_ContextOptions* options, - TFE_ContextMirroringPolicy policy) { - options->mirroring_policy = policy; -} - -void TFE_ContextSetThreadLocalMirroringPolicy( - TFE_Context* ctx, TFE_ContextMirroringPolicy policy) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetThreadLocalMirroringPolicy( - static_cast(policy)); -} - -// Note: this function looks up a thread local policy. So it should be called in -// the appropriate client thread. In particular, in async mode, it may not be -// safe to call this function from the async EagerExecutor threads. 
-extern TFE_ContextMirroringPolicy TFE_ContextGetMirroringPolicy( - TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - return static_cast(context->GetMirroringPolicy()); -} - void TFE_ContextOptionsSetLazyRemoteInputsCopy(TFE_ContextOptions* options, bool lazy_copy) { options->lazy_remote_inputs_copy = lazy_copy; @@ -567,22 +540,16 @@ void TFE_ExecutorClearError(TFE_Executor* executor) { } void TFE_ContextSetExecutorForThread(TFE_Context* ctx, TFE_Executor* executor) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetExecutorForThread(executor->executor()); + tensorflow::unwrap(ctx)->SetExecutorForThread(executor->executor()); } TFE_Executor* TFE_ContextGetExecutorForThread(TFE_Context* ctx) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - return new TFE_Executor(&context->Executor()); + return new TFE_Executor(&tensorflow::unwrap(ctx)->Executor()); } void TFE_HostAddressSpace(TFE_Context* ctx, TF_Buffer* buf) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); auto address_space = tensorflow::DeviceNameUtils::AddressSpace( - context->HostCPU()->parsed_name()); + tensorflow::unwrap(ctx)->HostCPUParsedName()); auto str = tensorflow::DeviceNameUtils::ParsedNameToString(address_space); void* data = tensorflow::port::Malloc(str.length()); str.copy(static_cast(data), str.length(), 0); @@ -595,9 +562,7 @@ void TFE_HostAddressSpace(TFE_Context* ctx, TF_Buffer* buf) { void TFE_ContextGetFunctionDef(TFE_Context* ctx, const char* function_name, TF_Buffer* buf, TF_Status* status) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - auto* function_def = context->FindFunctionDef(function_name); + auto* function_def = tensorflow::unwrap(ctx)->FindFunctionDef(function_name); if (function_def == nullptr) { status->status = tensorflow::errors::NotFound( "Unable to find FunctionDef with name: ", function_name); @@ -666,14 +631,26 @@ TFE_TensorHandle* TFE_CreatePackedTensorHandle(TFE_Context* ctx, void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, unsigned char enable, TF_Status* status) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetAllowSoftPlacement(enable); + tensorflow::unwrap(ctx)->SetAllowSoftPlacement(enable); } void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable, TF_Status* status) { - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - context->SetLogDevicePlacement(enable); + tensorflow::unwrap(ctx)->SetLogDevicePlacement(enable); +} + +const char* TFE_TensorHandleDeviceType(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr) { + status->status = tensorflow::errors::InvalidArgument("Invalid handle"); + return nullptr; + } + return tensorflow::unwrap(h)->DeviceType(&status->status); +} + +int TFE_TensorHandleDeviceID(TFE_TensorHandle* h, TF_Status* status) { + if (h == nullptr) { + status->status = tensorflow::errors::InvalidArgument("Invalid handle"); + return -1; + } + return tensorflow::unwrap(h)->DeviceId(&status->status); } diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 1af76c01154..d0739a5437d 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -265,33 
+265,6 @@ TF_CAPI_EXPORT extern void TFE_MonitoringDeleteSampler2( TF_CAPI_EXPORT extern TFE_MonitoringSamplerCell* TFE_MonitoringGetCellSampler2( TFE_MonitoringSampler2* sampler, const char* label1, const char* label2); -// LINT.IfChange -// Note: Keep in sync with internal copy of enum in eager/context.h. -typedef enum TFE_ContextMirroringPolicy { - // Do not maintain mirrors in a TensorHandle, instead make new TensorHandle - // copies with their own lifetime. - TFE_MIRRORING_NONE = 0, - // Mirroring any remote tensor handles, associating them with the lifetime of - // the local TensorHandle. - TFE_MIRRORING_ALL = 1, -} TFE_ContextMirroringPolicy; -// LINT.ThenChange(//tensorflow/core/common_runtime/eager/context.h) - -TF_CAPI_EXPORT extern void TFE_ContextOptionsSetMirroringPolicy( - TFE_ContextOptions*, TFE_ContextMirroringPolicy); - -// Sets a thread-local mirroring policy. After this call, other calls to -// TFE_Execute in the same thread will use the mirroring policy specified here -// instead of the mirroring policy used to construct the context. This has no -// effect on the mirroring policy used by other program threads. -TF_CAPI_EXPORT extern void TFE_ContextSetThreadLocalMirroringPolicy( - TFE_Context*, TFE_ContextMirroringPolicy); - -// Returns the mirroring policy to be used by this context in the current -// thread. -TF_CAPI_EXPORT extern TFE_ContextMirroringPolicy TFE_ContextGetMirroringPolicy( - TFE_Context*); - // Sets whether to copy the remote inputs of a function lazily. TF_CAPI_EXPORT extern void TFE_ContextOptionsSetLazyRemoteInputsCopy( TFE_ContextOptions*, bool lazy_copy); @@ -441,7 +414,7 @@ typedef struct TFE_OpAttrs TFE_OpAttrs; // Fetch a reference to `op`'s attributes. The returned reference is only valid // while `op` is alive. -const TFE_OpAttrs* TFE_OpGetAttrs(TFE_Op* op); +TF_CAPI_EXPORT extern const TFE_OpAttrs* TFE_OpGetAttrs(const TFE_Op* op); // Add attributes in `attrs` to `op`. // // Does not overwrite or update existing attributes, but adds new ones. @@ -462,7 +435,11 @@ TF_CAPI_EXPORT extern void TFE_OpSetAttrValueProto(const TFE_Op* op, size_t proto_len, TF_Status* status); -#define TFE_CUSTOM_DEVICE_VERSION 2 +// TODO(b/166642410): It would be nice, for custom devices and for other users, +// to have a non-string representation of devices (TF_Device) extracted from +// tensors/ops/etc. and usable in APIs like OpSetDevice/ResetOp/etc. + +#define TFE_CUSTOM_DEVICE_VERSION 3 // Struct to be filled in typedef struct TFE_CustomDevice { @@ -481,9 +458,16 @@ typedef struct TFE_CustomDevice { void* device_info); // Method to execute an operation. - void (*execute)(TFE_Context* context, int num_inputs, - TFE_TensorHandle** inputs, const char* operation_name, - const TFE_OpAttrs* attributes, int* num_outputs, + // + // Arguments provide enough information to reconstruct the original `TFE_Op`, + // or construct a transformed version, by inspecting the passed `op`. + // + // TFE_OpGetDevice(op) records the original placement of the operation. It may + // be an empty string if no device was explicitly requested, but will + // otherwise be the name of this custom device. Ops are placed onto a custom + // device if any of their inputs are on that custom device, but custom devices + // are free to set a bad status in order to require explicit placement. + void (*execute)(const TFE_Op* op, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, void* device_info); // Method to delete a device. 
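For illustration, a minimal sketch of a version-3 custom device `execute` callback that rebuilds the op it is handed using only the const accessors this change introduces (TFE_OpGetName, TFE_OpGetAttrs, TFE_OpGetFlatInputCount, TFE_OpGetFlatInput) and forwards it to a wrapped device. `ForwardingDeviceInfo`, `underlying_device`, and `ForwardingExecute` are hypothetical names used for this sketch only and are not part of the change; error handling is kept minimal.

#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

// Hypothetical per-device state supplied as `device_info` at registration time.
typedef struct ForwardingDeviceInfo {
  TFE_Context* ctx;               // context used to re-create the op
  const char* underlying_device;  // physical device to forward execution to
} ForwardingDeviceInfo;

// Version-3 `execute` callback: inspect the const op, rebuild it on the
// wrapped device, and run it.
void ForwardingExecute(const TFE_Op* original_op, int* num_outputs,
                       TFE_TensorHandle** outputs, TF_Status* s,
                       void* device_info) {
  ForwardingDeviceInfo* info = (ForwardingDeviceInfo*)device_info;
  TFE_Op* op = TFE_NewOp(info->ctx, TFE_OpGetName(original_op, s), s);
  if (TF_GetCode(s) != TF_OK) return;
  TFE_OpAddAttrs(op, TFE_OpGetAttrs(original_op));
  TFE_OpSetDevice(op, info->underlying_device, s);
  int num_inputs = TFE_OpGetFlatInputCount(original_op, s);
  for (int i = 0; TF_GetCode(s) == TF_OK && i < num_inputs; ++i) {
    // TFE_OpGetFlatInput returns a borrowed reference; do not delete it here.
    TFE_OpAddInput(op, TFE_OpGetFlatInput(original_op, i, s), s);
  }
  if (TF_GetCode(s) == TF_OK) TFE_Execute(op, outputs, num_outputs, s);
  TFE_DeleteOp(op);
}

The CloneOp helper added to c_api_test.cc later in this change exercises the same inspect-and-rebuild pattern end to end.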
@@ -569,6 +553,14 @@ TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable, TF_Status* status); +// Returns the device type of the operation that produced `h`. +TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceType( + TFE_TensorHandle* h, TF_Status* status); + +// Returns the device ID of the operation that produced `h`. +TF_CAPI_EXPORT extern int TFE_TensorHandleDeviceID(TFE_TensorHandle* h, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_experimental_test.cc b/tensorflow/c/eager/c_api_experimental_test.cc index a4d31417073..4fe83b5116d 100644 --- a/tensorflow/c/eager/c_api_experimental_test.cc +++ b/tensorflow/c/eager/c_api_experimental_test.cc @@ -316,86 +316,6 @@ TEST(CAPI, Function_ident_CPU) { TF_DeleteStatus(status); } -#ifdef TENSORFLOW_EAGER_USE_XLA -TEST(CAPI, Function_ident_XLA_CPU) { - // First create a simple identity function. - TF_Graph* function_graph = TF_NewGraph(); - TF_OperationDescription* arg_descr = - TF_NewOperation(function_graph, "Placeholder", "arg"); - TF_SetAttrType(arg_descr, "dtype", TF_INT32); - TF_Status* status = TF_NewStatus(); - TF_Operation* arg = TF_FinishOperation(arg_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_OperationDescription* id_descr = - TF_NewOperation(function_graph, "Identity", "id"); - TF_SetAttrType(id_descr, "T", TF_INT32); - TF_AddInput(id_descr, {arg, 0}); - TF_Operation* id = TF_FinishOperation(id_descr, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_Output input{arg, 0}; - TF_Output output{id, 0}; - TF_Function* fn = - TF_GraphToFunction(function_graph, "ident", 0, 1, &id, 1, &input, 1, - &output, nullptr, nullptr, "test", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteGraph(function_graph); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_Context* ctx = TFE_NewContext(opts, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContextOptions(opts); - TFE_ContextAddFunction(ctx, fn, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteFunction(fn); - - for (bool async : {false, true, false}) { - TFE_Executor* old_executor = TFE_ContextGetExecutorForThread(ctx); - TFE_Executor* executor = TFE_NewExecutor(async); - TFE_ContextSetExecutorForThread(ctx, executor); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK); - TF_Tensor* t = - TF_AllocateTensor(TF_INT32, nullptr, 0, 1 * sizeof(tensorflow::int32)); - *reinterpret_cast(TF_TensorData(t)) = 42; - TFE_TensorHandle* h = TFE_NewTensorHandle(t, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteTensor(t); - - TFE_Op* op = TFE_NewOp(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_OpAddInput(op, h, status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - - // Now run it via XLA. 
- TFE_OpSetXLACompilation(op, true); - - std::vector result; - result.push_back(nullptr); - int num_retvals = 1; - TFE_Execute(op, result.data(), &num_retvals, status); - TFE_DeleteOp(op); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - ASSERT_EQ(num_retvals, 1); - - TF_Tensor* r = TFE_TensorHandleResolve(result[0], status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - EXPECT_EQ(*reinterpret_cast(TF_TensorData(r)), 42); - TFE_ContextSetExecutorForThread(ctx, old_executor); - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteExecutor(executor); - TFE_DeleteExecutor(old_executor); - TFE_DeleteTensorHandle(h); - TF_DeleteTensor(r); - TFE_DeleteTensorHandle(result[0]); - } - TFE_ContextRemoveFunction(ctx, "ident", status); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TFE_DeleteContext(ctx); - ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); - TF_DeleteStatus(status); -} -#endif // TENSORFLOW_EAGER_USE_XLA - void Executor_MatMul_CPU(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -491,5 +411,109 @@ TEST(CAPI, TensorHandleOnDeviceMemory) { TF_DeleteStatus(status); } +TEST(CAPI, TensorHandleNullptr) { + TFE_TensorHandle* h = nullptr; + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + const char* device_type = TFE_TensorHandleDeviceType(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(device_type, nullptr); + ASSERT_EQ("Invalid handle", string(TF_Message(status.get()))); + + TF_SetStatus(status.get(), TF_OK, ""); + + int device_id = TFE_TensorHandleDeviceID(h, status.get()); + ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); + ASSERT_EQ(device_id, -1); + ASSERT_EQ("Invalid handle", string(TF_Message(status.get()))); +} + +TEST(CAPI, TensorHandleDevices) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status.get()); + TFE_DeleteContextOptions(opts); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + TFE_TensorHandle* hcpu = TestMatrixTensorHandle(ctx); + const char* device_type = TFE_TensorHandleDeviceType(hcpu, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(device_type, "CPU")) << device_type; + int device_id = TFE_TensorHandleDeviceID(hcpu, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_EQ(0, device_id) << device_id; + + // Disable the test if no GPU is present. 
+ string gpu_device_name; + if (GetDeviceName(ctx, &gpu_device_name, "GPU")) { + TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( + hcpu, ctx, gpu_device_name.c_str(), status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TFE_Op* shape_op = ShapeOp(ctx, hgpu); + TFE_OpSetDevice(shape_op, gpu_device_name.c_str(), status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(shape_op, &retvals[0], &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + device_type = TFE_TensorHandleDeviceType(retvals[0], status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(device_type, "GPU")) << device_type; + + device_id = TFE_TensorHandleDeviceID(retvals[0], status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_EQ(0, device_id) << device_id; + + TFE_DeleteOp(shape_op); + TFE_DeleteTensorHandle(retvals[0]); + TFE_DeleteTensorHandle(hgpu); + } + + TFE_DeleteTensorHandle(hcpu); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteExecutor(executor); + TFE_DeleteContext(ctx); +} + +TEST(CAPI, TensorHandleDefaults) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status.get()); + TFE_DeleteContextOptions(opts); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + TFE_TensorHandle* h_default = TestMatrixTensorHandle(ctx); + const char* device_type = TFE_TensorHandleDeviceType(h_default, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(device_type, "CPU")) << device_type; + int device_id = TFE_TensorHandleDeviceID(h_default, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_EQ(0, device_id) << device_id; + + TFE_TensorHandle* h_cpu = TFE_TensorHandleCopyToDevice( + h_default, ctx, "/device:CPU:0", status.get()); + const char* device_type_cpu = TFE_TensorHandleDeviceType(h_cpu, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_TRUE(absl::StrContains(device_type_cpu, "CPU")) << device_type_cpu; + int device_id_cpu = TFE_TensorHandleDeviceID(h_cpu, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + ASSERT_EQ(0, device_id_cpu) << device_id_cpu; + + TFE_DeleteTensorHandle(h_default); + TFE_DeleteTensorHandle(h_cpu); + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TFE_DeleteExecutor(executor); + TFE_DeleteContext(ctx); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 4d9be0c2501..356476c2186 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -32,7 +32,6 @@ struct TFE_ContextOptions { bool async = false; TFE_ContextDevicePlacementPolicy device_placement_policy{ TFE_DEVICE_PLACEMENT_SILENT}; - TFE_ContextMirroringPolicy 
mirroring_policy{TFE_MIRRORING_NONE}; // If true, lazily copy the remote inputs of a function to the target devices. bool lazy_remote_inputs_copy = true; // If true, use TFRT backend diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 724176505ba..fd208c6770d 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -20,6 +20,7 @@ limitations under the License. #include // clang-format off +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/platform/platform.h" // clang-format on @@ -876,89 +877,6 @@ TEST(CAPI, Execute_Min_CPU) { TF_DeleteStatus(status); } -#ifdef TENSORFLOW_EAGER_USE_XLA -void Execute_MatMul_XLA_CPU(bool async) { - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_Context* ctx = TFE_NewContext(opts, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_TensorHandle* m = TestMatrixTensorHandle(ctx); - TFE_Op* matmul = MatMulOp(ctx, m, m); - - TFE_OpSetXLACompilation(matmul, true); - - TFE_TensorHandle* retvals[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(matmul, &retvals[0], &num_retvals, status); - // Running a primitive TF operator via XLA is not yet supported. - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_DeleteOp(matmul); - TFE_DeleteTensorHandle(m); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - EXPECT_EQ(1, num_retvals); - - TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); - TFE_DeleteTensorHandle(retvals[0]); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - float product[4] = {0}; - EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); - memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(7, product[0]); - EXPECT_EQ(10, product[1]); - EXPECT_EQ(15, product[2]); - EXPECT_EQ(22, product[3]); - TFE_DeleteContext(ctx); - TF_DeleteStatus(status); -} -TEST(CAPI, Execute_MatMul_XLA_CPU) { Execute_MatMul_XLA_CPU(false); } -TEST(CAPI, Execute_MatMul_XLA_CPUAsync) { Execute_MatMul_XLA_CPU(true); } - -void Execute_Min_XLA_CPU(bool async) { - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_Context* ctx = TFE_NewContext(opts, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_TensorHandle* input = TestMatrixTensorHandle(ctx); - TFE_TensorHandle* axis = TestAxisTensorHandle(ctx); - TFE_Op* minOp = MinOp(ctx, input, axis); - - TFE_OpSetXLACompilation(minOp, true); - - TFE_TensorHandle* retvals[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(minOp, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteOp(minOp); - TFE_DeleteTensorHandle(input); - TFE_DeleteTensorHandle(axis); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - ASSERT_EQ(1, num_retvals); - - TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); - TFE_DeleteTensorHandle(retvals[0]); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - float output[2] = {0}; - EXPECT_EQ(sizeof(output), TF_TensorByteSize(t)); - memcpy(&output[0], TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(1, output[0]); - EXPECT_EQ(3, output[1]); - TFE_DeleteContext(ctx); - TF_DeleteStatus(status); -} 
-TEST(CAPI, Execute_Min_XLA_CPU) { Execute_Min_XLA_CPU(false); } -TEST(CAPI, Execute_Min_XLA_CPUAsync) { Execute_Min_XLA_CPU(true); } -#endif // TENSORFLOW_EAGER_USE_XLA - void ExecuteWithTracing(bool async) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -1274,6 +1192,68 @@ TEST(CAPI, StringAttributes) { TF_DeleteStatus(status); } +// Same test as above, except it uses SetOpAttrValueScalar to set attrs. +TEST(CAPI, TestTFE_SetOpAttrs) { + // Test that attributes set via SetOpAttrValueScalar are applied to the op + // as expected. + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::vector dims(4, 1); + TFE_Op* op = TFE_NewOp(ctx, "AvgPool", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* tensor = + TF_AllocateTensor(TF_FLOAT, dims.data(), dims.size(), sizeof(float)); + float tensor_data[] = {1}; + memcpy(TF_TensorData(tensor), tensor_data, TF_TensorByteSize(tensor)); + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandle(tensor, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, tensor_handle, status); + TF_DeleteTensor(tensor); + TFE_DeleteTensorHandle(tensor_handle); + + tensorflow::AttrValue i_list_values; + for (int i = 0; i < 4; ++i) { + i_list_values.mutable_list()->add_i(1); + } + SetOpAttrValueScalar(ctx, op, i_list_values, "ksize", status); + SetOpAttrValueScalar(ctx, op, i_list_values, "strides", status); + + tensorflow::AttrValue padding_value; + *padding_value.mutable_s() = "VALID"; + tensorflow::SetOpAttrValueScalar(ctx, op, padding_value, "padding", status); + + tensorflow::AttrValue data_format_value; + *data_format_value.mutable_s() = "NHWC"; + tensorflow::SetOpAttrValueScalar(ctx, op, data_format_value, "data_format", + status); + + TFE_OpSetAttrType(op, "T", TF_FLOAT); + + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(op, &retvals[0], &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + + tensor = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ(4, TF_TensorByteSize(tensor)); + TF_DeleteTensor(tensor); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(op); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); +} + TEST(CAPI, TestTFE_TensorHandleCopySharingUnderlyingTensorHandle) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -1620,4 +1600,91 @@ TEST(CAPI, TestTFE_OpAttrsSerialize) { TFE_DeleteContext(ctx); } +// Needs to work with a const TFE_Op since custom devices should not modify the +// op they are called with.
+TFE_Op* CloneOp(const TFE_Op* other) { + TF_Status* status = TF_NewStatus(); + TFE_Context* context = TFE_OpGetContext(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char* op_name = TFE_OpGetName(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Op* ret = TFE_NewOp(context, op_name, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char* device = TFE_OpGetDevice(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetDevice(ret, device, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddAttrs(ret, TFE_OpGetAttrs(other)); + int num_inputs = TFE_OpGetFlatInputCount(other, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + for (int input_index = 0; input_index < num_inputs; ++input_index) { + TFE_TensorHandle* input = TFE_OpGetFlatInput(other, input_index, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(ret, input, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + } + TF_DeleteStatus(status); + return ret; +} + +TEST(CAPI, TestTFE_OpRecreation) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + // Clone an op with attributes and a device set. + TFE_Op* original_var_op = TFE_NewOp(ctx, "VarHandleOp", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(original_var_op, "dtype", TF_INT64); + TFE_OpSetAttrShape(original_var_op, "shape", {}, 0, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ("", std::string(TFE_OpGetDevice(original_var_op, status))); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetDevice(original_var_op, + "/job:localhost/replica:0/task:0/device:CPU:0", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Op* cloned = CloneOp(original_var_op); + + EXPECT_EQ("/job:localhost/replica:0/task:0/device:CPU:0", + std::string(TFE_OpGetDevice(cloned, status))); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + EXPECT_EQ("VarHandleOp", std::string(TFE_OpGetName(cloned, status))); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + int num_retvals = 1; + TFE_TensorHandle* ret; + TFE_Execute(cloned, &ret, &num_retvals, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(ret); + + // Clone an op with inputs and no device set. 
+ TFE_TensorHandle* input1 = TestMatrixTensorHandle(ctx); + TFE_TensorHandle* input2 = TestMatrixTensorHandle(ctx); + TFE_Op* original_identity = TFE_NewOp(ctx, "IdentityN", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_TensorHandle* inputs[] = {input1, input2}; + TFE_OpAddInputList(original_identity, inputs, 2, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Op* cloned_identity = CloneOp(original_identity); + EXPECT_EQ("", std::string(TFE_OpGetDevice(cloned_identity, status))); + TFE_TensorHandle* identity_ret[] = {nullptr, nullptr}; + num_retvals = 2; + TFE_Execute(cloned_identity, identity_ret, &num_retvals, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_DeleteTensorHandle(input1); + TFE_DeleteTensorHandle(input2); + TFE_DeleteTensorHandle(identity_ret[0]); + TFE_DeleteTensorHandle(identity_ret[1]); + + TFE_DeleteOp(cloned_identity); + TFE_DeleteOp(original_identity); + TFE_DeleteOp(original_var_op); + TFE_DeleteOp(cloned); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + } // namespace diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index fd68866f502..6eb5b521c50 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -17,12 +17,16 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/tstring.h" #include "tensorflow/core/protobuf/cluster.pb.h" using tensorflow::string; +using tensorflow::tstring; TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, float value) { float data[] = {value}; @@ -36,6 +40,19 @@ TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, float value) { return th; } +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, + const tensorflow::tstring& value) { + TF_Status* status = TF_NewStatus(); + TF_Tensor* t = TFE_AllocateHostTensor(ctx, TF_STRING, nullptr, 0, status); + tstring* data = static_cast(TF_TensorData(t)); + *data = value; + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, int value) { int data[] = {value}; TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 2f77ae5cf44..ad0c7c6340f 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -16,6 +16,7 @@ limitations under the License. 
#define TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ #include "tensorflow/c/eager/c_api.h" +#include "tensorflow/core/platform/tstring.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" @@ -28,6 +29,10 @@ TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, int value); // Return a tensor handle containing a bool scalar TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, bool value); +// Return a tensor handle containing a tstring scalar +TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, + const tensorflow::tstring& value); + // Return a tensor handle containing a 2x2 matrix of doubles TFE_TensorHandle* DoubleTestMatrixTensorHandle(TFE_Context* ctx); diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 8408f7ef60f..2d290df19ce 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -39,7 +39,7 @@ static FactoriesMap& GetFactories() { return *factories; } -static const char* default_factory = ""; +static tracing::FactoryFunction default_factory; void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { assert((!GetFactories().count(name)) || @@ -48,15 +48,15 @@ void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { GetFactories()[name] = factory; } -void SetDefaultTracingEngine(const char* name) { default_factory = name; } - -static TracingContext* CreateTracingExecutionContext(const char* fn_name, - TF_Status* s) { - auto entry = GetFactories().find(default_factory); - if (entry != GetFactories().end()) return entry->second(fn_name, s); +Status SetDefaultTracingEngine(const char* name) { + auto entry = GetFactories().find(name); + if (entry != GetFactories().end()) { + default_factory = GetFactories().find(name)->second; + return Status::OK(); + } string msg = absl::StrCat( - "No tracing engine factory has been registered with the key '", - default_factory, "' (available: "); + "No tracing engine factory has been registered with the key '", name, + "' (available: "); // Ensure deterministic (sorted) order in the error message std::set factories_sorted; for (const auto& factory : GetFactories()) @@ -68,7 +68,16 @@ static TracingContext* CreateTracingExecutionContext(const char* fn_name, } msg += ")"; - TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return errors::InvalidArgument(msg.c_str()); +} + +static TracingContext* CreateTracingExecutionContext(const char* fn_name, + TF_Status* s) { + if (default_factory) { + return default_factory(fn_name, s); + } + Set_TF_Status_from_Status( + s, errors::FailedPrecondition("default_factory is nullptr")); return nullptr; } @@ -99,8 +108,8 @@ using tensorflow::tracing::TracingContext; using tensorflow::tracing::TracingOperation; using tensorflow::tracing::TracingTensorHandle; -void TF_SetTracingImplementation(const char* name) { - SetDefaultTracingEngine(name); +void TF_SetTracingImplementation(const char* name, TF_Status* s) { + Set_TF_Status_from_Status(s, SetDefaultTracingEngine(name)); } // Creates a new TensorFlow function, it is an execution context attached to a diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index b66869b4290..d216b4e694b 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -52,7 +52,7 @@ typedef struct TF_AbstractFunction TF_AbstractFunction; // This 
allows the client to swap the implementation of the tracing engine. // Any future call to TF_CreateFunction will use the implementation defined // here. -void TF_SetTracingImplementation(const char* name); +void TF_SetTracingImplementation(const char* name, TF_Status*); // Creates a new TensorFlow function. A Function is an execution context, and as // such it can trace operations through TF_ExecuteOperation. After completing diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 9d064039141..0e9d6c18157 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -365,9 +365,10 @@ class GraphContext : public TracingContext { } auto s = TF_NewStatus(); - func->func = TF_GraphToFunction( - graph_.get(), name_, 0, -1, nullptr, inputs_.size(), inputs_.data(), - graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_, s); + func->func = TF_GraphToFunction(graph_.get(), name_.data(), 0, -1, nullptr, + inputs_.size(), inputs_.data(), + graph_outputs.size(), graph_outputs.data(), + nullptr, nullptr, name_.data(), s); TF_RETURN_IF_ERROR(StatusFromTF_Status(s)); TF_DeleteStatus(s); *f = func.release(); @@ -391,7 +392,7 @@ class GraphContext : public TracingContext { private: std::unique_ptr graph_; std::vector inputs_; - const char* name_; + string name_; }; static TracingContext* GraphTracingFactory(const char* name, TF_Status* s) { @@ -401,7 +402,7 @@ static TracingContext* GraphTracingFactory(const char* name, TF_Status* s) { // Register the tracing implemented in this file as the default tracing engine. static bool register_tracing = [] { RegisterTracingEngineFactory("graphdef", GraphTracingFactory); - SetDefaultTracingEngine("graphdef"); + SetDefaultTracingEngine("graphdef").IgnoreError(); return true; }(); diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h index c00e04d98af..9433fe8f120 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_internal.h +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -120,7 +120,7 @@ class TracingContext : public AbstractContext { }; typedef TracingContext* (*FactoryFunction)(const char* fn_name, TF_Status*); -void SetDefaultTracingEngine(const char* name); +Status SetDefaultTracingEngine(const char* name); void RegisterTracingEngineFactory(const ::tensorflow::string& name, FactoryFunction factory); } // namespace tracing diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 7b3a497a0c5..432ddb4b2d4 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -22,10 +22,15 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" +using tensorflow::Status; using tensorflow::string; +using tensorflow::TF_StatusPtr; namespace tensorflow { namespace { @@ -37,7 +42,10 @@ class UnifiedCAPI : public ::testing::TestWithParam> { protected: void SetUp() override { - TF_SetTracingImplementation(std::get<0>(GetParam())); + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); } }; diff --git a/tensorflow/c/eager/custom_device_test.cc b/tensorflow/c/eager/custom_device_test.cc index 1c078d4f42c..b058c79a17b 100644 --- a/tensorflow/c/eager/custom_device_test.cc +++ b/tensorflow/c/eager/custom_device_test.cc @@ -36,7 +36,8 @@ TEST(CUSTOM_DEVICE, RegisterSimpleDevice) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context, name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context, name, /*strict_scope_placement=*/true, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); TFE_TensorHandle* hcpu = TestMatrixTensorHandle(context); ASSERT_FALSE(arrived); @@ -73,7 +74,8 @@ TEST(CUSTOM_DEVICE, ResetOperation) { bool executed = false; const char* custom_device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), custom_device_name, &arrived, &executed, + RegisterLoggingDevice(context.get(), custom_device_name, + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); @@ -103,7 +105,8 @@ TEST(CUSTOM_DEVICE, MakeVariable) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/true, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a variable handle placed on the custom device. @@ -187,7 +190,8 @@ TEST(CUSTOM_DEVICE, AccessVariableOnCustomDevice) { bool arrived = false; bool executed = false; const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/false, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // Create a variable handle placed on the custom device. 
@@ -264,10 +268,12 @@ TEST(CUSTOM_DEVICE, InputBasedPlacement) { const char* custom1 = "/job:localhost/replica:0/task:0/device:CUSTOM:1"; bool arrived = false; bool executed = false; - RegisterLoggingDevice(context.get(), custom0, &arrived, &executed, + RegisterLoggingDevice(context.get(), custom0, + /*strict_scope_placement=*/false, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - RegisterLoggingDevice(context.get(), custom1, &arrived, &executed, + RegisterLoggingDevice(context.get(), custom1, + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); @@ -314,14 +320,34 @@ TEST(CUSTOM_DEVICE, InputBasedPlacement) { ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom0)); ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom1)); - // Custom device: mix of custom/physical fails. + // Custom device: mix of custom/physical places the op on the custom device. matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcpu.get())); num_retvals = 1; + executed = false; TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); - ASSERT_NE(TF_OK, TF_GetCode(status.get())); - ASSERT_TRUE(absl::StrContains(TF_Message(status.get()), custom0)); - ASSERT_TRUE( - absl::StrContains(TF_Message(status.get()), "[]")); // kVariantDeviceNull + EXPECT_TRUE(executed); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_DeleteTensorHandle(retval); + + // Explicit placement still forces the op onto the requested device + matmul.reset(MatMulOp(context.get(), hcustom0.get(), hcpu.get())); + TFE_OpSetDevice(matmul.get(), "/job:localhost/replica:0/task:0/device:CPU:0", + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + num_retvals = 1; + executed = false; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + EXPECT_FALSE(executed); + ASSERT_FALSE(TF_GetCode(status.get()) == TF_OK); + + // Custom devices can refuse to do type-based dispatch (as hcustom1 is + // configured to do) + matmul.reset(MatMulOp(context.get(), hcustom1.get(), hcpu.get())); + num_retvals = 1; + executed = false; + TFE_Execute(matmul.get(), &retval, &num_retvals, status.get()); + EXPECT_FALSE(executed); + ASSERT_FALSE(TF_GetCode(status.get()) == TF_OK); } TEST(CUSTOM_DEVICE, InvalidRegistrationError) { @@ -334,21 +360,24 @@ TEST(CUSTOM_DEVICE, InvalidRegistrationError) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); bool arrived = false; bool executed = false; - RegisterLoggingDevice(context.get(), "/device:CUSTOM:0", &arrived, &executed, + RegisterLoggingDevice(context.get(), "/device:CUSTOM:0", + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_INVALID_ARGUMENT) << TF_Message(status.get()); const char* name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); + RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/true, + &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - RegisterLoggingDevice(context.get(), name, &arrived, &executed, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) - << TF_Message(status.get()); - - RegisterLoggingDevice(context.get(), - "/job:localhost/replica:0/task:0/device:CPU:0", + 
RegisterLoggingDevice(context.get(), name, /*strict_scope_placement=*/true, &arrived, &executed, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) << TF_Message(status.get()); + + RegisterLoggingDevice( + context.get(), "/job:localhost/replica:0/task:0/device:CPU:0", + /*strict_scope_placement=*/true, &arrived, &executed, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_ALREADY_EXISTS) + << TF_Message(status.get()); } diff --git a/tensorflow/c/eager/custom_device_testutil.cc b/tensorflow/c/eager/custom_device_testutil.cc index 28de3665653..014abe38368 100644 --- a/tensorflow/c/eager/custom_device_testutil.cc +++ b/tensorflow/c/eager/custom_device_testutil.cc @@ -33,6 +33,9 @@ struct LoggingDevice { bool* arrived_flag; // Set to true whenever an operation is executed bool* executed_flag; + // If true, only explicit op placements are accepted. If false, uses + // type-based dispatch. + bool strict_scope_placement; }; struct LoggedTensor { @@ -84,18 +87,35 @@ TFE_TensorHandle* CopyTensorFromLoggingDevice(TFE_Context* context, return nullptr; } -void LoggingDeviceExecute(TFE_Context* context, int num_inputs, - TFE_TensorHandle** inputs, const char* operation_name, - const TFE_OpAttrs* attributes, int* num_outputs, +void LoggingDeviceExecute(const TFE_Op* original_op, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* s, void* device_info) { + const char* requested_placement = TFE_OpGetDevice(original_op, s); + if (TF_GetCode(s) != TF_OK) return; + LoggingDevice* dev = reinterpret_cast(device_info); + if (dev->strict_scope_placement && *requested_placement == '\0') { + TF_SetStatus(s, TF_INTERNAL, + "Ops must be placed on the device explicitly, or their inputs " + "first copied to other devices."); + return; + } + TFE_Context* context = TFE_OpGetContext(original_op, s); + if (TF_GetCode(s) != TF_OK) return; + const char* operation_name = TFE_OpGetName(original_op, s); + if (TF_GetCode(s) != TF_OK) return; + const TFE_OpAttrs* attributes = TFE_OpGetAttrs(original_op); + TFE_Op* op(TFE_NewOp(context, operation_name, s)); if (TF_GetCode(s) != TF_OK) return; TFE_OpAddAttrs(op, attributes); TFE_OpSetDevice(op, dev->underlying_device.c_str(), s); + if (TF_GetCode(s) != TF_OK) return; + int num_inputs = TFE_OpGetFlatInputCount(original_op, s); + if (TF_GetCode(s) != TF_OK) return; for (int j = 0; j < num_inputs; ++j) { - TFE_TensorHandle* input = inputs[j]; + TFE_TensorHandle* input = TFE_OpGetFlatInput(original_op, j, s); + if (TF_GetCode(s) != TF_OK) return; const char* input_device = TFE_TensorHandleDeviceName(input, s); if (TF_GetCode(s) != TF_OK) return; if (dev->device_name == input_device) { @@ -131,8 +151,8 @@ void DeleteLoggingDevice(void* device_info) { } // namespace void RegisterLoggingDevice(TFE_Context* context, const char* name, - bool* arrived_flag, bool* executed_flag, - TF_Status* status) { + bool strict_scope_placement, bool* arrived_flag, + bool* executed_flag, TF_Status* status) { TFE_CustomDevice custom_device; custom_device.copy_tensor_to_device = &CopyToLoggingDevice; custom_device.copy_tensor_from_device = &CopyTensorFromLoggingDevice; @@ -143,6 +163,7 @@ void RegisterLoggingDevice(TFE_Context* context, const char* name, device->executed_flag = executed_flag; device->device_name = name; device->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + device->strict_scope_placement = strict_scope_placement; TFE_RegisterCustomDevice(context, custom_device, name, device, status); } @@ -168,5 +189,6 @@ void 
AllocateLoggingDevice(const char* name, bool* arrived_flag, logging_device->device_name = name; logging_device->underlying_device = "/job:localhost/replica:0/task:0/device:CPU:0"; + logging_device->strict_scope_placement = true; *device_info = reinterpret_cast(logging_device); } diff --git a/tensorflow/c/eager/custom_device_testutil.h b/tensorflow/c/eager/custom_device_testutil.h index 509df7d3e3e..a7c60080adf 100644 --- a/tensorflow/c/eager/custom_device_testutil.h +++ b/tensorflow/c/eager/custom_device_testutil.h @@ -25,8 +25,8 @@ limitations under the License. #include "tensorflow/c/tf_status.h" void RegisterLoggingDevice(TFE_Context* context, const char* name, - bool* arrived_flag, bool* executed_flag, - TF_Status* status); + bool strict_scope_placement, bool* arrived_flag, + bool* executed_flag, TF_Status* status); void AllocateLoggingDevice(const char* name, bool* arrived_flag, bool* executed_flag, TFE_CustomDevice** device, void** device_info); diff --git a/tensorflow/c/eager/dlpack.cc b/tensorflow/c/eager/dlpack.cc index 45048bd6efb..df8e9ace997 100644 --- a/tensorflow/c/eager/dlpack.cc +++ b/tensorflow/c/eager/dlpack.cc @@ -109,7 +109,8 @@ DLDataType GetDlDataType(TF_DataType data_type, TF_Status* status) { // Gets DLPack's DLContext from eager tensor handle. DLContext GetDlContext(TFE_TensorHandle* h, TF_Status* status) { DLContext ctx; - const char* device_name = tensorflow::unwrap(h)->DeviceName(&status->status); + const char* device_name = + tensorflow::unwrap(h)->BackingDeviceName(&status->status); DeviceNameUtils::ParsedName parsed_name; tensorflow::DeviceNameUtils::ParseFullName(device_name, &parsed_name); std::string device_type = parsed_name.type; @@ -248,21 +249,36 @@ void TFE_CallDLManagedTensorDeleter(void* dlm_ptr) { } void* TFE_HandleToDLPack(TFE_TensorHandle* h, TF_Status* status) { + auto tf_dlm_context = GetDlContext(h, status); + if (!status->status.ok()) { + return nullptr; + } + + auto* tf_dlm_data = TFE_TensorHandleDevicePointer(h, status); + if (!status->status.ok()) { + return nullptr; + } + const Tensor* tensor = GetTensorFromHandle(h, status); TF_DataType data_type = static_cast(tensor->dtype()); - TensorReference tensor_ref(*tensor); // This will call buf_->Ref() + auto tf_dlm_type = GetDlDataType(data_type, status); + if (!status->status.ok()) { + return nullptr; + } + + TensorReference tensor_ref(*tensor); // This will call buf_->Ref() auto* tf_dlm_tensor_ctx = new TfDlManagedTensorCtx(tensor_ref); tf_dlm_tensor_ctx->reference = tensor_ref; DLManagedTensor* dlm_tensor = &tf_dlm_tensor_ctx->tensor; dlm_tensor->manager_ctx = tf_dlm_tensor_ctx; dlm_tensor->deleter = &DLManagedTensorDeleter; - dlm_tensor->dl_tensor.ctx = GetDlContext(h, status); + dlm_tensor->dl_tensor.ctx = tf_dlm_context; int ndim = tensor->dims(); dlm_tensor->dl_tensor.ndim = ndim; - dlm_tensor->dl_tensor.data = TFE_TensorHandleDevicePointer(h, status); - dlm_tensor->dl_tensor.dtype = GetDlDataType(data_type, status); + dlm_tensor->dl_tensor.data = tf_dlm_data; + dlm_tensor->dl_tensor.dtype = tf_dlm_type; std::vector* shape_arr = &tf_dlm_tensor_ctx->shape; std::vector* stride_arr = &tf_dlm_tensor_ctx->strides; @@ -275,13 +291,14 @@ void* TFE_HandleToDLPack(TFE_TensorHandle* h, TF_Status* status) { (*stride_arr)[i] = (*shape_arr)[i + 1] * (*stride_arr)[i + 1]; } - dlm_tensor->dl_tensor.shape = &(*shape_arr)[0]; + dlm_tensor->dl_tensor.shape = shape_arr->data(); // There are two ways to represent compact row-major data // 1) nullptr indicates tensor is compact and row-majored. 
// 2) fill in the strides array as the real case for compact row-major data. // Here we choose option 2, since some frameworks didn't handle the strides // argument properly. - dlm_tensor->dl_tensor.strides = &(*stride_arr)[0]; + dlm_tensor->dl_tensor.strides = stride_arr->data(); + dlm_tensor->dl_tensor.byte_offset = 0; // TF doesn't handle the strides and byte_offsets here return static_cast(dlm_tensor); diff --git a/tensorflow/c/eager/gradient_checker.cc b/tensorflow/c/eager/gradient_checker.cc new file mode 100644 index 00000000000..640edc7228a --- /dev/null +++ b/tensorflow/c/eager/gradient_checker.cc @@ -0,0 +1,201 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/gradient_checker.h" + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace gradients { + +using namespace std; + +// ================== Helper functions ================= + +// Fills data with values [start,end) with given step size. +void Range(vector* data, int start, int end, int step = 1) { + for (int i = start; i < end; i += step) { + (*data)[i] = i; + } +} + +// Returns AbstractTensorHandlePtr containing [0, ..., n-1]. +AbstractTensorHandlePtr GetRangeTensorHandleUtil(AbstractContext* ctx, int n) { + vector vals(n); + int64_t vals_shape[] = {n}; + Range(&vals, 0, n); + AbstractTensorHandlePtr r = + GetTensorHandleUtilInt(ctx, vals.data(), vals_shape, 1); + return r; +} + +// Fills out_dims with the dimensions of the given tensor. +void GetDims(const TF_Tensor* t, int64_t* out_dims) { + int num_dims = TF_NumDims(t); + for (int i = 0; i < num_dims; i++) { + out_dims[i] = TF_Dim(t, i); + } +} + +// Runs model as is if output is a scalar, +// else sums the output tensor before returning. +Status RunAndMaybeSum(AbstractContext* ctx, Model forward, + absl::Span inputs, + absl::Span outputs, + bool use_function) { + GradientRegistry registry; + std::vector model_outputs(1); + + // Run the model. 
+ TF_RETURN_IF_ERROR(RunModel(forward, ctx, inputs, + absl::MakeSpan(model_outputs), use_function, + registry)); + AbstractTensorHandle* model_out = model_outputs[0]; + + TF_Tensor* model_out_tensor; + TF_RETURN_IF_ERROR(GetValue(model_out, &model_out_tensor)); + int num_dims_out = TF_NumDims(model_out_tensor); + + // If the output is a scalar, then return the scalar output + if (num_dims_out == 0) { + outputs[0] = model_out; + return Status::OK(); + } + + // Else, reduce sum the output to get a scalar + + // Will sum all dimensions, so get a Tensor containing [0,...,num_dims_out-1]. + AbstractTensorHandlePtr sum_dims = + GetRangeTensorHandleUtil(ctx, num_dims_out); + + // Reduce sum the output on all dimensions. + std::vector sum_inputs(2); + sum_inputs[0] = model_out; + sum_inputs[1] = sum_dims.get(); + + TF_RETURN_IF_ERROR(ops::Sum(ctx, absl::MakeSpan(sum_inputs), + absl::MakeSpan(model_outputs), "sum_output")); + outputs[0] = model_outputs[0]; + return Status::OK(); +} +// ========================= End Helper Functions============================== + +Status CalcNumericalGrad(AbstractContext* ctx, Model forward, + absl::Span inputs, + int input_index, bool use_function, + AbstractTensorHandle** numerical_grad) { + AbstractTensorHandle* theta = + inputs[input_index]; // parameter we are grad checking + + // Convert from AbstractTensor to TF_Tensor. + TF_Tensor* theta_tensor; + TF_RETURN_IF_ERROR(GetValue(theta, &theta_tensor)); + + // Get number of elements and fill data. + int num_elems = TF_TensorElementCount(theta_tensor); + vector theta_data(num_elems); + memcpy(theta_data.data(), TF_TensorData(theta_tensor), + TF_TensorByteSize(theta_tensor)); + + // Initialize space for the numerical gradient. + vector dtheta_approx(num_elems); + + // Get theta shape and store in theta_dims. + int num_dims = TF_NumDims(theta_tensor); + vector theta_dims(num_dims); + GetDims(theta_tensor, theta_dims.data()); + + // Initialize auxilary data structures. + vector thetaPlus_data(num_elems); + vector thetaMinus_data(num_elems); + std::vector f_outputs(1); + + // Numerical Grad Check + for (int i = 0; i < num_elems; i++) { + // Get relative epsilon value + float epsilon = + std::abs(theta_data[i] * 1e-4 + 1e-4); // add 1e-4 to prevent div by 0 + AbstractTensorHandlePtr two_eps = + GetScalarTensorHandleUtil(ctx, 2 * epsilon); + + // Initialize theta[i] + epsilon. + memcpy(thetaPlus_data.data(), TF_TensorData(theta_tensor), + TF_TensorByteSize(theta_tensor)); + thetaPlus_data[i] += epsilon; + AbstractTensorHandlePtr thetaPlus = GetTensorHandleUtilFloat( + ctx, thetaPlus_data.data(), theta_dims.data(), num_dims); + + // Initialize theta[i] - epsilon. + memcpy(&thetaMinus_data[0], TF_TensorData(theta_tensor), + TF_TensorByteSize(theta_tensor)); + thetaMinus_data[i] -= epsilon; + AbstractTensorHandlePtr thetaMinus = GetTensorHandleUtilFloat( + ctx, thetaMinus_data.data(), theta_dims.data(), num_dims); + + // Get f(theta + eps): + inputs[input_index] = thetaPlus.get(); + TF_RETURN_IF_ERROR(RunAndMaybeSum(ctx, forward, inputs, + absl::MakeSpan(f_outputs), use_function)); + AbstractTensorHandle* fPlus = f_outputs[0]; + + // Get f(theta - eps): + inputs[input_index] = thetaMinus.get(); + TF_RETURN_IF_ERROR(RunAndMaybeSum(ctx, forward, inputs, + absl::MakeSpan(f_outputs), use_function)); + AbstractTensorHandle* fMinus = f_outputs[0]; + + // Take Difference of both estimates: (f(theta + eps) - f(theta - eps)). 
+ TF_RETURN_IF_ERROR( + ops::Sub(ctx, {fPlus, fMinus}, absl::MakeSpan(f_outputs), "sub_top")); + AbstractTensorHandle* fDiff = f_outputs[0]; + + // Calculate using the difference quotient definition: + // (f(theta + eps) - f(theta - eps)) / (2 * eps). + TF_RETURN_IF_ERROR(ops::DivNoNan(ctx, {fDiff, two_eps.get()}, + absl::MakeSpan(f_outputs), + "diff_quotient")); + AbstractTensorHandle* diff_quotient = f_outputs[0]; + + TF_Tensor* grad_tensor; + TF_RETURN_IF_ERROR(GetValue(diff_quotient, &grad_tensor)); + float grad_data[1]; + memcpy(&grad_data[0], TF_TensorData(grad_tensor), + TF_TensorByteSize(grad_tensor)); + + dtheta_approx[i] = grad_data[0]; + } + + // Populate *numerical_grad with the data from dtheta_approx. + TF_RETURN_IF_ERROR(TensorHandleWithDimsFloat( + ctx, dtheta_approx.data(), theta_dims.data(), num_dims, numerical_grad)); + return Status::OK(); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/gradient_checker.h b/tensorflow/c/eager/gradient_checker.h new file mode 100644 index 00000000000..8497f5af48e --- /dev/null +++ b/tensorflow/c/eager/gradient_checker.h @@ -0,0 +1,53 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace gradients { + +/* Returns numerical grad inside `dtheta_approx` given `forward` model and + * parameter specified by `input_index`. + * + * I.e. if y = and w = inputs[input_index], + * this will calculate dy/dw numerically. + * + * `use_function` indicates whether to use graph mode(true) or eager(false). + * + * `numerical_grad` is the pointer to the AbstractTensorHandle* which will + * hold the numerical gradient data at the end of the function. 
+ */ +Status CalcNumericalGrad(AbstractContext* ctx, Model forward, + absl::Span inputs, + int input_index, bool use_function, + AbstractTensorHandle** numerical_grad); + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/gradient_checker_test.cc b/tensorflow/c/eager/gradient_checker_test.cc new file mode 100644 index 00000000000..7a438085fb5 --- /dev/null +++ b/tensorflow/c/eager/gradient_checker_test.cc @@ -0,0 +1,265 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/gradient_checker.h" + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/eager/mnist_gradients_testutil.h" +#include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace gradients { +namespace internal { +namespace { + +class GradientCheckerTest + : public ::testing::TestWithParam> { + protected: + void SetUp() override { + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); + } +}; + +Status RegisterGradients(GradientRegistry* registry) { + TF_RETURN_IF_ERROR(registry->Register("MatMul", MatMulRegisterer)); + TF_RETURN_IF_ERROR( + registry->Register("SparseSoftmaxCrossEntropyWithLogits", + SparseSoftmaxCrossEntropyWithLogitsRegisterer)); + return Status::OK(); +} + +TEST_P(GradientCheckerTest, TestGradCheckMatMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + float A_vals[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int64_t A_dims[] = {2, 2}; + float B_vals[] = {.5f, -1.0f, 1.0f, 1.0f}; + int64_t B_dims[] = {2, 2}; + int num_dims = 2; + + AbstractTensorHandlePtr A = + GetTensorHandleUtilFloat(ctx.get(), A_vals, A_dims, num_dims); + AbstractTensorHandlePtr B = + GetTensorHandleUtilFloat(ctx.get(), B_vals, B_dims, num_dims); + + std::vector inputs; + inputs.push_back(A.get()); + inputs.push_back(B.get()); 
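+  // [Editorial annotation, not part of the patch] The quantity checked here is
+  // f(A, B) = reduce_sum(matmul(A, B)) once RunAndMaybeSum collapses the 2x2
+  // product to a scalar, so analytically df/dA[i][j] = sum_k B[j][k], the j-th
+  // row-sum of B. With B = [[0.5, -1], [1, 1]] the row sums are {-0.5, 2},
+  // which is where the expected_dA values {-0.5, 2, -0.5, 2} asserted below
+  // come from.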
+ + AbstractTensorHandle* grad_approx; + Status s = CalcNumericalGrad( + ctx.get(), MatMulModel, absl::MakeSpan(inputs), /*input_index=*/0, + /*use_function=*/!std::get<2>(GetParam()), &grad_approx); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* gt; + s = GetValue(grad_approx, >); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + float result_data[4] = {0}; + memcpy(&result_data[0], TF_TensorData(gt), TF_TensorByteSize(gt)); + + float expected_dA[4] = {-.5f, 2.0f, -.5f, 2.0f}; + float tolerance = 1e-2; + for (int j = 0; j < 4; j++) { + ASSERT_NEAR(expected_dA[j], result_data[j], tolerance); + } + TF_DeleteTensor(gt); +} + +TEST_P(GradientCheckerTest, TestGradCheckMul) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = ScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + AbstractTensorHandlePtr y; + { + AbstractTensorHandle* y_raw = nullptr; + Status s = ScalarTensorHandle(ctx.get(), 7.0f, &y_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + y.reset(y_raw); + } + + // Will perform z = x*y. + // dz/dx = y + + std::vector inputs; + inputs.push_back(x.get()); + inputs.push_back(y.get()); + AbstractTensorHandle* g; + + Status s = CalcNumericalGrad(ctx.get(), MulModel, absl::MakeSpan(inputs), + /*input_index=*/0, + /*use_function=*/!std::get<2>(GetParam()), &g); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* gt; + s = GetValue(g, >); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + float result_data[1] = {0}; + memcpy(&result_data[0], TF_TensorData(gt), TF_TensorByteSize(gt)); + + ASSERT_NEAR(result_data[0], 7.0f, /*abs_error=*/1e-2); + TF_DeleteTensor(gt); +} + +TEST_P(GradientCheckerTest, TestGradCheckSoftmax) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. + GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + + /** Test to show how to use this API with analytical gradients: + * + * We have `SoftmaxLossGradModel`, which is a wrapper for the + * Softmax analytical gradient found in c/experimental/nn_grads. + * + * We will use the GradientChecker by applying finite differences + * to the forward pass wrapped in `SoftmaxModel` and verify that + * both the analytical and numerical gradients are relatively + * close. 
+ * + */ + + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + // X = scores + float X_vals[] = {1.0f, 2.0f, 3.0f, -5.0f, -4.0f, -3.0f, 2.0f, 0.0f, 1.0f}; + int64_t X_dims[] = {3, 3}; + int num_dims = 2; + AbstractTensorHandlePtr X = + GetTensorHandleUtilFloat(ctx.get(), X_vals, X_dims, num_dims); + + // y = labels + int y_vals[] = {1, 0, 1}; + int64_t y_dims[] = {3}; + num_dims = sizeof(y_dims) / sizeof(y_dims[0]); + AbstractTensorHandlePtr y = + GetTensorHandleUtilInt(ctx.get(), y_vals, y_dims, num_dims); + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + std::vector inputs; + inputs.push_back(X.get()); + inputs.push_back(y.get()); + + // Run analytical gradient and get its data. + std::vector outputs(2); + s = RunModel(SoftmaxLossGradModel, ctx.get(), absl::MakeSpan(inputs), + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* dX_tensor; + s = GetValue(outputs[0], &dX_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + float danalytical[9] = {0}; // Contains data from analytical gradient. + memcpy(&danalytical[0], TF_TensorData(dX_tensor), + TF_TensorByteSize(dX_tensor)); + + // Run numerical gradient approximation using the GradientChecker API. + AbstractTensorHandle* g; // Will contain numerical approximation data. + s = CalcNumericalGrad(ctx.get(), SoftmaxModel, absl::MakeSpan(inputs), + /*input_index=*/0, + /*use_function=*/!std::get<2>(GetParam()), &g); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* gt; + s = GetValue(g, >); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + float dnumerical[9] = {0}; + memcpy(&dnumerical[0], TF_TensorData(gt), TF_TensorByteSize(gt)); + + // Now compare the two implementations: + for (int j = 0; j < 9; j++) { + ASSERT_NEAR(dnumerical[j], danalytical[j], /*abs_error=*/1e-2); + } + + // Only Unref() first output as 2nd is nullptr grad for labels + outputs[0]->Unref(); + TF_DeleteTensor(dX_tensor); + TF_DeleteTensor(gt); +} + +#ifdef PLATFORM_GOOGLE +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, GradientCheckerTest, + ::testing::Combine(::testing::Values("graphdef"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#else +INSTANTIATE_TEST_SUITE_P( + UnifiedCAPI, GradientCheckerTest, + ::testing::Combine(::testing::Values("graphdef"), + /*tfrt*/ ::testing::Values(false), + /*executing_eagerly*/ ::testing::Values(true, false))); +#endif +} // namespace +} // namespace internal +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/gradients.cc b/tensorflow/c/eager/gradients.cc index 9bcd0d0fea0..58ffcf247cf 100644 --- a/tensorflow/c/eager/gradients.cc +++ b/tensorflow/c/eager/gradients.cc @@ -122,14 +122,12 @@ int64 ToId(AbstractTensorHandle* t) { return static_cast(reinterpret_cast(t)); } -TapeTensor::TapeTensor(AbstractTensorHandle* handle, AbstractContext* ctx) - : handle_(handle), ctx_(ctx) { +TapeTensor::TapeTensor(AbstractTensorHandle* handle) : handle_(handle) { handle_->Ref(); } TapeTensor::TapeTensor(const TapeTensor& other) { handle_ = other.handle_; handle_->Ref(); - ctx_ = other.ctx_; } TapeTensor::~TapeTensor() { handle_->Unref(); } @@ -138,33 +136,7 @@ 
tensorflow::int64 TapeTensor::GetID() const { return ToId(handle_); } tensorflow::DataType TapeTensor::GetDType() const { return handle_->DataType(); } - -AbstractTensorHandle* TapeTensor::OnesLike() const { - AbstractOperationPtr op(ctx_->CreateOperation()); - Status s = op->Reset("OnesLike", /*raw_device_name=*/nullptr); - if (!s.ok()) { - return nullptr; - } - if (isa(op.get())) { - s = dyn_cast(op.get())->SetOpName( - absl::StrCat("OnesLike", ToId(handle_)).c_str()); - if (!s.ok()) { - return nullptr; - } - } - s = op->AddInput(handle_); - if (!s.ok()) { - return nullptr; - } - int num_outputs = 1; - // TODO(srbs): Figure out who is in charge of releasing this. - std::vector outputs(num_outputs); - s = op->Execute(absl::Span(outputs), &num_outputs); - if (!s.ok()) { - return nullptr; - } - return outputs[0]; -} +AbstractTensorHandle* TapeTensor::GetHandle() const { return handle_; } AbstractTensorHandle* TapeTensor::ZerosLike() const { return nullptr; } @@ -219,6 +191,23 @@ Status TapeVSpace::CallBackwardFunction( &ctx, incoming_gradients, result); } +Status TapeVSpace::BuildOnesLike(const TapeTensor& t, + AbstractTensorHandle** result) const { + AbstractOperationPtr op(ctx_->CreateOperation()); + TF_RETURN_IF_ERROR(op->Reset("OnesLike", /*raw_device_name=*/nullptr)); + if (isa(op.get())) { + TF_RETURN_IF_ERROR(dyn_cast(op.get())->SetOpName( + absl::StrCat("OnesLike", ToId(t.GetHandle())).c_str())); + } + TF_RETURN_IF_ERROR(op->AddInput(t.GetHandle())); + int num_outputs = 1; + std::vector outputs(num_outputs); + TF_RETURN_IF_ERROR( + op->Execute(absl::Span(outputs), &num_outputs)); + *result = outputs[0]; + return Status::OK(); +} + // Looks up the ID of a Gradient. int64 TapeVSpace::TensorId(AbstractTensorHandle* tensor) const { return ToId(tensor); @@ -226,7 +215,7 @@ int64 TapeVSpace::TensorId(AbstractTensorHandle* tensor) const { // Converts a Gradient to a TapeTensor. TapeTensor TapeVSpace::TapeTensorFromGradient(AbstractTensorHandle* g) const { - return TapeTensor(g, ctx_); + return TapeTensor(g); } void TapeVSpace::MarkAsResult(AbstractTensorHandle* gradient) const {} @@ -242,6 +231,7 @@ namespace internal { Status Reset(AbstractOperation* op_, const char* op, const char* raw_device_name, ForwardOperation* forward_op_) { forward_op_->op_name = op; + forward_op_->attrs.Reset(op); return op_->Reset(op, raw_device_name); } Status AddInput(AbstractOperation* op_, AbstractTensorHandle* input, @@ -418,9 +408,14 @@ Status Execute(AbstractOperation* op_, AbstractContext* ctx, // TODO(srbs): Manage refcount of ForwardOperation's inputs/outputs. forward_op_->outputs.push_back(retvals[i]); } + // TODO(b/166669239): This is needed to support AttrBuilder::Get for string + // attributes. Number type attrs and DataType attrs work fine without this. + // Consider getting rid of this and making the behavior between number types + // and string consistent. 
+ forward_op_->attrs.BuildNodeDef(); std::vector tape_tensors; for (auto t : retvals) { - tape_tensors.push_back(TapeTensor(t, ctx)); + tape_tensors.push_back(TapeTensor(t)); } tape->RecordOperation( op_->Name(), tape_tensors, input_ids, input_dtypes, diff --git a/tensorflow/c/eager/gradients.h b/tensorflow/c/eager/gradients.h index 04e11291404..f7d80cbeb34 100644 --- a/tensorflow/c/eager/gradients.h +++ b/tensorflow/c/eager/gradients.h @@ -80,7 +80,6 @@ struct ForwardOperation { std::vector inputs; std::vector outputs; AttrBuilder attrs; - AbstractContext* ctx; }; // Interface for building default zeros gradients for op outputs which are @@ -181,10 +180,6 @@ int64 ToId(AbstractTensorHandle* t); // allow us to trace the data dependencies between operations and hence compute // gradients. // -// This also implements `OnesLike` to create the default -// incoming gradients for tensors which do not already have an incoming -// gradient. -// // `ZerosLike` is not expected to be called and returns a nullptr. The creation // of default zeros grads is handled by the `DefaultGradientFunction` registered // for each op. @@ -193,20 +188,19 @@ int64 ToId(AbstractTensorHandle* t); // TODO(srbs): Should ZerosLike check-fail instead of returning nullptr? class TapeTensor { public: - TapeTensor(AbstractTensorHandle* handle, AbstractContext* ctx); + explicit TapeTensor(AbstractTensorHandle* handle); TapeTensor(const TapeTensor& other); ~TapeTensor(); tensorflow::int64 GetID() const; tensorflow::DataType GetDType() const; - AbstractTensorHandle* OnesLike() const; AbstractTensorHandle* ZerosLike() const; + AbstractTensorHandle* GetHandle() const; + private: AbstractTensorHandle* handle_; - // The context where OnesLike ops are to be created. - AbstractContext* ctx_; }; // Vector space for actually computing gradients. Implements methods for calling @@ -234,6 +228,10 @@ class TapeVSpace gtl::ArraySlice output_gradients, std::vector* result) const override; + // Builds a tensor filled with ones with the same shape and dtype as `t`. + Status BuildOnesLike(const TapeTensor& t, + AbstractTensorHandle** result) const override; + // Looks up the ID of a Gradient. int64 TensorId(AbstractTensorHandle* tensor) const override; diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index 80b1f157074..7fafd6eaa07 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_context.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_test_util.h" @@ -26,7 +27,9 @@ limitations under the License. 
#include "tensorflow/c/eager/gradients_internal.h" #include "tensorflow/c/experimental/gradients/array_grad.h" #include "tensorflow/c/experimental/gradients/math_grad.h" +#include "tensorflow/c/experimental/gradients/tape/tape_context.h" #include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" @@ -38,84 +41,32 @@ namespace gradients { namespace internal { namespace { using std::vector; +using tensorflow::TF_StatusPtr; using tracing::TracingOperation; class CppGradients : public ::testing::TestWithParam> { protected: void SetUp() override { - TF_SetTracingImplementation(std::get<0>(GetParam())); + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); } }; Status RegisterGradients(GradientRegistry* registry) { - TF_RETURN_IF_ERROR(registry->Register("Add", AddRegisterer)); + // TODO(srbs): Rename ops::Add to ops::AddV2 and AddRegister to + // AddV2Registerer. + TF_RETURN_IF_ERROR(registry->Register("AddV2", AddRegisterer)); TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); TF_RETURN_IF_ERROR(registry->Register("IdentityN", IdentityNRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Sqrt", SqrtRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Neg", NegRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Sub", SubRegisterer)); return Status::OK(); } -// Computes `inputs[0] + inputs[1]` and records it on the tape. -Status Add(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, - const GradientRegistry& registry) { - AbstractOperationPtr add_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR( - Reset(add_op.get(), "Add", /*raw_device_name=*/nullptr, &forward_op)); - if (isa(add_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(add_op.get())->SetOpName("my_add")); - } - TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[0], &forward_op)); - TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[1], &forward_op)); - int num_retvals = 1; - return Execute(add_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} - -// Computes `exp(inputs[0])` and records it on the tape. -Status Exp(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, - const GradientRegistry& registry) { - AbstractOperationPtr exp_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR( - Reset(exp_op.get(), "Exp", /*raw_device_name=*/nullptr, &forward_op)); - if (isa(exp_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(exp_op.get())->SetOpName("my_exp")); - } - TF_RETURN_IF_ERROR(AddInput(exp_op.get(), inputs[0], &forward_op)); - int num_retvals = 1; - return Execute(exp_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} - -// Computes `IdentityN(inputs)` and records it on the tape. 
-Status IdentityN(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, - const GradientRegistry& registry) { - AbstractOperationPtr identity_n_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR(Reset(identity_n_op.get(), "IdentityN", - /*raw_device_name=*/nullptr, &forward_op)); - if (isa(identity_n_op.get())) { - TF_RETURN_IF_ERROR(dyn_cast(identity_n_op.get()) - ->SetOpName("my_identity_n")); - } - TF_RETURN_IF_ERROR(AddInputList(identity_n_op.get(), inputs, &forward_op)); - int num_retvals = outputs.size(); - return Execute(identity_n_op.get(), ctx, outputs, &num_retvals, &forward_op, - tape, registry); -} - // Computes // y = inputs[0] + inputs[1] // return grad(y, {inputs[0], inputs[1]}) @@ -128,8 +79,10 @@ Status AddGradModel(AbstractContext* ctx, tape->Watch(ToId(inputs[0])); // Watch x. tape->Watch(ToId(inputs[1])); // Watch y. std::vector add_outputs(1); - TF_RETURN_IF_ERROR(Add(ctx, tape, inputs, absl::MakeSpan(add_outputs), - registry)); // Compute x+y. + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::Add(tape_ctx.get(), inputs, + absl::MakeSpan(add_outputs), + "Add")); // Compute x+y. std::unordered_map source_tensors_that_are_targets; @@ -160,8 +113,9 @@ Status ExpGradModel(AbstractContext* ctx, auto tape = new Tape(/*persistent=*/false); tape->Watch(ToId(inputs[0])); // Watch x. std::vector exp_outputs(1); - TF_RETURN_IF_ERROR(Exp(ctx, tape, inputs, absl::MakeSpan(exp_outputs), - registry)); // Compute x+y. + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR( + ops::Exp(tape_ctx.get(), inputs, absl::MakeSpan(exp_outputs), "Exp")); std::unordered_map source_tensors_that_are_targets; @@ -179,6 +133,37 @@ Status ExpGradModel(AbstractContext* ctx, return Status::OK(); } +// Computes +// y = sqrt(inputs[0]) +// return grad(y, {inputs[0]}) +Status SqrtGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. 
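+  // [Editorial annotation, not part of the patch] The TapeContext created
+  // below is what replaces the hand-rolled Add/Exp/IdentityN tracing helpers
+  // deleted above: any op executed through tape_ctx (here ops::Sqrt) is
+  // recorded on `tape` against `registry`, so the tape->ComputeGradient call
+  // that follows can use the Sqrt gradient registered via SqrtRegisterer.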
+ std::vector sqrt_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR( + ops::Sqrt(tape_ctx.get(), inputs, absl::MakeSpan(sqrt_outputs), "Sqrt")); + std::unordered_map + source_tensors_that_are_targets; + + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(sqrt_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto sqrt_output : sqrt_outputs) { + sqrt_output->Unref(); + } + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + // Computes // ignored, y = IdentityN(inputs[0], inputs[1]) // return grad(y, {inputs[0], inputs[1]}) @@ -193,8 +178,9 @@ Status IdentityNGradModel(AbstractContext* ctx, tape->Watch(ToId(inputs[1])); vector identity_n_outputs(2); - TF_RETURN_IF_ERROR(IdentityN(ctx, tape, inputs, - absl::MakeSpan(identity_n_outputs), registry)); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::IdentityN( + tape_ctx.get(), inputs, absl::MakeSpan(identity_n_outputs), "IdentityN")); std::unordered_map source_tensors_that_are_targets; @@ -214,6 +200,73 @@ Status IdentityNGradModel(AbstractContext* ctx, return Status::OK(); } +// Computes +// y = - inputs[0] +// return grad(y, {inputs[0]}) +Status NegGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); + + std::vector neg_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR( + ops::Neg(tape_ctx.get(), inputs, absl::MakeSpan(neg_outputs), "Neg")); + + std::unordered_map + source_tensors_that_are_targets; + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(neg_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto neg_output : neg_outputs) { + neg_output->Unref(); + } + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + +// Computes +// y = inputs[0] - inputs[1] +// return grad(y, {inputs[0], inputs[1]}) +Status SubGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); // Watch x. + tape->Watch(ToId(inputs[1])); // Watch y. + std::vector sub_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::Sub(tape_ctx.get(), inputs, + absl::MakeSpan(sub_outputs), + "Sub")); // Compute x-y. 
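+  // [Editorial annotation, not part of the patch] For z = x - y the registered
+  // Sub gradient yields dz/dx = 1 and dz/dy = -1, which is what TestSubGrad
+  // later asserts on outputs[0] and outputs[1].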
+ std::unordered_map + source_tensors_that_are_targets; + + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(sub_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0]), ToId(inputs[1])}, + source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto sub_output : sub_outputs) { + sub_output->Unref(); + } + outputs[0] = out_grads[0]; + outputs[1] = out_grads[1]; + delete tape; + return Status::OK(); +} + AbstractContext* BuildFunction(const char* fn_name) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -448,6 +501,50 @@ TEST_P(CppGradients, TestExpGrad) { result_tensor = nullptr; } +TEST_P(CppGradients, TestSqrtGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // y = sqrt(x) + // outputs = tape.gradient(y, x) + std::vector outputs(1); + s = RunModel(SqrtGradModel, ctx.get(), {x.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_NEAR(*result_value, 0.5, 0.001); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + TEST_P(CppGradients, TestIdentityNGrad) { // Pseudo-code: // @@ -507,6 +604,161 @@ TEST_P(CppGradients, TestIdentityNGrad) { result_tensor = nullptr; } +TEST_P(CppGradients, TestNegGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // y = - x + // outputs = tape.gradient(y, x) + std::vector outputs(1); + s = RunModel(NegGradModel, ctx.get(), {x.get()}, absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, -1.0); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + +TEST_P(CppGradients, TestSubGrad) { + std::unique_ptr status( + 
TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + AbstractTensorHandlePtr y; + { + AbstractTensorHandle* y_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &y_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + y.reset(y_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(®istry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // tape.watch(y) + // y = x - y + // outputs = tape.gradient(y, [x, y]) + std::vector outputs(2); + s = RunModel(SubGradModel, ctx.get(), {x.get(), y.get()}, + absl::MakeSpan(outputs), + /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, 1.0); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; + + s = getValue(outputs[1], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, -1.0); + outputs[1]->Unref(); + TF_DeleteTensor(result_tensor); +} + +TEST_P(CppGradients, TestSetAttrString) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr t; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 1.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + t.reset(x_raw); + } + + AbstractOperationPtr check_numerics_op(ctx->CreateOperation()); + ForwardOperation forward_op; + Status s = Reset(check_numerics_op.get(), "CheckNumerics", + /*raw_device_name=*/nullptr, &forward_op); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + if (isa(check_numerics_op.get())) { + s = dyn_cast(check_numerics_op.get()) + ->SetOpName("check_numerics"); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + } + s = AddInput(check_numerics_op.get(), t.get(), &forward_op); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + string message = "This is the way!"; + s = SetAttrString(check_numerics_op.get(), "message", message.data(), + message.length(), &forward_op); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + int num_retvals = 1; + std::vector outputs(1); + GradientRegistry registry; + std::unique_ptr tape(new Tape(/*persistent=*/false)); + s = Execute(check_numerics_op.get(), ctx.get(), absl::MakeSpan(outputs), + &num_retvals, &forward_op, tape.get(), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + string read_message; + s = forward_op.attrs.Get("message", &read_message); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ASSERT_EQ(read_message, message); +} + // 
TODO(b/164171226): Enable this test with tfrt after AddInputList is // supported. It is needed for IdentityN. #ifdef PLATFORM_GOOGLE diff --git a/tensorflow/c/eager/gradients_util.cc b/tensorflow/c/eager/gradients_util.cc new file mode 100644 index 00000000000..e53faf4a3f3 --- /dev/null +++ b/tensorflow/c/eager/gradients_util.cc @@ -0,0 +1,317 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/gradients_util.h" + +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace gradients { + +using namespace std; + +Status ScalarTensorHandleHelper(TFE_Context* ctx, float value, + TFE_TensorHandle** result) { + float data[] = {value}; + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, nullptr, 0, status.get()); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status.get()); + *result = th; + TF_DeleteTensor(t); + return StatusFromTF_Status(status.get()); +} + +Status TensorHandleWithDimsFloatHelper(TFE_Context* ctx, float data[], + int64_t dims[], int num_dims, + TFE_TensorHandle** result) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_FLOAT, &dims[0], num_dims, status.get()); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status.get()); + *result = th; + TF_DeleteTensor(t); + return StatusFromTF_Status(status.get()); +} + +Status TensorHandleWithDimsIntHelper(TFE_Context* ctx, int data[], + int64_t dims[], int num_dims, + TFE_TensorHandle** result) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Tensor* t = + TFE_AllocateHostTensor(ctx, TF_INT32, &dims[0], num_dims, status.get()); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TFE_TensorHandle* th = TFE_NewTensorHandleFromTensor(ctx, t, status.get()); + *result = th; + TF_DeleteTensor(t); + return StatusFromTF_Status(status.get()); +} + +// Get a scalar TensorHandle with given value +Status ScalarTensorHandle(AbstractContext* ctx, float value, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + 
TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager; + TF_RETURN_IF_ERROR(ScalarTensorHandleHelper(eager_ctx, value, &input_eager)); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return StatusFromTF_Status(status.get()); +} + +// Get a TensorHandle with given float values and dimensions +Status TensorHandleWithDimsFloat(AbstractContext* ctx, float data[], + int64_t dims[], int num_dims, + AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager; + TF_RETURN_IF_ERROR(TensorHandleWithDimsFloatHelper(eager_ctx, data, dims, + num_dims, &input_eager)); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return StatusFromTF_Status(status.get()); +} + +// Get a TensorHandle with given int values and dimensions +Status TensorHandleWithDimsInt(AbstractContext* ctx, int data[], int64_t dims[], + int num_dims, AbstractTensorHandle** tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_TensorHandle* input_eager; + TF_RETURN_IF_ERROR(TensorHandleWithDimsIntHelper(eager_ctx, data, dims, + num_dims, &input_eager)); + *tensor = + unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); + return StatusFromTF_Status(status.get()); +} + +Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_TensorHandle* result_t = + TF_AbstractTensorGetEagerTensor(wrap(t), status.get()); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + *result_tensor = TFE_TensorHandleResolve(result_t, status.get()); + return StatusFromTF_Status(status.get()); +} + +AbstractTensorHandlePtr GetTensorHandleUtilFloat(AbstractContext* ctx, + float vals[], int64_t dims[], + int num_dims) { + AbstractTensorHandlePtr A; + AbstractTensorHandle* a_raw = nullptr; + Status s = TensorHandleWithDimsFloat(ctx, vals, dims, num_dims, &a_raw); + if (s.ok()) { + A.reset(a_raw); + } + return A; +} + +AbstractTensorHandlePtr GetTensorHandleUtilInt(AbstractContext* ctx, int vals[], + int64_t dims[], int num_dims) { + AbstractTensorHandlePtr A; + AbstractTensorHandle* a_raw = nullptr; + Status s = TensorHandleWithDimsInt(ctx, vals, dims, num_dims, &a_raw); + if (s.ok()) { + A.reset(a_raw); + } + return A; +} + +AbstractTensorHandlePtr GetScalarTensorHandleUtil(AbstractContext* ctx, + float val) { + AbstractTensorHandlePtr y; + AbstractTensorHandle* y_raw = nullptr; + Status s = ScalarTensorHandle(ctx, val, &y_raw); + if (s.ok()) { + y.reset(y_raw); + } + return y; +} + +Status UpdateWeights(AbstractContext* ctx, vector& grads, + vector& weights, + AbstractTensorHandle* learning_rate) { + /* Update weights one by one using gradient update rule: + * + * w -= lr*grad[w] + * + * NOTE: assuming learning rate is positive + */ + + int num_grads = grads.size(); + vector temp_outputs(1); + std::string update_str; + + // Negate learning rate for gradient descent + TF_RETURN_IF_ERROR(ops::Neg(ctx, {learning_rate}, + absl::MakeSpan(temp_outputs), + 
"neg_lr")); // Compute -lr + learning_rate = temp_outputs[0]; + + for (int i = 0; i < num_grads; i++) { + // Compute dW = -lr * grad(w[i]) + update_str = "update_mul_" + std::to_string(i); + TF_RETURN_IF_ERROR(ops::Mul(ctx, {learning_rate, grads[i]}, + absl::MakeSpan(temp_outputs), + update_str.c_str())); + + AbstractTensorHandle* dW = temp_outputs[0]; + + // Compute temp = weights[i] + dW + update_str = "update_add_" + std::to_string(i); + TF_RETURN_IF_ERROR(ops::Add(ctx, {weights[i], dW}, + absl::MakeSpan(temp_outputs), + update_str.c_str())); + + // Update the weights + weights[i] = temp_outputs[0]; + } + + return Status::OK(); +} + +AbstractContext* BuildFunction(const char* fn_name) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name, status.get()); + return unwrap(graph_ctx); +} + +Status CreateParamsForInputs(AbstractContext* ctx, + absl::Span inputs, + vector* params) { + tracing::TracingTensorHandle* handle = nullptr; + for (auto input : inputs) { + TF_RETURN_IF_ERROR(dyn_cast(ctx)->AddParameter( + input->DataType(), &handle)); + params->emplace_back(handle); + } + return Status::OK(); +} + +Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, bool use_function, + const GradientRegistry& registry) { + if (use_function) { + const char* fn_name = "test_fn"; + std::unique_ptr scoped_func; + // Returning null tensors from a tf.function is not supported, so we keep + // track of indices in the model's outputs are nullptr in this set. + // The FunctionDef only outputs the non-null tensors. We later pad the + // function op outputs to have nullptrs at the `null_indices`. + absl::flat_hash_set null_indices; + { + AbstractContextPtr func_ctx(BuildFunction(fn_name)); + vector func_inputs; + func_inputs.reserve(inputs.size()); + TF_RETURN_IF_ERROR( + CreateParamsForInputs(func_ctx.get(), inputs, &func_inputs)); + vector model_outputs; + model_outputs.resize(outputs.size()); + TF_RETURN_IF_ERROR(model(func_ctx.get(), absl::MakeSpan(func_inputs), + absl::MakeSpan(model_outputs), registry)); + for (auto func_input : func_inputs) { + func_input->Unref(); + } + AbstractFunction* func = nullptr; + OutputList output_list; + output_list.expected_num_outputs = 0; + output_list.outputs.reserve(outputs.size()); + for (int i = 0; i < model_outputs.size(); i++) { + if (model_outputs[i]) { + output_list.outputs.emplace_back(model_outputs[i]); + output_list.expected_num_outputs += 1; + } else { + null_indices.insert(i); + } + } + TF_RETURN_IF_ERROR(dyn_cast(func_ctx.get()) + ->Finalize(&output_list, &func)); + scoped_func.reset(func); + for (auto output : output_list.outputs) { + output->Unref(); + } + TF_RETURN_IF_ERROR(ctx->RegisterFunction(func)); + } + + AbstractOperationPtr fn_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(fn_op->Reset(fn_name, /*raw_device_name=*/nullptr)); + for (auto input : inputs) { + TF_RETURN_IF_ERROR(fn_op->AddInput(input)); + } + int retvals = outputs.size() - null_indices.size(); + vector fn_outputs(retvals); + TF_RETURN_IF_ERROR(fn_op->Execute( + absl::Span(fn_outputs.data(), fn_outputs.size()), + &retvals)); + int skipped_indices = 0; + for (int i = 0; i < outputs.size(); i++) { + if (!null_indices.contains(i)) { + outputs[i] = fn_outputs[i - skipped_indices]; + } else { + skipped_indices += 1; + } + } + TF_RETURN_IF_ERROR(ctx->RemoveFunction(fn_name)); + return Status::OK(); + } else { + return model(ctx, inputs, outputs, registry); + } +} + +Status 
BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + *ctx = unwrap(TF_NewEagerExecutionContext(opts, status.get())); + TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); + TFE_DeleteContextOptions(opts); + return Status::OK(); +} + +} // namespace gradients +} // namespace tensorflow \ No newline at end of file diff --git a/tensorflow/c/eager/gradients_util.h b/tensorflow/c/eager/gradients_util.h new file mode 100644 index 00000000000..cd0bbc0720d --- /dev/null +++ b/tensorflow/c/eager/gradients_util.h @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/gradients.h" +#include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/experimental/ops/array_ops.h" +#include "tensorflow/c/experimental/ops/math_ops.h" +#include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_tensor.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace gradients { + +// Get a scalar TensorHandle with given value +Status ScalarTensorHandle(AbstractContext* ctx, float value, + AbstractTensorHandle** tensor); + +// Get a TensorHandle with given float values and dimensions +Status TensorHandleWithDimsFloat(AbstractContext* ctx, float data[], + int64_t dims[], int num_dims, + AbstractTensorHandle** tensor); + +// Get a TensorHandle with given int values and dimensions +Status TensorHandleWithDimsInt(AbstractContext* ctx, int data[], int64_t dims[], + int num_dims, AbstractTensorHandle** tensor); + +// Places data from `t` into *result_tensor. +Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor); + +// Util function that wraps an AbstractTensorHandle* with given data and dims. +AbstractTensorHandlePtr GetTensorHandleUtilFloat(AbstractContext* ctx, + float vals[], int64_t dims[], + int num_dims); + +// Util function that wraps an AbstractTensorHandle* with given data and dims. +AbstractTensorHandlePtr GetTensorHandleUtilInt(AbstractContext* ctx, int vals[], + int64_t dims[], int num_dims); + +// Util function that wraps an AbstractTensorHandle* with given data. 
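// Sketch of how these helpers compose in a test (all functions below are
// declared in this header; `MyModel` and `registry` stand in for a
// test-provided Model and GradientRegistry, and ownership/error handling is
// collapsed into TF_CHECK_OK for brevity):
//
//   AbstractContext* ctx = nullptr;
//   TF_CHECK_OK(BuildImmediateExecutionContext(/*use_tfrt=*/false, &ctx));
//   float vals[] = {1.0f, 2.0f, 3.0f, 4.0f};
//   int64_t dims[] = {2, 2};
//   AbstractTensorHandlePtr x =
//       GetTensorHandleUtilFloat(ctx, vals, dims, /*num_dims=*/2);
//   std::vector<AbstractTensorHandle*> outputs(1);
//   TF_CHECK_OK(RunModel(MyModel, ctx, {x.get()}, absl::MakeSpan(outputs),
//                        /*use_function=*/false, registry));
//   TF_Tensor* result = nullptr;
//   TF_CHECK_OK(GetValue(outputs[0], &result));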
+AbstractTensorHandlePtr GetScalarTensorHandleUtil(AbstractContext* ctx, + float val); + +// Performs gradient update for each weight using given learning rate. +Status UpdateWeights(AbstractContext* ctx, + std::vector& grads, + std::vector& weights, + AbstractTensorHandle* learning_rate); + +using Model = std::function, + absl::Span, const GradientRegistry&)>; + +// Runs given model in either graph or eager mode depending on value of +// use_function. +Status RunModel(Model model, AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, bool use_function, + const GradientRegistry& registry); + +// Builds context and returns inside *ctx. +Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx); + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/immediate_execution_context.h b/tensorflow/c/eager/immediate_execution_context.h index 02a3320ef65..a3e3857b34b 100644 --- a/tensorflow/c/eager/immediate_execution_context.h +++ b/tensorflow/c/eager/immediate_execution_context.h @@ -29,8 +29,25 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/tstring.h" +#include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { +class EagerExecutor; + +// LINT.IfChange +// Note: Keep in sync with exported copy of enum in eager/c_api.h. +enum ContextDevicePlacementPolicy { + // Running operations with input tensors on the wrong device will fail. + DEVICE_PLACEMENT_EXPLICIT = 0, + // Copy the tensor to the right device but log a warning. + DEVICE_PLACEMENT_WARN = 1, + // Silently copy the tensor, which has a performance cost since the operation + // will be blocked till the copy completes. This is the default policy. + DEVICE_PLACEMENT_SILENT = 2, + // Placement policy which silently copies int32 tensors but not other dtypes. + DEVICE_PLACEMENT_SILENT_FOR_INT32 = 3, +}; +// LINT.ThenChange(//tensorflow/c/eager/c_api.h) // Abstract interface to a context. // @@ -81,14 +98,6 @@ class ImmediateExecutionContext : public AbstractContext { // List attributes of available devices virtual void ListDevices(std::vector* devices) = 0; - virtual void ClearCachesAndThreadExecutors() = 0; - - // Initialize the step resource container for a training step. This is used - // in current TF runtime. For tfrt, it is used by fallback op handler. - virtual void StartStep() = 0; - // Destroy the step resource container for a training step. - virtual void EndStep() = 0; - // Block until all pending nodes are finished. virtual Status AsyncWait() = 0; @@ -97,11 +106,52 @@ class ImmediateExecutionContext : public AbstractContext { // already exists. virtual Status AddFunctionDef(const FunctionDef& fdef) = 0; + // Find and return a added function by its name. + virtual const FunctionDef* FindFunctionDef(const string& name) const = 0; + + // Return the ParsedName of Host CPU device. + virtual const DeviceNameUtils::ParsedName& HostCPUParsedName() const = 0; + + // Configure soft device placement policy. + virtual void SetAllowSoftPlacement(bool enable) = 0; + + // Configure device placement policy logging. + virtual void SetLogDevicePlacement(bool enable) = 0; + + // Sets the device placement policy for the current thread. + virtual void SetThreadLocalDevicePlacementPolicy( + ContextDevicePlacementPolicy policy) = 0; + // Returns the device placement policy for the current thread. 
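// As the LINT block above notes, ContextDevicePlacementPolicy is meant to stay
// in sync with TFE_ContextDevicePlacementPolicy in tensorflow/c/eager/c_api.h.
// A compile-time check a translation unit that sees both headers could add is
// sketched below (illustrative only, not something introduced by this change):
//
//   static_assert(static_cast<int>(DEVICE_PLACEMENT_SILENT) ==
//                     static_cast<int>(TFE_DEVICE_PLACEMENT_SILENT),
//                 "ContextDevicePlacementPolicy must match the C API enum");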
+ virtual ContextDevicePlacementPolicy GetDevicePlacementPolicy() const = 0; + // For LLVM style RTTI. static bool classof(const AbstractContext* ptr) { return ptr->getKind() == kEager || ptr->getKind() == kTfrt; } + //===--------------------------------------------------------------------===// + // Following are legacy features in TF Eager Runtime. + // TODO(tf-runtime): Figure out a way to deprecate following features after + // migrated to TFRT. + //===--------------------------------------------------------------------===// + // Clear pending nodes in thread executors and kernel caches. + virtual void ClearCachesAndThreadExecutors() = 0; + + // Initialize the step resource container for a training step. This is used + // in current TF runtime. For tfrt, it is used by fallback op handler. + virtual void StartStep() = 0; + // Destroy the step resource container for a training step. + virtual void EndStep() = 0; + + // Return the Eager Executor for current thread. Please note that Eager + // Executor is only used in current TF but not in TFRT. + virtual EagerExecutor& Executor() = 0; + // Update the Eager Executor for current thread. + virtual void SetExecutorForThread(EagerExecutor* executor) = 0; + + // Configure graph collection in RunMetadata. + virtual void SetShouldStoreGraphs(bool value) = 0; + protected: explicit ImmediateExecutionContext(AbstractContextKind kind) : AbstractContext(kind) {} diff --git a/tensorflow/c/eager/immediate_execution_operation.h b/tensorflow/c/eager/immediate_execution_operation.h index ee212b21a96..7b68ec2c9f4 100644 --- a/tensorflow/c/eager/immediate_execution_operation.h +++ b/tensorflow/c/eager/immediate_execution_operation.h @@ -47,9 +47,6 @@ class ImmediateExecutionOperation : public AbstractOperation { virtual Status InputLength(const char* input_name, int* length) = 0; virtual Status OutputLength(const char* output_name, int* length) = 0; - // Experimental - virtual Status SetUseXla(bool enable) = 0; - // Set stack trace to be used for potential async error reporting. virtual void SetStackTrace(AbstractStackTrace stack_trace) = 0; diff --git a/tensorflow/c/eager/immediate_execution_tensor_handle.h b/tensorflow/c/eager/immediate_execution_tensor_handle.h index 6d32d482747..bb6d471f12f 100644 --- a/tensorflow/c/eager/immediate_execution_tensor_handle.h +++ b/tensorflow/c/eager/immediate_execution_tensor_handle.h @@ -44,6 +44,10 @@ class ImmediateExecutionTensorHandle : public AbstractTensorHandle { virtual const char* DeviceName(Status* status) const = 0; // Returns the device where the tensor was placed. virtual const char* BackingDeviceName(Status* status) const = 0; + // Returns the device type which created the handle. + virtual const char* DeviceType(Status* status) const = 0; + // Returns the device ID which created the handle. + virtual int DeviceId(Status* status) const = 0; // Returns a tensor for the handle. If tensor is remote, it will be copied. virtual AbstractTensorInterface* Resolve(Status* status) = 0; diff --git a/tensorflow/c/eager/mnist_gradients_test.cc b/tensorflow/c/eager/mnist_gradients_test.cc index d6dd94806a7..4114f50a798 100644 --- a/tensorflow/c/eager/mnist_gradients_test.cc +++ b/tensorflow/c/eager/mnist_gradients_test.cc @@ -14,11 +14,11 @@ limitations under the License. 
#include "absl/types/span.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/c_api_experimental.h" -#include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients.h" #include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" #include "tensorflow/c/eager/mnist_gradients_testutil.h" #include "tensorflow/c/experimental/gradients/math_grad.h" #include "tensorflow/c/experimental/gradients/nn_grad.h" @@ -33,12 +33,16 @@ namespace tensorflow { namespace gradients { namespace internal { namespace { +using tensorflow::TF_StatusPtr; class CppGradients : public ::testing::TestWithParam> { protected: void SetUp() override { - TF_SetTracingImplementation(std::get<0>(GetParam())); + TF_StatusPtr status(TF_NewStatus()); + TF_SetTracingImplementation(std::get<0>(GetParam()), status.get()); + Status s = StatusFromTF_Status(status.get()); + CHECK_EQ(errors::OK, s.code()) << s.error_message(); } }; @@ -49,89 +53,10 @@ Status RegisterGradients(GradientRegistry* registry) { TF_RETURN_IF_ERROR(registry->Register("Relu", ReluRegisterer)); TF_RETURN_IF_ERROR( registry->Register("SparseSoftmaxCrossEntropyWithLogits", - SparseSoftmaxCrossEntropyLossRegisterer)); + SparseSoftmaxCrossEntropyWithLogitsRegisterer)); return Status::OK(); } -// ========================= Test Util Functions ============================== - -// Get a scalar TensorHandle with given value -Status TestScalarTensorHandle(AbstractContext* ctx, float value, - AbstractTensorHandle** tensor) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TFE_Context* eager_ctx = - TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); - TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); - TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, value); - *tensor = - unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); - return Status::OK(); -} - -// Get a Matrix TensorHandle with given float values and dimensions -Status TestTensorHandleWithDimsFloat(AbstractContext* ctx, float data[], - int64_t dims[], int num_dims, - AbstractTensorHandle** tensor) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TFE_Context* eager_ctx = - TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); - TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); - TFE_TensorHandle* input_eager = - TestTensorHandleWithDimsFloat(eager_ctx, data, dims, num_dims); - *tensor = - unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); - return Status::OK(); -} - -// Get a Matrix TensorHandle with given int values and dimensions -Status TestTensorHandleWithDimsInt(AbstractContext* ctx, int data[], - int64_t dims[], int num_dims, - AbstractTensorHandle** tensor) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TFE_Context* eager_ctx = - TF_ExecutionContextGetTFEContext(wrap(ctx), status.get()); - TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); - TFE_TensorHandle* input_eager = - TestTensorHandleWithDimsInt(eager_ctx, data, dims, num_dims); - *tensor = - unwrap(TF_CreateAbstractTensorFromEagerTensor(input_eager, status.get())); - return Status::OK(); -} - -Status GetValue(AbstractTensorHandle* t, TF_Tensor** result_tensor) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TFE_TensorHandle* result_t = - TF_AbstractTensorGetEagerTensor(wrap(t), 
status.get()); - TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); - *result_tensor = TFE_TensorHandleResolve(result_t, status.get()); - return Status::OK(); -} - -AbstractTensorHandlePtr GetTensorHandleUtilFloat(AbstractContext* ctx, - float vals[], int64_t dims[], - int num_dims) { - AbstractTensorHandlePtr A; - AbstractTensorHandle* a_raw = nullptr; - Status s = TestTensorHandleWithDimsFloat(ctx, vals, dims, num_dims, &a_raw); - A.reset(a_raw); - return A; -} - -AbstractTensorHandlePtr GetTensorHandleUtilInt(AbstractContext* ctx, int vals[], - int64_t dims[], int num_dims) { - AbstractTensorHandlePtr A; - AbstractTensorHandle* a_raw = nullptr; - Status s = TestTensorHandleWithDimsInt(ctx, vals, dims, num_dims, &a_raw); - A.reset(a_raw); - return A; -} - -// =========================== Start Tests ================================ - TEST_P(CppGradients, TestMatMulGrad) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -465,6 +390,12 @@ TEST_P(CppGradients, TestReluGrad) { } TEST_P(CppGradients, TestSoftmaxLossGrad) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. + GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -533,6 +464,12 @@ TEST_P(CppGradients, TestSoftmaxLossGrad) { } TEST_P(CppGradients, TestMNISTGrad) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. + GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); AbstractContextPtr ctx; @@ -603,7 +540,6 @@ TEST_P(CppGradients, TestMNISTGrad) { TF_TensorByteSize(dW1_tensor)); float expected_dW1[4] = {0.0f, 3.2f, 0.0f, 4.8f}; - ; // dLoss for (int j = 0; j < 4; j++) { ASSERT_NEAR(result_data[j], expected_dW1[j], tolerance); } @@ -643,7 +579,7 @@ TEST_P(CppGradients, TestScalarMul) { AbstractTensorHandlePtr eta; { AbstractTensorHandle* x_raw = nullptr; - Status s = TestScalarTensorHandle(ctx.get(), 1.5f, &x_raw); + Status s = ScalarTensorHandle(ctx.get(), 1.5f, &x_raw); ASSERT_EQ(errors::OK, s.code()) << s.error_message(); eta.reset(x_raw); } @@ -681,6 +617,12 @@ TEST_P(CppGradients, TestScalarMul) { } TEST_P(CppGradients, TestMNIST_Training) { + bool use_function = !std::get<2>(GetParam()); + if (use_function) { + // TODO(b/168850692): Enable this. 
+ GTEST_SKIP() << "Can't take gradient of " + "SparseSoftmaxCrossEntropyWithLogits in tracing mode."; + } std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -733,7 +675,7 @@ TEST_P(CppGradients, TestMNIST_Training) { // Set learning rate to be 1e-1 AbstractTensorHandle* learning_rate = nullptr; - s = TestScalarTensorHandle(ctx.get(), 1e-1, &learning_rate); + s = ScalarTensorHandle(ctx.get(), 1e-1, &learning_rate); ASSERT_EQ(errors::OK, s.code()) << s.error_message(); // Train @@ -765,13 +707,13 @@ TEST_P(CppGradients, TestMNIST_Training) { #ifdef PLATFORM_GOOGLE INSTANTIATE_TEST_SUITE_P( UnifiedCAPI, CppGradients, - ::testing::Combine(::testing::Values("graphdef"), + ::testing::Combine(::testing::Values("graphdef", "mlir"), /*tfrt*/ ::testing::Values(false), /*executing_eagerly*/ ::testing::Values(true, false))); #else INSTANTIATE_TEST_SUITE_P( UnifiedCAPI, CppGradients, - ::testing::Combine(::testing::Values("graphdef"), + ::testing::Combine(::testing::Values("graphdef", "mlir"), /*tfrt*/ ::testing::Values(false), /*executing_eagerly*/ ::testing::Values(true, false))); #endif diff --git a/tensorflow/c/eager/mnist_gradients_testutil.cc b/tensorflow/c/eager/mnist_gradients_testutil.cc index 4b2c87c678d..6688d9d4e75 100644 --- a/tensorflow/c/eager/mnist_gradients_testutil.cc +++ b/tensorflow/c/eager/mnist_gradients_testutil.cc @@ -24,136 +24,19 @@ limitations under the License. #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/eager/gradients.h" #include "tensorflow/c/eager/gradients_internal.h" +#include "tensorflow/c/eager/gradients_util.h" +#include "tensorflow/c/experimental/gradients/tape/tape_context.h" #include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/experimental/ops/math_ops.h" #include "tensorflow/c/experimental/ops/nn_ops.h" -#include "tensorflow/c/tf_status_helper.h" -#include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" + +namespace tensorflow { +namespace gradients { +namespace internal { + using std::vector; -using tracing::TracingOperation; - -// ========================== Tape Ops ============================== - -// Computes `inputs[0] + inputs[1]` and records it on the tape. -Status Add(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, - const GradientRegistry& registry) { - AbstractOperationPtr add_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR( - Reset(add_op.get(), "Add", /*raw_device_name=*/nullptr, &forward_op)); - if (isa(add_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(add_op.get())->SetOpName("my_add")); - } - TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[0], &forward_op)); - TF_RETURN_IF_ERROR(AddInput(add_op.get(), inputs[1], &forward_op)); - int num_retvals = 1; - return Execute(add_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} - -// Computes `inputs[0] * inputs[1]` for matrices and records it on the tape. 
-Status MatMul(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - bool transpose_a, bool transpose_b, - const GradientRegistry& registry) { - AbstractOperationPtr matmul_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR(Reset(matmul_op.get(), "MatMul", - /*raw_device_name=*/nullptr, &forward_op)); - if (isa(matmul_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(matmul_op.get())->SetOpName(name)); - } - - TF_RETURN_IF_ERROR(AddInput(matmul_op.get(), inputs[0], &forward_op)); - TF_RETURN_IF_ERROR(AddInput(matmul_op.get(), inputs[1], &forward_op)); - TF_RETURN_IF_ERROR(tensorflow::gradients::internal::SetAttrBool( - matmul_op.get(), "transpose_a", transpose_a, &forward_op)); - TF_RETURN_IF_ERROR(tensorflow::gradients::internal::SetAttrBool( - matmul_op.get(), "transpose_b", transpose_b, &forward_op)); - - int num_retvals = 1; - return Execute(matmul_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} - -Status Mul(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - const GradientRegistry& registry) { - AbstractOperationPtr mul_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR( - Reset(mul_op.get(), "Mul", /*raw_device_name=*/nullptr, &forward_op)); - if (isa(mul_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(mul_op.get())->SetOpName(name)); - } - - TF_RETURN_IF_ERROR(AddInput(mul_op.get(), inputs[0], &forward_op)); - TF_RETURN_IF_ERROR(AddInput(mul_op.get(), inputs[1], &forward_op)); - - int num_retvals = 1; - return Execute(mul_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} - -// Computes `Relu(inputs[0])` and records it on the tape. -Status Relu(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - const GradientRegistry& registry) { - AbstractOperationPtr relu_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR( - Reset(relu_op.get(), "Relu", /*raw_device_name=*/nullptr, &forward_op)); - if (isa(relu_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(relu_op.get())->SetOpName(name)); - } - TF_RETURN_IF_ERROR(AddInput(relu_op.get(), inputs[0], &forward_op)); - int num_retvals = 1; - return Execute(relu_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} - -// Computes `SoftmaxLoss(scores, labels)` for matrices and records it on the -// tape. 
-Status SparseSoftmaxCrossEntropyLoss( - AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - const GradientRegistry& registry) { - AbstractTensorHandle* scores = inputs[0]; - AbstractTensorHandle* labels = inputs[1]; - - AbstractOperationPtr sm_op(ctx->CreateOperation()); - ForwardOperation forward_op; - forward_op.ctx = ctx; - TF_RETURN_IF_ERROR(Reset(sm_op.get(), "SparseSoftmaxCrossEntropyWithLogits", - /*raw_device_name=*/nullptr, &forward_op)); - if (isa(sm_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(sm_op.get())->SetOpName(name)); - } - - TF_RETURN_IF_ERROR(AddInput(sm_op.get(), scores, &forward_op)); - TF_RETURN_IF_ERROR(AddInput(sm_op.get(), labels, &forward_op)); - - int num_retvals = 2; // returns loss values and backprop - return Execute(sm_op.get(), ctx, outputs, &num_retvals, &forward_op, tape, - registry); -} //===================== Test Models to run ========================= @@ -169,8 +52,9 @@ Status AddGradModel(AbstractContext* ctx, tape->Watch(ToId(inputs[0])); // Watch x. tape->Watch(ToId(inputs[1])); // Watch y. std::vector add_outputs(1); - TF_RETURN_IF_ERROR(Add(ctx, tape, inputs, absl::MakeSpan(add_outputs), - registry)); // Compute x+y. + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR( + ops::Add(tape_ctx.get(), inputs, absl::MakeSpan(add_outputs), "Add")); std::unordered_map source_tensors_that_are_targets; @@ -202,9 +86,11 @@ Status MatMulGradModel(AbstractContext* ctx, tape->Watch(ToId(inputs[0])); // Watch x. tape->Watch(ToId(inputs[1])); // Watch y. vector mm_outputs(1); - TF_RETURN_IF_ERROR(MatMul(ctx, tape, inputs, absl::MakeSpan(mm_outputs), - "matmul0", /*transpose_a=*/false, - /*transpose_b=*/false, registry)); // Compute x*y. + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::MatMul(tape_ctx.get(), inputs, + absl::MakeSpan(mm_outputs), "matmul0", + /*transpose_a=*/false, + /*transpose_b=*/false)); // Compute x*y. std::unordered_map source_tensors_that_are_targets; @@ -238,8 +124,9 @@ Status MNISTForwardModel(AbstractContext* ctx, * hidden_layer = tf.nn.relu(mm_out_1) * scores = tf.matmul(hidden_layer,W2) * softmax = - * tf.nn.sparse_softmax_cross_entropy_with_logits(scores,y_labels) return - * scores, softmax + * tf.nn.sparse_softmax_cross_entropy_with_logits(scores, + * y_labels) + * return scores, softmax * * Use this convention for inputs: * @@ -257,24 +144,27 @@ Status MNISTForwardModel(AbstractContext* ctx, tape->Watch(ToId(W2)); // Watch W2. 
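// Forward pass assembled below, matching the pseudo-code in the comment above:
// scores = matmul(relu(matmul(X, W1)), W2) and
// loss   = sparse_softmax_cross_entropy_with_logits(scores, y_labels),
// with every op now issued through tape_ctx so the tape records it for
// backprop.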
vector temp_outputs(1); - TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), - "matmul0", /*transpose_a=*/false, - /*transpose_b=*/false, registry)); // Compute X*W1 + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::MatMul(tape_ctx.get(), {X, W1}, + absl::MakeSpan(temp_outputs), "matmul0", + /*transpose_a=*/false, + /*transpose_b=*/false)); // Compute X*W1 - TF_RETURN_IF_ERROR(Relu(ctx, tape, {temp_outputs[0]}, - absl::MakeSpan(temp_outputs), "relu", - registry)); // Compute Relu(X*W1) + TF_RETURN_IF_ERROR(ops::Relu(tape_ctx.get(), {temp_outputs[0]}, + absl::MakeSpan(temp_outputs), + "relu")); // Compute Relu(X*W1) - TF_RETURN_IF_ERROR(MatMul(ctx, tape, {temp_outputs[0], W2}, - absl::MakeSpan(temp_outputs), "matmul1", - /*transpose_a=*/false, /*transpose_b=*/false, - registry)); // Compute W2*Relu(X*W1) + TF_RETURN_IF_ERROR(ops::MatMul( + tape_ctx.get(), {temp_outputs[0], W2}, absl::MakeSpan(temp_outputs), + "matmul1", + /*transpose_a=*/false, /*transpose_b=*/false)); // Compute W2*Relu(X*W1) AbstractTensorHandle* scores = temp_outputs[0]; - TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss( - ctx, tape, {scores, y_labels}, absl::MakeSpan(temp_outputs), - "softmax_loss", registry)); // Compute Softmax(Scores,labels) + temp_outputs.resize(2); + TF_RETURN_IF_ERROR(ops::SparseSoftmaxCrossEntropyWithLogits( + tape_ctx.get(), {scores, y_labels}, absl::MakeSpan(temp_outputs), + "softmax_loss")); // Compute Softmax(Scores,labels) AbstractTensorHandle* loss_vals = temp_outputs[0]; @@ -297,9 +187,11 @@ Status MatMulTransposeModel(AbstractContext* ctx, tape->Watch(ToId(W1)); vector temp_outputs(1); - TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), - "matmul0", /*transpose_a=*/true, - /*transpose_b=*/false, registry)); // Compute X*W1 + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::MatMul(tape_ctx.get(), {X, W1}, + absl::MakeSpan(temp_outputs), "matmul0", + /*transpose_a=*/true, + /*transpose_b=*/false)); // Compute X*W1 outputs[0] = temp_outputs[0]; @@ -315,8 +207,10 @@ Status ReluGradModel(AbstractContext* ctx, auto tape = new Tape(/*persistent=*/false); tape->Watch(ToId(inputs[0])); // Watch X vector relu_outputs(1); - TF_RETURN_IF_ERROR(Relu(ctx, tape, inputs, absl::MakeSpan(relu_outputs), - "relu0", registry)); // Relu(X) + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::Relu(tape_ctx.get(), inputs, + absl::MakeSpan(relu_outputs), + "relu0")); // Relu(X) std::unordered_map source_tensors_that_are_targets; @@ -346,8 +240,9 @@ Status SoftmaxLossGradModel(AbstractContext* ctx, tape->Watch(ToId(inputs[0])); // Watch scores. tape->Watch(ToId(inputs[1])); // Watch labels. vector sm_outputs(2); - TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss( - ctx, tape, inputs, absl::MakeSpan(sm_outputs), "softmax0", registry)); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::SparseSoftmaxCrossEntropyWithLogits( + tape_ctx.get(), inputs, absl::MakeSpan(sm_outputs), "softmax0")); std::unordered_map source_tensors_that_are_targets; @@ -381,29 +276,30 @@ Status MNISTGradModel(AbstractContext* ctx, tape->Watch(ToId(W1)); // Watch W1. tape->Watch(ToId(W2)); // Watch W1. 
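// MNISTGradModel re-runs the same forward computation as MNISTForwardModel,
// but through the tape context below, and (in the unchanged remainder of this
// function, outside this hunk) asks the tape for gradients with respect to W1
// and W2, which is why both weights are watched here. TestMNISTGrad above
// checks the resulting dW1 against {0.0f, 3.2f, 0.0f, 4.8f}.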
vector temp_outputs(1); - TF_RETURN_IF_ERROR(MatMul(ctx, tape, {X, W1}, absl::MakeSpan(temp_outputs), - "matmul0", /*transpose_a=*/false, - /*transpose_b=*/false, registry)); // Compute X*W1 + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::MatMul(tape_ctx.get(), {X, W1}, + absl::MakeSpan(temp_outputs), "matmul0", + /*transpose_a=*/false, + /*transpose_b=*/false)); // Compute X*W1 AbstractTensorHandle* mm = temp_outputs[0]; - TF_RETURN_IF_ERROR(Relu(ctx, tape, {mm}, - absl::MakeSpan(temp_outputs), // Relu(X*W1) - "relu0", registry)); + TF_RETURN_IF_ERROR(ops::Relu(tape_ctx.get(), {mm}, + absl::MakeSpan(temp_outputs), // Relu(X*W1) + "relu0")); AbstractTensorHandle* hidden = temp_outputs[0]; - TF_RETURN_IF_ERROR(MatMul(ctx, tape, {hidden, W2}, - absl::MakeSpan(temp_outputs), "matmul1", - /*transpose_a=*/false, /*transpose_b=*/false, - registry)); // W2*Relu(X*W1) + TF_RETURN_IF_ERROR(ops::MatMul( + tape_ctx.get(), {hidden, W2}, absl::MakeSpan(temp_outputs), "matmul1", + /*transpose_a=*/false, /*transpose_b=*/false)); // W2*Relu(X*W1) AbstractTensorHandle* scores = temp_outputs[0]; temp_outputs.resize(2); - TF_RETURN_IF_ERROR(SparseSoftmaxCrossEntropyLoss( - ctx, tape, {scores, y_labels}, absl::MakeSpan(temp_outputs), - "softmaxloss", registry)); // W2*Relu(X*W1) + TF_RETURN_IF_ERROR(ops::SparseSoftmaxCrossEntropyWithLogits( + tape_ctx.get(), {scores, y_labels}, absl::MakeSpan(temp_outputs), + "softmaxloss")); // W2*Relu(X*W1) AbstractTensorHandle* loss = temp_outputs[0]; @@ -440,8 +336,10 @@ Status ScalarMulModel(AbstractContext* ctx, auto tape = new Tape(/*persistent=*/false); vector temp_outputs(1); - TF_RETURN_IF_ERROR(Mul(ctx, tape, {eta, A}, absl::MakeSpan(temp_outputs), - "scalarMul0", registry)); // Compute eta*A + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::Mul(tape_ctx.get(), {eta, A}, + absl::MakeSpan(temp_outputs), + "scalarMul0")); // Compute eta*A outputs[0] = temp_outputs[0]; @@ -449,146 +347,69 @@ Status ScalarMulModel(AbstractContext* ctx, return Status::OK(); } +Status MatMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* X = inputs[0]; + AbstractTensorHandle* W1 = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + std::vector temp_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::MatMul(tape_ctx.get(), {X, W1}, + absl::MakeSpan(temp_outputs), "matmul0", + /*transpose_a=*/false, + /*transpose_b=*/false)); // Compute X*W1 + + outputs[0] = temp_outputs[0]; + delete tape; + return Status::OK(); +} + +Status MulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* x = inputs[0]; + AbstractTensorHandle* y = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + std::vector temp_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::Mul(tape_ctx.get(), {x, y}, + absl::MakeSpan(temp_outputs), + "mul0")); // Compute x*y + + outputs[0] = temp_outputs[0]; + delete tape; + return Status::OK(); +} + +Status SoftmaxModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + AbstractTensorHandle* x = inputs[0]; + AbstractTensorHandle* labels = inputs[1]; + + TapeVSpace vspace(ctx); + auto tape = new 
Tape(/*persistent=*/false); + std::vector temp_outputs(2); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR(ops::SparseSoftmaxCrossEntropyWithLogits( + tape_ctx.get(), {x, labels}, absl::MakeSpan(temp_outputs), "sm_loss")); + + outputs[0] = temp_outputs[0]; // loss values + + delete tape; + return Status::OK(); +} + // ============================= End Models ================================ -Status UpdateWeights(AbstractContext* ctx, vector& grads, - vector& weights, - AbstractTensorHandle* learning_rate) { - /* Update weights one by one using gradient update rule: - * - * w -= lr*grad[w] - * - * NOTE: assuming learning rate is positive - */ - - Status s; - int num_grads = grads.size(); - vector temp_outputs(1); - std::string update_str; - - // Negate learning rate for gradient descent - TF_RETURN_IF_ERROR(ops::Neg(ctx, {learning_rate}, - absl::MakeSpan(temp_outputs), - "neg_lr")); // Compute -lr - learning_rate = temp_outputs[0]; - - for (int i = 0; i < num_grads; i++) { - // Compute dW = -lr * grad(w[i]) - update_str = "update_mul_" + std::to_string(i); - s = ops::Mul(ctx, {learning_rate, grads[i]}, absl::MakeSpan(temp_outputs), - update_str.c_str()); - - AbstractTensorHandle* dW = temp_outputs[0]; - - // Compute temp = weights[i] + dW - update_str = "update_add_" + std::to_string(i); - s = ops::Add(ctx, {weights[i], dW}, absl::MakeSpan(temp_outputs), - update_str.c_str()); - - // Update the weights - weights[i] = temp_outputs[0]; - } - - return Status::OK(); -} - -AbstractContext* BuildFunction(const char* fn_name) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name, status.get()); - return unwrap(graph_ctx); -} - -Status CreateParamsForInputs(AbstractContext* ctx, - absl::Span inputs, - vector* params) { - tracing::TracingTensorHandle* handle = nullptr; - for (auto input : inputs) { - TF_RETURN_IF_ERROR(dyn_cast(ctx)->AddParameter( - input->DataType(), &handle)); - params->emplace_back(handle); - } - return Status::OK(); -} - -Status RunModel(Model model, AbstractContext* ctx, - absl::Span inputs, - absl::Span outputs, bool use_function, - const GradientRegistry& registry) { - if (use_function) { - const char* fn_name = "test_fn"; - std::unique_ptr scoped_func; - // Returning null tensors from a tf.function is not supported, so we keep - // track of indices in the model's outputs are nullptr in this set. - // The FunctionDef only outputs the non-null tensors. We later pad the - // function op outputs to have nullptrs at the `null_indices`. 
- absl::flat_hash_set null_indices; - { - AbstractContextPtr func_ctx(BuildFunction(fn_name)); - vector func_inputs; - func_inputs.reserve(inputs.size()); - TF_RETURN_IF_ERROR( - CreateParamsForInputs(func_ctx.get(), inputs, &func_inputs)); - vector model_outputs; - model_outputs.resize(outputs.size()); - TF_RETURN_IF_ERROR(model(func_ctx.get(), absl::MakeSpan(func_inputs), - absl::MakeSpan(model_outputs), registry)); - for (auto func_input : func_inputs) { - func_input->Unref(); - } - AbstractFunction* func = nullptr; - OutputList output_list; - output_list.expected_num_outputs = 0; - output_list.outputs.reserve(outputs.size()); - for (int i = 0; i < model_outputs.size(); i++) { - if (model_outputs[i]) { - output_list.outputs.emplace_back(model_outputs[i]); - output_list.expected_num_outputs += 1; - } else { - null_indices.insert(i); - } - } - TF_RETURN_IF_ERROR(dyn_cast(func_ctx.get()) - ->Finalize(&output_list, &func)); - scoped_func.reset(func); - for (auto output : output_list.outputs) { - output->Unref(); - } - TF_RETURN_IF_ERROR(ctx->RegisterFunction(func)); - } - - AbstractOperationPtr fn_op(ctx->CreateOperation()); - TF_RETURN_IF_ERROR(fn_op->Reset(fn_name, /*raw_device_name=*/nullptr)); - for (auto input : inputs) { - TF_RETURN_IF_ERROR(fn_op->AddInput(input)); - } - int retvals = outputs.size() - null_indices.size(); - vector fn_outputs(retvals); - TF_RETURN_IF_ERROR(fn_op->Execute( - absl::Span(fn_outputs.data(), fn_outputs.size()), - &retvals)); - int skipped_indices = 0; - for (int i = 0; i < outputs.size(); i++) { - if (!null_indices.contains(i)) { - outputs[i] = fn_outputs[i - skipped_indices]; - } else { - skipped_indices += 1; - } - } - TF_RETURN_IF_ERROR(ctx->RemoveFunction(fn_name)); - return Status::OK(); - } else { - return model(ctx, inputs, outputs, registry); - } -} - -Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetTfrt(opts, use_tfrt); - *ctx = unwrap(TF_NewEagerExecutionContext(opts, status.get())); - TF_RETURN_IF_ERROR(StatusFromTF_Status(status.get())); - TFE_DeleteContextOptions(opts); - return Status::OK(); -} +} // namespace internal +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/eager/mnist_gradients_testutil.h b/tensorflow/c/eager/mnist_gradients_testutil.h index b6de8ff6788..b173446ac9b 100644 --- a/tensorflow/c/eager/mnist_gradients_testutil.h +++ b/tensorflow/c/eager/mnist_gradients_testutil.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_MNIST_GRADIENTS_TESTUTIL_H_ +#define TENSORFLOW_C_EAGER_MNIST_GRADIENTS_TESTUTIL_H_ #include #include "absl/types/span.h" @@ -24,50 +26,13 @@ limitations under the License. 
#include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/experimental/ops/math_ops.h" #include "tensorflow/c/experimental/ops/nn_ops.h" -#include "tensorflow/c/tf_status_helper.h" -#include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/status.h" -using namespace tensorflow; -using namespace tensorflow::gradients; -using namespace tensorflow::gradients::internal; -// ========================== Tape Ops ============================== - -// Computes `inputs[0] + inputs[1]` and records it on the tape. -Status Add(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, - const GradientRegistry& registry); - -// Computes `inputs[0] * inputs[1]` for matrices and records it on the tape. -Status MatMul(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - bool transpose_a, bool transpose_b, - const GradientRegistry& registry); - -// Computes `inputs[0] * inputs[1]` and records it on the tape. -Status Mul(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - const GradientRegistry& registry); - -// Computes `Relu(inputs[0])` and records it on the tape. -Status Relu(AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - const GradientRegistry& registry); - -// Computes `SoftmaxLoss(scores, labels)` for matrices and records it on the -// tape. -Status SparseSoftmaxCrossEntropyLoss( - AbstractContext* ctx, Tape* tape, - absl::Span inputs, - absl::Span outputs, const char* name, - const GradientRegistry& registry); - -// ====================== End Tape Ops ============================ +namespace tensorflow { +namespace gradients { +namespace internal { // Computes // y = inputs[0] + inputs[1] @@ -121,26 +86,23 @@ Status ScalarMulModel(AbstractContext* ctx, absl::Span outputs, const GradientRegistry& registry); -// Updates the weights for a neural network given incoming grads and learning -// rate -Status UpdateWeights(AbstractContext* ctx, - std::vector& grads, - std::vector& weights, - AbstractTensorHandle* learning_rate); +Status MatMulModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); -AbstractContext* BuildFunction(const char* fn_name); - -Status CreateParamsForInputs(AbstractContext* ctx, - absl::Span inputs, - std::vector* params); - -using Model = std::function, - absl::Span, const GradientRegistry&)>; - -Status RunModel(Model model, AbstractContext* ctx, +Status MulModel(AbstractContext* ctx, absl::Span inputs, - absl::Span outputs, bool use_function, + absl::Span outputs, const GradientRegistry& registry); -Status BuildImmediateExecutionContext(bool use_tfrt, AbstractContext** ctx); +Status SoftmaxModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry); + +} // namespace internal +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EAGER_MNIST_GRADIENTS_TESTUTIL_H_ diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index df5504adce2..473ab503834 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", @@ -103,7 +105,6 @@ cc_library( hdrs = 
["parallel_device_testlib.h"], deps = [ ":parallel_device", - ":parallel_device_ops", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", @@ -118,7 +119,6 @@ tf_cc_test( srcs = ["parallel_device_test.cc"], deps = [ ":parallel_device", - ":parallel_device_ops", ":parallel_device_testlib", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", @@ -138,7 +138,6 @@ tf_cc_test( args = ["--heap_check=local"], deps = [ ":parallel_device", - ":parallel_device_ops", ":parallel_device_testlib", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", @@ -150,19 +149,3 @@ tf_cc_test( "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", ], ) - -# Note: ParallelDevice-specific ops are experimental and not currently linked in -# to TensorFlow by default, just used in a few tests. -filegroup( - name = "parallel_device_ops_srcs", - srcs = ["parallel_device_ops.cc"], - visibility = ["//tensorflow/python/distribute/parallel_device:__pkg__"], -) - -cc_library( - name = "parallel_device_ops", - srcs = [":parallel_device_ops_srcs"], - visibility = ["//tensorflow:internal"], - deps = ["//tensorflow/core:framework"], - alwayslink = 1, -) diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index d0e9f351478..41bde23448b 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -136,13 +136,6 @@ absl::optional> ExecuteWithSpecialOps( } result.emplace(std::move(outputs)); return result; - } else if (operation_name == std::string("DeviceID")) { - std::vector result_content; - result_content.reserve(1); - result_content.push_back(parallel_device.DeviceIDs(context, status)); - if (TF_GetCode(status) != TF_OK) return result; - result.emplace(std::move(result_content)); - return result; } std::vector parallel_inputs; std::vector> implicitly_broadcast_tensors; @@ -255,28 +248,44 @@ TFE_TensorHandle* CopyTensorFromParallelDevice(TFE_Context* context, // Since this function is used to satisfy the TFE_CustomDevice C API, // device_info is passed in using a C-style generic. It must always be a // ParallelDevice. -void ParallelDeviceExecute(TFE_Context* context, int num_inputs, - TFE_TensorHandle** inputs, - const char* operation_name, - const TFE_OpAttrs* attributes, int* num_outputs, +void ParallelDeviceExecute(const TFE_Op* original_op, int* num_outputs, TFE_TensorHandle** outputs, TF_Status* status, void* device_info) { + const char* requested_placement = TFE_OpGetDevice(original_op, status); + if (*requested_placement == '\0') { + TF_SetStatus( + status, TF_INTERNAL, + "Ops must be placed on the parallel device explicitly, or their inputs " + "first un-packed. 
Got an un-placed op with an input placed on the " + "parallel device."); + return; + } + TFE_Context* context = TFE_OpGetContext(original_op, status); + if (TF_GetCode(status) != TF_OK) return; + const char* operation_name = TFE_OpGetName(original_op, status); + if (TF_GetCode(status) != TF_OK) return; + const TFE_OpAttrs* attributes = TFE_OpGetAttrs(original_op); + NamedParallelDevice* named_device = reinterpret_cast(device_info); std::vector typed_inputs; + int num_inputs = TFE_OpGetFlatInputCount(original_op, status); + if (TF_GetCode(status) != TF_OK) return; typed_inputs.reserve(num_inputs); for (int i = 0; i < num_inputs; ++i) { + TFE_TensorHandle* input = TFE_OpGetFlatInput(original_op, i, status); + if (TF_GetCode(status) != TF_OK) return; const char* tensor_handle_device = - TFE_TensorHandleDeviceName(inputs[i], status); + TFE_TensorHandleDeviceName(input, status); if (TF_GetCode(status) != TF_OK) return; if (named_device->name() == tensor_handle_device) { // We assume that any tensors already placed on this device are // ParallelTensors. typed_inputs.emplace_back(reinterpret_cast( - TFE_TensorHandleDevicePointer(inputs[i], status))); + TFE_TensorHandleDevicePointer(input, status))); if (TF_GetCode(status) != TF_OK) return; } else { - typed_inputs.emplace_back(inputs[i]); + typed_inputs.emplace_back(input); } } diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index e270bfcbb80..095f33ff303 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -58,7 +58,7 @@ using ExecutorPtr = std::unique_ptr; class DeviceThread { public: // Starts a background thread waiting for `StartExecute`. - explicit DeviceThread(const std::string& device) + explicit DeviceThread(const std::string& device, const bool is_async) : status_(TF_NewStatus()), device_(device), // If the context's default exector is set to async, re-using that in @@ -67,7 +67,7 @@ class DeviceThread { // // TODO(allenl): We should have an async API that works with the // parallel device. - executor_(TFE_NewExecutor(/*is_async=*/false)), + executor_(TFE_NewExecutor(is_async)), op_(nullptr), thread_(tensorflow::Env::Default()->StartThread( tensorflow::ThreadOptions(), "parallel_device_execute", @@ -236,12 +236,13 @@ void DeviceThread::Execute(TFE_Context* context, const char* operation_name, } } -ParallelDevice::ParallelDevice(const std::vector& devices) +ParallelDevice::ParallelDevice(const std::vector& devices, + const bool is_async) : underlying_devices_(devices) { device_threads_.reserve(devices.size()); for (int device_index = 0; device_index < devices.size(); ++device_index) { device_threads_.emplace_back( - new DeviceThread(devices[device_index].c_str())); + new DeviceThread(devices[device_index].c_str(), is_async)); } } diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.h b/tensorflow/c/eager/parallel_device/parallel_device_lib.h index b3dc47ab088..1bb9ce0f663 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.h +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.h @@ -49,7 +49,10 @@ class DeviceThread; // placed on each underlying device. class ParallelDevice { public: - explicit ParallelDevice(const std::vector& devices); + // Eager async execution is only supported when remote eager is not in use + // (b/157523095). 
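// Threading note for the change above: each DeviceThread owns its own
// TFE_Executor, so the new is_async argument controls whether per-device ops
// are dispatched asynchronously. The default stays false because, per the
// comment above, async eager execution is not currently usable together with
// remote eager (b/157523095).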
+ explicit ParallelDevice(const std::vector& devices, + const bool is_async = false); ~ParallelDevice(); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc index 828dcbae093..67bc596b180 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc @@ -279,30 +279,4 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); ASSERT_EQ(underlying_devices[1], second_device); } - // Compute the device ID twice and verify the result - for (int i = 0; i < 2; ++i) { - std::unique_ptr op( - TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - TFE_OpSetDevice(op.get(), device_name, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - std::array components; - ExtractPerDeviceValues(context, result_handle, &components, status.get()); - TFE_DeleteTensorHandle(result_handle); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - ExpectScalarEq(components[0].get(), 0); - ExpectScalarEq(components[1].get(), 1); - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } } diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index fcebe973500..efab4dfbeb2 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -98,6 +99,10 @@ class VSpace { gtl::ArraySlice output_gradients, std::vector* result) const = 0; + // Builds a tensor filled with ones with the same shape and dtype as `t`. + virtual Status BuildOnesLike(const TapeTensor& t, + Gradient** result) const = 0; + // Looks up the ID of a Gradient. virtual int64 TensorId(Gradient* tensor) const = 0; @@ -121,7 +126,7 @@ class GradientTape { // functions (and hence the tensors they keep alive). Instead, everything // is deleted in ~GradientTape. Persistent GradientTapes are useful when // users want to compute multiple gradients over the same tape. 
- GradientTape(bool persistent) : persistent_(persistent) {} + explicit GradientTape(bool persistent) : persistent_(persistent) {} ~GradientTape() { for (const auto& pair : op_tape_) { pair.second.backward_function_deleter(pair.second.backward_function); @@ -595,8 +600,10 @@ Status InitialGradients( for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) { if (op_it->second.output_tensor_info[j].GetID() == id) { found = true; - (*result)[id].push_back( - op_it->second.output_tensor_info[j].OnesLike()); + Gradient* ones_like = nullptr; + TF_RETURN_IF_ERROR(vspace.BuildOnesLike( + op_it->second.output_tensor_info[j], &ones_like)); + (*result)[id].push_back(ones_like); break; } } @@ -611,7 +618,10 @@ Status InitialGradients( // target is also a source. auto source_tensor = sources_that_are_targets.find(id); if (source_tensor != sources_that_are_targets.end()) { - (*result)[id].push_back(source_tensor->second.OnesLike()); + Gradient* ones_like = nullptr; + TF_RETURN_IF_ERROR( + vspace.BuildOnesLike(source_tensor->second, &ones_like)); + (*result)[id].push_back(ones_like); } } } else { @@ -934,7 +944,7 @@ ForwardAccumulator::ForwardpropFromTape( // TODO(allenl): Figure out why using zeros_like everywhere causes issues // for some gradient functions and if there's another way to work around // it (e.g. conds instead of ifs). The value shouldn't really matter. - aid = output_tensor.OnesLike(); + TF_RETURN_IF_ERROR(vspace_.BuildOnesLike(output_tensor, &aid)); } if (TF_PREDICT_FALSE(aid == nullptr)) { return tensorflow::errors::Internal( diff --git a/tensorflow/c/eager/tracing_utils.cc b/tensorflow/c/eager/tracing_utils.cc new file mode 100644 index 00000000000..8eec4bc7d9a --- /dev/null +++ b/tensorflow/c/eager/tracing_utils.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/eager/tracing_utils.h" + +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/experimental/gradients/tape/tape_operation.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { +namespace tracing { + +Status MaybeSetOpName(AbstractOperation* op, const char* op_name) { + if (isa(op)) { + TF_RETURN_IF_ERROR(dyn_cast(op)->SetOpName(op_name)); + } + if (isa(op)) { + TF_RETURN_IF_ERROR(MaybeSetOpName( + dyn_cast(op)->GetBackingOperation(), + op_name)); + } + return Status::OK(); +} +} // namespace tracing +} // namespace tensorflow diff --git a/tensorflow/c/eager/tracing_utils.h b/tensorflow/c/eager/tracing_utils.h new file mode 100644 index 00000000000..e2c8f9b28ec --- /dev/null +++ b/tensorflow/c/eager/tracing_utils.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_UTILS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_UTILS_H_ + +#include "tensorflow/c/eager/abstract_operation.h" + +namespace tensorflow { +namespace tracing { +Status MaybeSetOpName(AbstractOperation*, const char* op_name); +} // namespace tracing +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_UTILS_H_ diff --git a/tensorflow/c/experimental/filesystem/BUILD b/tensorflow/c/experimental/filesystem/BUILD index 061fdbd893b..c05c7dc3f7e 100644 --- a/tensorflow/c/experimental/filesystem/BUILD +++ b/tensorflow/c/experimental/filesystem/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental filesystem C APIs for TensorFlow. # Will be moved in proper place once all filesystems are converted to the # modular framework. diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index 68875d61e47..0fc9f260b21 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental gcs filesystem plugin. load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test") @@ -29,6 +31,7 @@ cc_library( ":gcs_helper", ":ram_file_block_cache", "//tensorflow/c:env", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "@com_github_googlecloudplatform_google_cloud_cpp//:storage_client", @@ -59,6 +62,7 @@ cc_library( deps = [ ":cleanup", "//tensorflow/c:env", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc index e01af918100..8cd8ad7ca81 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -23,6 +23,7 @@ limitations under the License. #include "google/cloud/storage/client.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_helper.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" // Implementation of a filesystem for GCS environments. @@ -120,20 +121,20 @@ static int64_t LoadBufferFromGCS(const std::string& path, size_t offset, return -1; } int64_t read; - if (!absl::SimpleAtoi(stream.headers().find("content-length")->second, - &read)) { + auto content_length = stream.headers().find("content-length"); + if (content_length == stream.headers().end()) { // When we read a file with offset that is bigger than the actual file size. // GCS will return an empty header (e.g no `content-length` header). In this // case, we will set read to `0` and continue. 
- if (TF_GetCode(status) == TF_OUT_OF_RANGE) { - read = 0; - } else { - TF_SetStatus(status, TF_UNKNOWN, "Could not get content-length header"); - return -1; - } + read = 0; + } else if (!absl::SimpleAtoi(content_length->second, &read)) { + TF_SetStatus(status, TF_UNKNOWN, "Could not get content-length header"); + return -1; } // `TF_OUT_OF_RANGE` isn't considered as an error. So we clear it here. TF_SetStatus(status, TF_OK, ""); + TF_VLog(1, "Successful read of %s @ %u of size: %u", path.c_str(), offset, + read); stream.read(buffer, read); read = stream.gcount(); if (read < buffer_size) { @@ -146,6 +147,8 @@ static int64_t LoadBufferFromGCS(const std::string& path, size_t offset, path, " @ ", offset) .c_str()); } + TF_VLog(2, "Successful integrity check for: %s @ %u", path.c_str(), + offset); } } return read; @@ -259,7 +262,8 @@ static void SyncImpl(const std::string& bucket, const std::string& object, if (*offset == -1 || *offset == 0) { // UploadFile will automatically switch to resumable upload based on Client // configuration. - auto metadata = gcs_client->UploadFile(outfile->getName(), bucket, object); + auto metadata = gcs_client->UploadFile(outfile->getName(), bucket, object, + gcs::Fields("size")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; @@ -278,15 +282,18 @@ static void SyncImpl(const std::string& bucket, const std::string& object, } else { std::string temporary_object = gcs::CreateRandomPrefixName("tf_writable_file_gcs"); - auto metadata = - gcs_client->UploadFile(outfile->getName(), bucket, temporary_object); + auto metadata = gcs_client->UploadFile(outfile->getName(), bucket, + temporary_object, gcs::Fields("")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; } + TF_VLog(3, "AppendObject: gs://%s/%s to gs://%s/%s", bucket.c_str(), + temporary_object.c_str(), bucket.c_str(), object.c_str()); const std::vector source_objects = { {object, {}, {}}, {temporary_object, {}, {}}}; - metadata = gcs_client->ComposeObject(bucket, source_objects, object); + metadata = gcs_client->ComposeObject(bucket, source_objects, object, + gcs::Fields("size")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; @@ -321,6 +328,8 @@ void Append(const TF_WritableFile* file, const char* buffer, size_t n, "The internal temporary file is not writable."); return; } + TF_VLog(3, "Append: gs://%s/%s size %u", gcs_file->bucket.c_str(), + gcs_file->object.c_str(), n); gcs_file->sync_need = true; gcs_file->outfile.write(buffer, n); if (!gcs_file->outfile) @@ -346,6 +355,8 @@ int64_t Tell(const TF_WritableFile* file, TF_Status* status) { void Flush(const TF_WritableFile* file, TF_Status* status) { auto gcs_file = static_cast(file->plugin_file); if (gcs_file->sync_need) { + TF_VLog(3, "Flush started: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); if (!gcs_file->outfile) { TF_SetStatus(status, TF_INTERNAL, "Could not append to the internal temporary file."); @@ -353,6 +364,8 @@ void Flush(const TF_WritableFile* file, TF_Status* status) { } SyncImpl(gcs_file->bucket, gcs_file->object, &gcs_file->offset, &gcs_file->outfile, gcs_file->gcs_client, status); + TF_VLog(3, "Flush finished: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); if (TF_GetCode(status) != TF_OK) return; gcs_file->sync_need = false; } else { @@ -361,11 +374,16 @@ void Flush(const TF_WritableFile* file, TF_Status* status) { } void Sync(const TF_WritableFile* file, TF_Status* status) { + auto gcs_file = 
static_cast(file->plugin_file); + TF_VLog(3, "Sync: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); Flush(file, status); } void Close(const TF_WritableFile* file, TF_Status* status) { auto gcs_file = static_cast(file->plugin_file); + TF_VLog(3, "Close: gs://%s/%s", gcs_file->bucket.c_str(), + gcs_file->object.c_str()); if (gcs_file->sync_need) { Flush(file, status); } @@ -428,6 +446,8 @@ GCSFile::GCSFile(google::cloud::storage::Client&& gcs_client) if (absl::SimpleAtoi(std::getenv(kMaxStaleness), &value)) { max_staleness = value; } + TF_VLog(1, "GCS cache max size = %u ; block size = %u ; max staleness = %u", + max_bytes, block_size, max_staleness); file_block_cache = std::make_unique( block_size, max_bytes, max_staleness, @@ -504,13 +524,18 @@ void Cleanup(TF_Filesystem* filesystem) { static void UncachedStatForObject(const std::string& bucket, const std::string& object, GcsFileStat* stat, gcs::Client* gcs_client, TF_Status* status) { - auto metadata = gcs_client->GetObjectMetadata(bucket, object); + auto metadata = gcs_client->GetObjectMetadata( + bucket, object, gcs::Fields("generation,size,timeStorageClassUpdated")); if (!metadata) return TF_SetStatusFromGCSStatus(metadata.status(), status); stat->generation_number = metadata->generation(); stat->base.length = metadata->size(); stat->base.mtime_nsec = metadata->time_storage_class_updated().time_since_epoch().count(); stat->base.is_directory = object.back() == '/'; + TF_VLog(1, + "Stat of: gs://%s/%s -- length: %u generation: %u; mtime_nsec: %u;", + bucket.c_str(), object.c_str(), stat->base.length, + stat->generation_number, stat->base.mtime_nsec); return TF_SetStatus(status, TF_OK, ""); } @@ -545,9 +570,10 @@ void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, if (TF_GetCode(status) != TF_OK) return -1; if (!gcs_file->file_block_cache->ValidateAndUpdateFileSignature( path, stat.generation_number)) { - std::cout - << "File signature has been changed. Refreshing the cache. Path: " - << path; + TF_VLog( + 1, + "File signature has been changed. Refreshing the cache. Path: %s", + path.c_str()); } read = gcs_file->file_block_cache->Read(path, offset, n, buffer, status); } else { @@ -579,6 +605,7 @@ void NewWritableFile(const TF_Filesystem* filesystem, const char* path, (gcs_file->compose ? 0 : -1)}); // We are responsible for freeing the pointer returned by TF_GetTempFileName free(temp_file_name); + TF_VLog(3, "GcsWritableFile: %s", path); TF_SetStatus(status, TF_OK, ""); } @@ -608,7 +635,8 @@ void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, } else { // If compose is true, we do not download anything. // Instead we only check if this file exists on server or not. 
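Aside: the "GCS cache max size / block size / max staleness" values logged above are read from environment variables with std::getenv and parsed with absl::SimpleAtoi. A minimal sketch of that parsing pattern; the variable name "EXAMPLE_CACHE_BLOCK_SIZE_MB" is a placeholder, the real constants live in the plugin.

#include <cstdint>
#include <cstdlib>

#include "absl/strings/numbers.h"

// Returns `default_value` unless the environment variable is set to a
// parseable unsigned integer.
static uint64_t ReadCacheSettingFromEnv(const char* env_var,
                                        uint64_t default_value) {
  const char* raw = std::getenv(env_var);
  if (raw == nullptr) return default_value;
  uint64_t parsed;
  if (!absl::SimpleAtoi(raw, &parsed)) return default_value;
  return parsed;
}

// Usage:
//   uint64_t block_size_mb =
//       ReadCacheSettingFromEnv("EXAMPLE_CACHE_BLOCK_SIZE_MB", 64);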
- auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object, + gcs::Fields("size")); TF_SetStatusFromGCSStatus(metadata.status(), status); if (TF_GetCode(status) == TF_OK) { file->plugin_file = new tf_writable_file::GCSFile( @@ -624,7 +652,8 @@ void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, return; } } - + TF_VLog(3, "GcsWritableFile: %s with existing file %s", path, + temp_file_name.c_str()); TF_SetStatus(status, TF_OK, ""); } @@ -639,7 +668,8 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, if (TF_GetCode(status) != TF_OK) return; auto gcs_file = static_cast(filesystem->plugin_filesystem); - auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object, + gcs::Fields("size")); if (!metadata) { TF_SetStatusFromGCSStatus(metadata.status(), status); return; @@ -670,7 +700,8 @@ static void StatForObject(GCSFile* gcs_file, const std::string& path, if (object.empty()) return TF_SetStatus( status, TF_INVALID_ARGUMENT, - ("'object' must be a non-empty string. (File: " + path + ")").c_str()); + absl::StrCat("'object' must be a non-empty string. (File: ", path, ")") + .c_str()); TF_SetStatus(status, TF_OK, ""); gcs_file->stat_cache->LookupOrCompute( path, stat, @@ -698,7 +729,8 @@ static bool ObjectExists(GCSFile* gcs_file, const std::string& path, static bool BucketExists(GCSFile* gcs_file, const std::string& bucket, TF_Status* status) { - auto metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); + auto metadata = + gcs_file->gcs_client.GetBucketMetadata(bucket, gcs::Fields("")); TF_SetStatusFromGCSStatus(metadata.status(), status); if (TF_GetCode(status) != TF_OK && TF_GetCode(status) != TF_NOT_FOUND) return false; @@ -721,7 +753,8 @@ static std::vector GetChildrenBounded( std::string delimiter = recursive ? "" : "/"; for (auto&& item : gcs_file->gcs_client.ListObjectsAndPrefixes( - bucket, gcs::Prefix(prefix), gcs::Delimiter(delimiter))) { + bucket, gcs::Prefix(prefix), gcs::Delimiter(delimiter), + gcs::Fields("items(name),prefixes"))) { if (count == max_results) { TF_SetStatus(status, TF_OK, ""); return result; @@ -737,8 +770,8 @@ static std::vector GetChildrenBounded( auto pos = children.find(prefix); if (pos != 0) { TF_SetStatus(status, TF_INTERNAL, - ("Unexpected response: the returned file name " + children + - " doesn't match the prefix " + prefix) + absl::StrCat("Unexpected response: the returned file name ", + children, " doesn't match the prefix ", prefix) .c_str()); return result; } @@ -812,6 +845,10 @@ void CreateDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { std::string dir = path; MaybeAppendSlash(&dir); + TF_VLog(3, + "CreateDir: creating directory with path: %s and " + "path_with_slash: %s", + path, dir.c_str()); std::string bucket, object; ParseGCSPath(dir, true, &bucket, &object, status); if (TF_GetCode(status) != TF_OK) return; @@ -821,19 +858,23 @@ void CreateDir(const TF_Filesystem* filesystem, const char* path, if (TF_GetCode(status) != TF_OK) return; if (!is_directory) TF_SetStatus(status, TF_NOT_FOUND, - ("The specified bucket " + dir + " was not found.").c_str()); + absl::StrCat("The specified bucket ", dir, " was not found.") + .c_str()); return; } PathExists(filesystem, dir.c_str(), status); - if (TF_GetCode(status) == TF_OK) + if (TF_GetCode(status) == TF_OK) { + // Use the original name for a correct error here. 
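Aside: most client calls in this file now pass gcs::Fields(...) so the server returns only the metadata fields the plugin actually reads (a partial-response request). A minimal sketch of the same pattern against the google-cloud-cpp storage client; the bucket and object names are placeholders.

#include <iostream>

#include "google/cloud/storage/client.h"

namespace gcs = google::cloud::storage;

// Fetch only the object size instead of the full metadata document.
void PrintObjectSize(gcs::Client& client) {
  auto metadata = client.GetObjectMetadata("example-bucket", "example-object",
                                           gcs::Fields("size"));
  if (!metadata) {
    // metadata.status() carries the error reported by GCS.
    std::cerr << metadata.status().message() << "\n";
    return;
  }
  std::cout << "size: " << metadata->size() << "\n";
}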
+ TF_VLog(3, "CreateDir: directory already exists, not uploading %s", path); return TF_SetStatus(status, TF_ALREADY_EXISTS, path); + } auto metadata = gcs_file->gcs_client.InsertObject( bucket, object, "", // Adding this parameter means HTTP_CODE_PRECONDITION_FAILED // will be returned if the object already exists, so avoid reuploading. - gcs::IfGenerationMatch(0)); + gcs::IfGenerationMatch(0), gcs::Fields("")); TF_SetStatusFromGCSStatus(metadata.status(), status); if (TF_GetCode(status) == TF_FAILED_PRECONDITION) TF_SetStatus(status, TF_ALREADY_EXISTS, path); @@ -891,7 +932,8 @@ void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, auto gcs_file = static_cast(filesystem->plugin_filesystem); auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( - bucket_src, object_src, bucket_dst, object_dst); + bucket_src, object_src, bucket_dst, object_dst, + gcs::Fields("done,rewriteToken")); TF_SetStatusFromGCSStatus(metadata.status(), status); } @@ -908,7 +950,8 @@ bool IsDirectory(const TF_Filesystem* filesystem, const char* path, if (!result) TF_SetStatus( status, TF_NOT_FOUND, - ("The specified bucket gs://" + bucket + " was not found.").c_str()); + absl::StrCat("The specified bucket gs://", bucket, " was not found.") + .c_str()); return result; } @@ -933,6 +976,7 @@ bool IsDirectory(const TF_Filesystem* filesystem, const char* path, static void RenameObject(const TF_Filesystem* filesystem, const std::string& src, const std::string& dst, TF_Status* status) { + TF_VLog(3, "RenameObject: started %s to %s", src.c_str(), dst.c_str()); std::string bucket_src, object_src; ParseGCSPath(src, false, &bucket_src, &object_src, status); if (TF_GetCode(status) != TF_OK) return; @@ -943,9 +987,11 @@ static void RenameObject(const TF_Filesystem* filesystem, auto gcs_file = static_cast(filesystem->plugin_filesystem); auto metadata = gcs_file->gcs_client.RewriteObjectBlocking( - bucket_src, object_src, bucket_dst, object_dst); + bucket_src, object_src, bucket_dst, object_dst, + gcs::Fields("done,rewriteToken")); TF_SetStatusFromGCSStatus(metadata.status(), status); if (TF_GetCode(status) != TF_OK) return; + TF_VLog(3, "RenameObject: finished %s to %s", src.c_str(), dst.c_str()); ClearFileCaches(gcs_file, dst); DeleteFile(filesystem, src.c_str(), status); @@ -954,8 +1000,10 @@ static void RenameObject(const TF_Filesystem* filesystem, void RenameFile(const TF_Filesystem* filesystem, const char* src, const char* dst, TF_Status* status) { if (!IsDirectory(filesystem, src, status)) { - if (TF_GetCode(status) == TF_FAILED_PRECONDITION) + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) { + TF_SetStatus(status, TF_OK, ""); RenameObject(filesystem, src, dst, status); + } return; } @@ -1032,7 +1080,8 @@ void Stat(const TF_Filesystem* filesystem, const char* path, auto gcs_file = static_cast(filesystem->plugin_filesystem); if (object.empty()) { - auto bucket_metadata = gcs_file->gcs_client.GetBucketMetadata(bucket); + auto bucket_metadata = + gcs_file->gcs_client.GetBucketMetadata(bucket, gcs::Fields("")); TF_SetStatusFromGCSStatus(bucket_metadata.status(), status); if (TF_GetCode(status) == TF_OK) { stats->is_directory = true; @@ -1047,8 +1096,9 @@ void Stat(const TF_Filesystem* filesystem, const char* path, stats->mtime_nsec = 0; return TF_SetStatus(status, TF_OK, ""); } - if (TF_GetCode(status) == TF_OK) { - auto metadata = gcs_file->gcs_client.GetObjectMetadata(bucket, object); + if (TF_GetCode(status) == TF_FAILED_PRECONDITION) { + auto metadata = 
gcs_file->gcs_client.GetObjectMetadata( + bucket, object, gcs::Fields("size,timeStorageClassUpdated")); if (metadata) { stats->is_directory = false; stats->length = metadata.value().size(); @@ -1061,6 +1111,18 @@ void Stat(const TF_Filesystem* filesystem, const char* path, } } +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status) { + // Only validate the name. + std::string bucket, object; + ParseGCSPath(path, false, &bucket, &object, status); + if (TF_GetCode(status) != TF_OK) return -1; + + TF_FileStatistics stat; + Stat(filesystem, path, &stat, status); + return stat.length; +} + static char* TranslateName(const TF_Filesystem* filesystem, const char* uri) { return strdup(uri); } diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h index 973ce9e9dc2..5612d004d82 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h @@ -87,6 +87,24 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, const char* path, TF_ReadOnlyMemoryRegion* region, TF_Status* status); +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status); +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CopyFile(const TF_Filesystem* filesystem, const char* src, const char* dst, + TF_Status* status); +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); } // namespace tf_gcs_filesystem #endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_GCS_GCS_FILESYSTEM_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc index 82c4e4b8705..e15921335ab 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem_test.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) +#define EXPECT_TF_OK(x) EXPECT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) static const char* content = "abcdefghijklmnopqrstuvwxyz1234567890"; // We will work with content_view instead of content. 
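Aside: the new tf_gcs_filesystem::GetFileSize above only validates the path and then defers to Stat, so a caller just needs the usual TF_Status round trip. A minimal usage sketch, assuming `filesystem` was already initialized with tf_gcs_filesystem::Init; the gs:// path is a placeholder.

#include <cstdint>

#include "tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.h"
#include "tensorflow/c/tf_status.h"

// Returns the object length reported by Stat, or -1 on error.
int64_t ExampleGetSize(TF_Filesystem* filesystem) {
  TF_Status* status = TF_NewStatus();
  int64_t size = tf_gcs_filesystem::GetFileSize(
      filesystem, "gs://example-bucket/example-object", status);
  if (TF_GetCode(status) != TF_OK) size = -1;
  TF_DeleteStatus(status);
  return size;
}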
@@ -94,6 +95,70 @@ class GCSFilesystemTest : public ::testing::Test { return translated_name; } + std::unique_ptr + GetWriter() { + std::unique_ptr writer( + new TF_WritableFile, [](TF_WritableFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) tf_writable_file::Cleanup(file); + delete file; + } + }); + writer->plugin_file = nullptr; + return writer; + } + + std::unique_ptr + GetReader() { + std::unique_ptr + reader(new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + reader->plugin_file = nullptr; + return reader; + } + + void WriteString(const std::string& path, const std::string& content) { + auto writer = GetWriter(); + tf_gcs_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Append(writer.get(), content.c_str(), content.length(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Close(writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + } + + std::string ReadAll(const std::string& path) { + auto reader = GetReader(); + tf_gcs_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + auto file_size = + tf_gcs_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + std::string content; + content.resize(file_size); + auto read = tf_random_access_file::Read(reader.get(), 0, file_size, + &content[0], status_); + if (TF_GetCode(status_) != TF_OK) return ""; + if (read >= 0) content.resize(read); + if (file_size != content.size()) + TF_SetStatus( + status_, TF_DATA_LOSS, + std::string("expected " + std::to_string(file_size) + " got " + + std::to_string(content.size()) + " bytes") + .c_str()); + return content; + } + protected: TF_Filesystem* filesystem_; TF_Status* status_; @@ -326,6 +391,145 @@ TEST_F(GCSFilesystemTest, ReadOnlyMemoryRegion) { delete region; } +TEST_F(GCSFilesystemTest, PathExists) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string path = GetURIForPath("PathExists"); + tf_gcs_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_EQ(TF_NOT_FOUND, TF_GetCode(status_)) << TF_Message(status_); + TF_SetStatus(status_, TF_OK, ""); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_gcs_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(GCSFilesystemTest, GetChildren) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string base = GetURIForPath("GetChildren"); + tf_gcs_filesystem::CreateDir(filesystem_, base.c_str(), status_); + EXPECT_TF_OK(status_); + + const std::string file = io::JoinPath(base, "TestFile.csv"); + WriteString(file, "test"); + EXPECT_TF_OK(status_); + + const std::string subdir = io::JoinPath(base, "SubDir"); + tf_gcs_filesystem::CreateDir(filesystem_, subdir.c_str(), status_); + EXPECT_TF_OK(status_); + const std::string subfile = io::JoinPath(subdir, "TestSubFile.csv"); + WriteString(subfile, "test"); + EXPECT_TF_OK(status_); + + char** entries; + auto num_entries = tf_gcs_filesystem::GetChildren(filesystem_, base.c_str(), + &entries, status_); + EXPECT_TF_OK(status_); + + std::vector childrens; + for (int i = 0; i < num_entries; ++i) { + childrens.push_back(entries[i]); + } + 
std::sort(childrens.begin(), childrens.end()); + EXPECT_EQ(std::vector({"SubDir/", "TestFile.csv"}), childrens); +} + +TEST_F(GCSFilesystemTest, DeleteFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string path = GetURIForPath("DeleteFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_gcs_filesystem::DeleteFile(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + tf_gcs_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_EQ(TF_GetCode(status_), TF_NOT_FOUND); +} + +TEST_F(GCSFilesystemTest, CreateDir) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string dir = GetURIForPath("CreateDir"); + tf_gcs_filesystem::CreateDir(filesystem_, dir.c_str(), status_); + EXPECT_TF_OK(status_); + + TF_FileStatistics stat; + tf_gcs_filesystem::Stat(filesystem_, dir.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_TRUE(stat.is_directory); +} + +TEST_F(GCSFilesystemTest, DeleteDir) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string dir = GetURIForPath("DeleteDir"); + const std::string file = io::JoinPath(dir, "DeleteDirFile.csv"); + WriteString(file, "test"); + ASSERT_TF_OK(status_); + tf_gcs_filesystem::DeleteDir(filesystem_, dir.c_str(), status_); + EXPECT_EQ(TF_GetCode(status_), TF_FAILED_PRECONDITION); + + TF_SetStatus(status_, TF_OK, ""); + tf_gcs_filesystem::DeleteFile(filesystem_, file.c_str(), status_); + EXPECT_TF_OK(status_); + tf_gcs_filesystem::DeleteDir(filesystem_, dir.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_gcs_filesystem::Stat(filesystem_, dir.c_str(), &stat, status_); + EXPECT_EQ(TF_GetCode(status_), TF_NOT_FOUND) << TF_Message(status_); +} + +TEST_F(GCSFilesystemTest, StatFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string path = GetURIForPath("StatFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + + TF_FileStatistics stat; + tf_gcs_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, stat.length); + EXPECT_FALSE(stat.is_directory); +} + +TEST_F(GCSFilesystemTest, RenameFile) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string src = GetURIForPath("RenameFileSrc"); + const std::string dst = GetURIForPath("RenameFileDst"); + WriteString(src, "test"); + ASSERT_TF_OK(status_); + + tf_gcs_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test", result); +} + +TEST_F(GCSFilesystemTest, RenameFileOverwrite) { + tf_gcs_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_); + const std::string src = GetURIForPath("RenameFileOverwriteSrc"); + const std::string dst = GetURIForPath("RenameFileOverwriteDst"); + + WriteString(src, "test_old"); + ASSERT_TF_OK(status_); + WriteString(dst, "test_new"); + ASSERT_TF_OK(status_); + + tf_gcs_filesystem::PathExists(filesystem_, dst.c_str(), status_); + EXPECT_TF_OK(status_); + tf_gcs_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), status_); + EXPECT_TF_OK(status_); + + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test_old", result); +} + // These tests below are ported from // `//tensorflow/core/platform/cloud:gcs_file_system_test` TEST_F(GCSFilesystemTest, NewRandomAccessFile_NoBlockCache) { diff --git 
a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h index 2abfb6f924b..72659a97d42 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/ram_file_block_cache.h @@ -28,6 +28,7 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" #include "tensorflow/c/env.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" namespace tf_gcs_filesystem { @@ -65,8 +66,8 @@ class RamFileBlockCache { pruning_thread_.reset( TF_StartThread(&thread_options, "TF_prune_FBC", PruneThread, this)); } - std::cout << "GCS file block cache is " - << (IsCacheEnabled() ? "enabled" : "disabled") << ".\n"; + TF_VLog(1, "GCS file block cache is %s.\n", + (IsCacheEnabled() ? "enabled" : "disabled")); } ~RamFileBlockCache() { diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD b/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD index 51ffd709f3d..765c4e5f06e 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/BUILD @@ -1,5 +1,7 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental hadoop filesystem plugin. -load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object") +load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object", "tf_cc_test") package( licenses = ["notice"], # Apache 2.0 @@ -20,12 +22,14 @@ cc_library( name = "hadoop_filesystem_impl", srcs = ["hadoop_filesystem.cc"], hdrs = ["hadoop_filesystem.h"], + compatible_with = [], copts = select({ "//conditions:default": [], "//tensorflow:windows": get_win_copts(), }), deps = [ "//tensorflow/c:env", + "//tensorflow/c:logging", "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", "//third_party/hadoop:hdfs", @@ -33,3 +37,38 @@ cc_library( "@com_google_absl//absl/synchronization", ], ) + +# This test is set to manual because it requires downloading the Hadoop +# distribution to run. To run this test: +# 1. Ensure $JAVA_HOME is set to the location of a JDK 8 installation. +# 2. Download the binary Hadoop distribution from: +# http://hadoop.apache.org/releases.html +# 3. Extract the Hadoop distribution and run: +# source libexec/hadoop-config.sh +# 4. Optionally set up HDFS cluster configurations (optionally Kerberos) within +# $HADOOP_HDFS_HOME/etc/hadoop if you want to test against real +# distributed HDFS cluster +# 5. 
bazel test \ +# --test_env=LD_LIBRARY_PATH=$JAVA_HOME/jre/lib/amd64/server \ +# --test_env=HADOOP_HDFS_HOME=$HADOOP_HDFS_HOME \ +# --test_env=CLASSPATH=$($HADOOP_HDFS_HOME/bin/hadoop classpath --glob) \ +# :hadoop_file_system_test +# To test against the real distributed cluster, add the following option for +# bazel test: +# --test_env=HADOOP_TEST_TMPDIR=hdfs://cluster/test/tmp/dir +tf_cc_test( + name = "hadoop_filesystem_test", + srcs = [ + "hadoop_filesystem_test.cc", + ], + tags = [ + "manual", + "notap", + ], + deps = [ + ":hadoop_filesystem_impl", + "//tensorflow/core/platform:path", + "//tensorflow/core/platform:stacktrace_handler", + "//tensorflow/core/platform:test", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc index e53e3d0bcc5..5ff28e4229a 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.cc @@ -22,11 +22,10 @@ limitations under the License. #include #include -#include "absl/synchronization/mutex.h" #include "tensorflow/c/env.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/logging.h" #include "tensorflow/c/tf_status.h" -#include "third_party/hadoop/hdfs.h" // Implementation of a filesystem for HADOOP environments. // This filesystem will support `hdfs://`, `viewfs://` and `har://` URI schemes. @@ -37,11 +36,17 @@ static void plugin_memory_free(void* ptr) { free(ptr); } void ParseHadoopPath(const std::string& fname, std::string* scheme, std::string* namenode, std::string* path) { size_t scheme_end = fname.find("://") + 2; - *scheme = fname.substr(0, scheme_end + 1); + // We don't want `://` in scheme. + *scheme = fname.substr(0, scheme_end - 2); size_t nn_end = fname.find("/", scheme_end + 1); - if (nn_end == std::string::npos) return; + if (nn_end == std::string::npos) { + *namenode = fname.substr(scheme_end + 1); + *path = ""; + return; + } *namenode = fname.substr(scheme_end + 1, nn_end - scheme_end - 1); - *path = fname.substr(nn_end + 1); + // We keep `/` in path. + *path = fname.substr(nn_end); } void SplitArchiveNameAndPath(std::string* path, std::string* nn, @@ -54,7 +59,7 @@ void SplitArchiveNameAndPath(std::string* path, std::string* nn, } // Case of hadoop archive. Namenode is the path to the archive. std::ostringstream namenodestream; - namenodestream << "har://" << nn + namenodestream << "har://" << *nn << path->substr(0, index_end_archive_name + 4); *nn = namenodestream.str(); path->erase(0, index_end_archive_name + 4); @@ -143,15 +148,20 @@ class LibHDFS { char* hdfs_home = getenv("HADOOP_HDFS_HOME"); if (hdfs_home != nullptr) { auto JoinPath = [](std::string home, std::string lib) { +#if defined(_WIN32) + if (home.back() != '\\') home.push_back('\\'); + return home + "lib\\native\\" + lib; +#else if (home.back() != '/') home.push_back('/'); return home + "lib/native/" + lib; +#endif }; std::string path = JoinPath(hdfs_home, kLibHdfsDso); TryLoadAndBind(path.c_str(), &handle_, status); if (TF_GetCode(status) == TF_OK) { return; } else { - std::cerr << "HadoopFileSystem load error: " << TF_Message(status); + TF_Log(TF_FATAL, "HadoopFileSystem load error: %s", TF_Message(status)); } } @@ -163,13 +173,15 @@ class LibHDFS { void* handle_; }; -// We rely on HDFS connection caching here. 
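Aside: with the parser fix above, the scheme no longer includes "://" and the path keeps its leading slash. A small usage sketch of the splits the new ParseHadoopPath produces; the URI is a placeholder.

#include <string>

#include "tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h"

void ExampleParse() {
  std::string scheme, namenode, path;
  ParseHadoopPath("hdfs://namenode:8020/tmp/data.csv", &scheme, &namenode,
                  &path);
  // scheme   == "hdfs"
  // namenode == "namenode:8020"
  // path     == "/tmp/data.csv"

  ParseHadoopPath("hdfs://namenode:8020", &scheme, &namenode, &path);
  // scheme == "hdfs", namenode == "namenode:8020", path == ""
}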
The HDFS client calls -// org.apache.hadoop.fs.FileSystem.get(), which caches the connection -// internally. -hdfsFS Connect(LibHDFS* libhdfs, const std::string& path, TF_Status* status) { +// We implement connection caching in Tensorflow, which can significantly +// improve performance. Fixes #43187 +hdfsFS Connect(tf_hadoop_filesystem::HadoopFile* hadoop_file, + const std::string& path, TF_Status* status) { + auto libhdfs = hadoop_file->libhdfs; std::string scheme, namenode, hdfs_path; ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + std::string cacheKey(scheme); hdfsBuilder* builder = libhdfs->hdfsNewBuilder(); if (scheme == "file") { libhdfs->hdfsBuilderSetNameNode(builder, nullptr); @@ -194,15 +206,24 @@ hdfsFS Connect(LibHDFS* libhdfs, const std::string& path, TF_Status* status) { SplitArchiveNameAndPath(&path_har, &namenode, status); if (TF_GetCode(status) != TF_OK) return nullptr; libhdfs->hdfsBuilderSetNameNode(builder, namenode.c_str()); + cacheKey += namenode; } else { libhdfs->hdfsBuilderSetNameNode( builder, namenode.empty() ? "default" : namenode.c_str()); + cacheKey += namenode; } - auto fs = libhdfs->hdfsBuilderConnect(builder); - if (fs == nullptr) - TF_SetStatusFromIOError(status, TF_NOT_FOUND, strerror(errno)); - else - TF_SetStatus(status, TF_OK, ""); + absl::MutexLock l(&hadoop_file->connection_cache_lock); + if (hadoop_file->connection_cache.find(cacheKey) == + hadoop_file->connection_cache.end()) { + auto cacheFs = libhdfs->hdfsBuilderConnect(builder); + if (cacheFs == nullptr) { + TF_SetStatusFromIOError(status, TF_NOT_FOUND, strerror(errno)); + return cacheFs; + } + hadoop_file->connection_cache[cacheKey] = cacheFs; + } + auto fs = hadoop_file->connection_cache[cacheKey]; + TF_SetStatus(status, TF_OK, ""); return fs; } @@ -216,6 +237,7 @@ typedef struct HDFSFile { LibHDFS* libhdfs; absl::Mutex mu; hdfsFile handle ABSL_GUARDED_BY(mu); + bool disable_eof_retried; HDFSFile(std::string path, std::string hdfs_path, hdfsFS fs, LibHDFS* libhdfs, hdfsFile handle) : path(std::move(path)), @@ -223,7 +245,15 @@ typedef struct HDFSFile { fs(fs), libhdfs(libhdfs), mu(), - handle(handle) {} + handle(handle) { + const char* disable_eof_retried_str = + getenv("HDFS_DISABLE_READ_EOF_RETRIED"); + if (disable_eof_retried_str && disable_eof_retried_str[0] == '1') { + disable_eof_retried = true; + } else { + disable_eof_retried = false; + } + } } HDFSFile; void Cleanup(TF_RandomAccessFile* file) { @@ -247,8 +277,12 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, char* dst = buffer; bool eof_retried = false; - int64_t r = 0; - while (TF_GetCode(status) == TF_OK && !eof_retried) { + if (hdfs_file->disable_eof_retried) { + // eof_retried = true, avoid calling hdfsOpenFile in Read, Fixes #42597 + eof_retried = true; + } + int64_t read = 0; + while (TF_GetCode(status) == TF_OK && n > 0) { // We lock inside the loop rather than outside so we don't block other // concurrent readers. absl::MutexLock l(&hdfs_file->mu); @@ -257,12 +291,13 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, // of int32. -2 offset can avoid JVM OutOfMemoryError. 
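Aside: the Connect() change above caches one hdfsFS handle per scheme-plus-namenode key behind a mutex instead of rebuilding the connection on every call. A minimal generic sketch of that lookup-or-create pattern; the Connection type and OpenConnection() factory are placeholders, not the plugin's API.

#include <map>
#include <string>

#include "absl/base/thread_annotations.h"
#include "absl/synchronization/mutex.h"

struct Connection {};  // Stand-in for a real handle such as hdfsFS.
// Stand-in factory for whatever actually opens the connection.
Connection* OpenConnection(const std::string& key) { return new Connection; }

class ConnectionCache {
 public:
  // Returns the cached connection for `key`, creating it on first use.
  Connection* GetOrCreate(const std::string& key) {
    absl::MutexLock l(&lock_);
    auto it = connections_.find(key);
    if (it == connections_.end()) {
      it = connections_.emplace(key, OpenConnection(key)).first;
    }
    return it->second;
  }

 private:
  absl::Mutex lock_;
  std::map<std::string, Connection*> connections_ ABSL_GUARDED_BY(lock_);
};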
size_t read_n = (std::min)(n, static_cast(std::numeric_limits::max() - 2)); - r = libhdfs->hdfsPread(fs, handle, static_cast(offset), dst, - static_cast(read_n)); + int64_t r = libhdfs->hdfsPread(fs, handle, static_cast(offset), + dst, static_cast(read_n)); if (r > 0) { dst += r; n -= r; offset += r; + read += r; } else if (!eof_retried && r == 0) { // Always reopen the file upon reaching EOF to see if there's more data. // If writers are streaming contents while others are concurrently @@ -274,11 +309,13 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, TF_SetStatusFromIOError(status, errno, path); return -1; } - handle = libhdfs->hdfsOpenFile(fs, hdfs_path, O_RDONLY, 0, 0, 0); - if (handle == nullptr) { + hdfs_file->handle = + libhdfs->hdfsOpenFile(fs, hdfs_path, O_RDONLY, 0, 0, 0); + if (hdfs_file->handle == nullptr) { TF_SetStatusFromIOError(status, errno, path); return -1; } + handle = hdfs_file->handle; eof_retried = true; } else if (eof_retried && r == 0) { TF_SetStatus(status, TF_OUT_OF_RANGE, "Read less bytes than requested"); @@ -288,7 +325,7 @@ int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, TF_SetStatusFromIOError(status, errno, path); } } - return r; + return read; } } // namespace tf_random_access_file @@ -308,7 +345,7 @@ typedef struct HDFSFile { handle(handle) {} } HDFSFile; -static void Cleanup(TF_WritableFile* file) { +void Cleanup(TF_WritableFile* file) { auto hdfs_file = static_cast(file->plugin_file); hdfs_file->libhdfs->hdfsCloseFile(hdfs_file->fs, hdfs_file->handle); hdfs_file->fs = nullptr; @@ -387,30 +424,36 @@ void Close(const TF_WritableFile* file, TF_Status* status) { // SECTION 3. Implementation for `TF_ReadOnlyMemoryRegion` // ---------------------------------------------------------------------------- namespace tf_read_only_memory_region { - -// TODO(vnvo2409): Implement later - +// Hadoop doesn't support Readonly Memory Region } // namespace tf_read_only_memory_region // SECTION 4. 
Implementation for `TF_Filesystem`, the actual filesystem // ---------------------------------------------------------------------------- namespace tf_hadoop_filesystem { +HadoopFile::HadoopFile(TF_Status* status) + : libhdfs(new LibHDFS(status)), + connection_cache_lock(), + connection_cache() {} + void Init(TF_Filesystem* filesystem, TF_Status* status) { - filesystem->plugin_filesystem = new LibHDFS(status); + filesystem->plugin_filesystem = new HadoopFile(status); if (TF_GetCode(status) != TF_OK) return; TF_SetStatus(status, TF_OK, ""); } void Cleanup(TF_Filesystem* filesystem) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; delete libhdfs; + delete hadoop_file; } void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, TF_RandomAccessFile* file, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path; @@ -426,8 +469,27 @@ void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, void NewWritableFile(const TF_Filesystem* filesystem, const char* path, TF_WritableFile* file, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); + if (TF_GetCode(status) != TF_OK) return; + + std::string scheme, namenode, hdfs_path; + ParseHadoopPath(path, &scheme, &namenode, &hdfs_path); + + auto handle = libhdfs->hdfsOpenFile(fs, hdfs_path.c_str(), O_WRONLY, 0, 0, 0); + if (handle == nullptr) return TF_SetStatusFromIOError(status, errno, path); + + file->plugin_file = + new tf_writable_file::HDFSFile(hdfs_path, fs, libhdfs, handle); + TF_SetStatus(status, TF_OK, ""); +} + +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status) { + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path; @@ -458,8 +520,9 @@ void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, void PathExists(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path; @@ -474,8 +537,9 @@ void PathExists(const TF_Filesystem* filesystem, const char* path, void Stat(const TF_Filesystem* filesystem, const char* path, TF_FileStatistics* stats, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, 
namenode, hdfs_path; @@ -493,8 +557,9 @@ void Stat(const TF_Filesystem* filesystem, const char* path, int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return -1; std::string scheme, namenode, hdfs_path; @@ -514,8 +579,9 @@ int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, void DeleteFile(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path; @@ -529,8 +595,9 @@ void DeleteFile(const TF_Filesystem* filesystem, const char* path, void CreateDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path; @@ -544,8 +611,9 @@ void CreateDir(const TF_Filesystem* filesystem, const char* path, void DeleteDir(const TF_Filesystem* filesystem, const char* path, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path; @@ -580,8 +648,9 @@ void DeleteDir(const TF_Filesystem* filesystem, const char* path, void RenameFile(const TF_Filesystem* filesystem, const char* src, const char* dst, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, src, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, src, status); if (TF_GetCode(status) != TF_OK) return; std::string scheme, namenode, hdfs_path_src, hdfs_path_dst; @@ -601,8 +670,9 @@ void RenameFile(const TF_Filesystem* filesystem, const char* src, int GetChildren(const TF_Filesystem* filesystem, const char* path, char*** entries, TF_Status* status) { - auto libhdfs = static_cast(filesystem->plugin_filesystem); - auto fs = Connect(libhdfs, path, status); + auto hadoop_file = static_cast(filesystem->plugin_filesystem); + auto libhdfs = hadoop_file->libhdfs; + auto fs = Connect(hadoop_file, path, status); if (TF_GetCode(status) != TF_OK) return -1; std::string scheme, namenode, hdfs_path; @@ -638,7 +708,9 @@ int GetChildren(const TF_Filesystem* filesystem, const char* path, return num_entries; } -// TODO(vnvo2409): Implement later +static char* TranslateName(const TF_Filesystem* filesystem, const char* uri) { + return strdup(uri); +} } // namespace tf_hadoop_filesystem @@ -646,6 +718,42 @@ static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, const char* uri) { 
TF_SetFilesystemVersionMetadata(ops); ops->scheme = strdup(uri); + + ops->random_access_file_ops = static_cast( + plugin_memory_allocate(TF_RANDOM_ACCESS_FILE_OPS_SIZE)); + ops->random_access_file_ops->cleanup = tf_random_access_file::Cleanup; + ops->random_access_file_ops->read = tf_random_access_file::Read; + + ops->writable_file_ops = static_cast( + plugin_memory_allocate(TF_WRITABLE_FILE_OPS_SIZE)); + ops->writable_file_ops->cleanup = tf_writable_file::Cleanup; + ops->writable_file_ops->append = tf_writable_file::Append; + ops->writable_file_ops->tell = tf_writable_file::Tell; + ops->writable_file_ops->flush = tf_writable_file::Flush; + ops->writable_file_ops->sync = tf_writable_file::Sync; + ops->writable_file_ops->close = tf_writable_file::Close; + + ops->filesystem_ops = static_cast( + plugin_memory_allocate(TF_FILESYSTEM_OPS_SIZE)); + ops->filesystem_ops->init = tf_hadoop_filesystem::Init; + ops->filesystem_ops->cleanup = tf_hadoop_filesystem::Cleanup; + ops->filesystem_ops->new_random_access_file = + tf_hadoop_filesystem::NewRandomAccessFile; + ops->filesystem_ops->new_writable_file = + tf_hadoop_filesystem::NewWritableFile; + ops->filesystem_ops->new_appendable_file = + tf_hadoop_filesystem::NewAppendableFile; + ops->filesystem_ops->new_read_only_memory_region_from_file = + tf_hadoop_filesystem::NewReadOnlyMemoryRegionFromFile; + ops->filesystem_ops->path_exists = tf_hadoop_filesystem::PathExists; + ops->filesystem_ops->stat = tf_hadoop_filesystem::Stat; + ops->filesystem_ops->get_file_size = tf_hadoop_filesystem::GetFileSize; + ops->filesystem_ops->delete_file = tf_hadoop_filesystem::DeleteFile; + ops->filesystem_ops->create_dir = tf_hadoop_filesystem::CreateDir; + ops->filesystem_ops->delete_dir = tf_hadoop_filesystem::DeleteDir; + ops->filesystem_ops->rename_file = tf_hadoop_filesystem::RenameFile; + ops->filesystem_ops->get_children = tf_hadoop_filesystem::GetChildren; + ops->filesystem_ops->translate_name = tf_hadoop_filesystem::TranslateName; } void TF_InitPlugin(TF_FilesystemPluginInfo* info) { diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h index 850cefe0231..06b91a68123 100644 --- a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h @@ -15,7 +15,73 @@ limitations under the License. 
#ifndef TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ #define TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ +#include +#include + +#include "absl/synchronization/mutex.h" #include "tensorflow/c/experimental/filesystem/filesystem_interface.h" #include "tensorflow/c/tf_status.h" +#include "third_party/hadoop/hdfs.h" + +void ParseHadoopPath(const std::string& fname, std::string* scheme, + std::string* namenode, std::string* path); +void SplitArchiveNameAndPath(std::string* path, std::string* nn, + TF_Status* status); +class LibHDFS; + +namespace tf_random_access_file { +void Cleanup(TF_RandomAccessFile* file); +int64_t Read(const TF_RandomAccessFile* file, uint64_t offset, size_t n, + char* buffer, TF_Status* status); +} // namespace tf_random_access_file + +namespace tf_writable_file { +void Cleanup(TF_WritableFile* file); +void Append(const TF_WritableFile* file, const char* buffer, size_t n, + TF_Status* status); +int64_t Tell(const TF_WritableFile* file, TF_Status* status); +void Sync(const TF_WritableFile* file, TF_Status* status); +void Flush(const TF_WritableFile* file, TF_Status* status); +void Close(const TF_WritableFile* file, TF_Status* status); +} // namespace tf_writable_file + +namespace tf_hadoop_filesystem { +typedef struct HadoopFile { + LibHDFS* libhdfs; + absl::Mutex connection_cache_lock; + std::map connection_cache + ABSL_GUARDED_BY(connection_cache_lock); + HadoopFile(TF_Status* status); +} HadoopFile; + +void Init(TF_Filesystem* filesystem, TF_Status* status); +void Cleanup(TF_Filesystem* filesystem); +void NewRandomAccessFile(const TF_Filesystem* filesystem, const char* path, + TF_RandomAccessFile* file, TF_Status* status); +void NewWritableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewAppendableFile(const TF_Filesystem* filesystem, const char* path, + TF_WritableFile* file, TF_Status* status); +void NewReadOnlyMemoryRegionFromFile(const TF_Filesystem* filesystem, + const char* path, + TF_ReadOnlyMemoryRegion* region, + TF_Status* status); +void PathExists(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void Stat(const TF_Filesystem* filesystem, const char* path, + TF_FileStatistics* stats, TF_Status* status); +int64_t GetFileSize(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void DeleteFile(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void CreateDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void DeleteDir(const TF_Filesystem* filesystem, const char* path, + TF_Status* status); +void RenameFile(const TF_Filesystem* filesystem, const char* src, + const char* dst, TF_Status* status); +int GetChildren(const TF_Filesystem* filesystem, const char* path, + char*** entries, TF_Status* status); +} // namespace tf_hadoop_filesystem #endif // TENSORFLOW_C_EXPERIMENTAL_FILESYSTEM_PLUGINS_HADOOP_HADOOP_FILESYSTEM_H_ diff --git a/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem_test.cc b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem_test.cc new file mode 100644 index 00000000000..df85ba9e4dd --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem_test.cc @@ -0,0 +1,460 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/filesystem/plugins/hadoop/hadoop_filesystem.h" + +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/stacktrace_handler.h" +#include "tensorflow/core/platform/test.h" +#include "third_party/hadoop/hdfs.h" + +#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) +#define EXPECT_TF_OK(x) EXPECT_EQ(TF_OK, TF_GetCode(x)) << TF_Message(x) + +namespace tensorflow { +namespace { + +class HadoopFileSystemTest : public ::testing::Test { + public: + void SetUp() override { + status_ = TF_NewStatus(); + filesystem_ = new TF_Filesystem; + tf_hadoop_filesystem::Init(filesystem_, status_); + ASSERT_TF_OK(status_) << "Could not initialize filesystem. " + << TF_Message(status_); + } + void TearDown() override { + TF_DeleteStatus(status_); + tf_hadoop_filesystem::Cleanup(filesystem_); + delete filesystem_; + } + + std::string TmpDir(const std::string& path) { + char* test_dir = getenv("HADOOP_TEST_TMPDIR"); + if (test_dir != nullptr) { + return io::JoinPath(std::string(test_dir), path); + } else { + return "file://" + io::JoinPath(testing::TmpDir(), path); + } + } + + std::unique_ptr + GetWriter() { + std::unique_ptr writer( + new TF_WritableFile, [](TF_WritableFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) tf_writable_file::Cleanup(file); + delete file; + } + }); + writer->plugin_file = nullptr; + return writer; + } + + std::unique_ptr + GetReader() { + std::unique_ptr + reader(new TF_RandomAccessFile, [](TF_RandomAccessFile* file) { + if (file != nullptr) { + if (file->plugin_file != nullptr) + tf_random_access_file::Cleanup(file); + delete file; + } + }); + reader->plugin_file = nullptr; + return reader; + } + + void WriteString(const std::string& path, const std::string& content) { + auto writer = GetWriter(); + tf_hadoop_filesystem::NewWritableFile(filesystem_, path.c_str(), + writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Append(writer.get(), content.c_str(), content.length(), + status_); + if (TF_GetCode(status_) != TF_OK) return; + tf_writable_file::Close(writer.get(), status_); + if (TF_GetCode(status_) != TF_OK) return; + } + + std::string ReadAll(const std::string& path) { + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + auto file_size = + tf_hadoop_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + if (TF_GetCode(status_) != TF_OK) return ""; + + std::string content; + content.resize(file_size); + auto read = tf_random_access_file::Read(reader.get(), 0, file_size, + &content[0], status_); + if (TF_GetCode(status_) != TF_OK) return ""; + if (read >= 0) content.resize(read); + if (file_size != content.size()) + TF_SetStatus( + status_, TF_DATA_LOSS, + std::string("expected " + std::to_string(file_size) + " got " + + std::to_string(content.size()) + " bytes") + .c_str()); + return content; + } + + protected: + TF_Filesystem* 
filesystem_; + TF_Status* status_; +}; + +TEST_F(HadoopFileSystemTest, RandomAccessFile) { + const std::string path = TmpDir("RandomAccessFile"); + const std::string content = "abcdefghijklmn"; + + WriteString(path, content); + ASSERT_TF_OK(status_); + + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + EXPECT_TF_OK(status_); + + std::string result; + result.resize(content.size()); + auto read = tf_random_access_file::Read(reader.get(), 0, content.size(), + &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content.size(), result.size()); + EXPECT_EQ(content, result); + + result.clear(); + result.resize(4); + read = tf_random_access_file::Read(reader.get(), 2, 4, &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, result.size()); + EXPECT_EQ(content.substr(2, 4), result); +} + +TEST_F(HadoopFileSystemTest, WritableFile) { + auto writer = GetWriter(); + const std::string path = TmpDir("WritableFile"); + tf_hadoop_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Append(writer.get(), "content1,", strlen("content1,"), + status_); + EXPECT_TF_OK(status_); + auto pos = tf_writable_file::Tell(writer.get(), status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(pos, 9); + + tf_writable_file::Append(writer.get(), "content2", strlen("content2"), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Sync(writer.get(), status_); + EXPECT_TF_OK(status_); + tf_writable_file::Close(writer.get(), status_); + EXPECT_TF_OK(status_); + + auto content = ReadAll(path); + EXPECT_TF_OK(status_); + EXPECT_EQ("content1,content2", content); +} + +TEST_F(HadoopFileSystemTest, PathExists) { + const std::string path = TmpDir("PathExists"); + tf_hadoop_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_EQ(TF_NOT_FOUND, TF_GetCode(status_)) << TF_Message(status_); + TF_SetStatus(status_, TF_OK, ""); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_hadoop_filesystem::PathExists(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(HadoopFileSystemTest, GetChildren) { + const std::string base = TmpDir("GetChildren"); + tf_hadoop_filesystem::CreateDir(filesystem_, base.c_str(), status_); + EXPECT_TF_OK(status_); + + const std::string file = io::JoinPath(base, "TestFile.csv"); + WriteString(file, "test"); + EXPECT_TF_OK(status_); + + const std::string subdir = io::JoinPath(base, "SubDir"); + tf_hadoop_filesystem::CreateDir(filesystem_, subdir.c_str(), status_); + EXPECT_TF_OK(status_); + const std::string subfile = io::JoinPath(subdir, "TestSubFile.csv"); + WriteString(subfile, "test"); + EXPECT_TF_OK(status_); + + char** entries; + auto num_entries = tf_hadoop_filesystem::GetChildren( + filesystem_, base.c_str(), &entries, status_); + EXPECT_TF_OK(status_); + + std::vector childrens; + for (int i = 0; i < num_entries; ++i) { + childrens.push_back(entries[i]); + } + std::sort(childrens.begin(), childrens.end()); + EXPECT_EQ(std::vector({"SubDir", "TestFile.csv"}), childrens); +} + +TEST_F(HadoopFileSystemTest, DeleteFile) { + const std::string path = TmpDir("DeleteFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + tf_hadoop_filesystem::DeleteFile(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(HadoopFileSystemTest, GetFileSize) { + const std::string path 
= TmpDir("GetFileSize"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + auto file_size = + tf_hadoop_filesystem::GetFileSize(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, file_size); +} + +TEST_F(HadoopFileSystemTest, CreateDirStat) { + const std::string path = TmpDir("CreateDirStat"); + tf_hadoop_filesystem::CreateDir(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_hadoop_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_TRUE(stat.is_directory); +} + +TEST_F(HadoopFileSystemTest, DeleteDir) { + const std::string path = TmpDir("DeleteDir"); + tf_hadoop_filesystem::DeleteDir(filesystem_, path.c_str(), status_); + EXPECT_NE(TF_GetCode(status_), TF_OK); + tf_hadoop_filesystem::CreateDir(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + tf_hadoop_filesystem::DeleteDir(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + TF_FileStatistics stat; + tf_hadoop_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_NE(TF_GetCode(status_), TF_OK); +} + +TEST_F(HadoopFileSystemTest, RenameFile) { + const std::string src = TmpDir("RenameFileSrc"); + const std::string dst = TmpDir("RenameFileDst"); + WriteString(src, "test"); + ASSERT_TF_OK(status_); + + tf_hadoop_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), + status_); + EXPECT_TF_OK(status_); + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test", result); +} + +TEST_F(HadoopFileSystemTest, RenameFileOverwrite) { + const std::string src = TmpDir("RenameFileOverwriteSrc"); + const std::string dst = TmpDir("RenameFileOverwriteDst"); + + WriteString(src, "test_old"); + ASSERT_TF_OK(status_); + WriteString(dst, "test_new"); + ASSERT_TF_OK(status_); + + tf_hadoop_filesystem::PathExists(filesystem_, dst.c_str(), status_); + EXPECT_TF_OK(status_); + tf_hadoop_filesystem::RenameFile(filesystem_, src.c_str(), dst.c_str(), + status_); + EXPECT_TF_OK(status_); + + auto result = ReadAll(dst); + EXPECT_TF_OK(status_); + EXPECT_EQ("test_old", result); +} + +TEST_F(HadoopFileSystemTest, StatFile) { + const std::string path = TmpDir("StatFile"); + WriteString(path, "test"); + ASSERT_TF_OK(status_); + TF_FileStatistics stat; + tf_hadoop_filesystem::Stat(filesystem_, path.c_str(), &stat, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ(4, stat.length); + EXPECT_FALSE(stat.is_directory); +} + +TEST_F(HadoopFileSystemTest, WriteWhileReading) { + const std::string path = TmpDir("WriteWhileReading"); + // Skip the test if we're not testing on HDFS. Hadoop's local filesystem + // implementation makes no guarantees that writable files are readable while + // being written. 
+ if (path.find_first_of("hdfs://") != 0) GTEST_SKIP(); + + auto writer = GetWriter(); + tf_hadoop_filesystem::NewWritableFile(filesystem_, path.c_str(), writer.get(), + status_); + EXPECT_TF_OK(status_); + + const std::string content1 = "content1"; + tf_writable_file::Append(writer.get(), content1.c_str(), content1.size(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + EXPECT_TF_OK(status_); + + std::string result; + result.resize(content1.size()); + auto read = tf_random_access_file::Read(reader.get(), 0, content1.size(), + &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content1, result); + + const std::string content2 = "content2"; + tf_writable_file::Append(writer.get(), content2.c_str(), content2.size(), + status_); + EXPECT_TF_OK(status_); + tf_writable_file::Flush(writer.get(), status_); + EXPECT_TF_OK(status_); + + result.resize(content2.size()); + read = tf_random_access_file::Read(reader.get(), content1.size(), + content2.size(), &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content2, result); + + tf_writable_file::Close(writer.get(), status_); + EXPECT_TF_OK(status_); +} + +TEST_F(HadoopFileSystemTest, ReadWhileOverwriting) { + static char set_disable_var[] = "HDFS_DISABLE_READ_EOF_RETRIED=1"; + putenv(set_disable_var); + + const std::string path = TmpDir("ReadWhileOverwriting"); + if (path.find_first_of("hdfs://") != 0) GTEST_SKIP(); + + const string content1 = "content1"; + WriteString(path, content1); + ASSERT_TF_OK(status_); + + auto reader = GetReader(); + tf_hadoop_filesystem::NewRandomAccessFile(filesystem_, path.c_str(), + reader.get(), status_); + EXPECT_TF_OK(status_); + + std::string result; + result.resize(content1.size()); + auto read = tf_random_access_file::Read(reader.get(), 0, content1.size(), + &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(content1, result); + + tf_hadoop_filesystem::DeleteFile(filesystem_, path.c_str(), status_); + EXPECT_TF_OK(status_); + + string content2 = "overwrite"; + WriteString(path, content1 + content2); + ASSERT_TF_OK(status_); + + result.resize(content2.size()); + read = tf_random_access_file::Read(reader.get(), content1.size(), + content2.size(), &result[0], status_); + result.resize(read); + EXPECT_TF_OK(status_); + EXPECT_EQ(0, result.size()); + + static char set_enable_var[] = "HDFS_DISABLE_READ_EOF_RETRIED=0"; + putenv(set_enable_var); +} + +TEST_F(HadoopFileSystemTest, HarSplit) { + const std::string har_path = + "har://hdfs-root/user/j.doe/my_archive.har/dir0/dir1/file.txt"; + std::string scheme, namenode, path; + ParseHadoopPath(har_path, &scheme, &namenode, &path); + EXPECT_EQ("har", scheme); + EXPECT_EQ("hdfs-root", namenode); + EXPECT_EQ("/user/j.doe/my_archive.har/dir0/dir1/file.txt", path); + SplitArchiveNameAndPath(&path, &namenode, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ("har://hdfs-root/user/j.doe/my_archive.har", namenode); + EXPECT_EQ("/dir0/dir1/file.txt", path); +} + +TEST_F(HadoopFileSystemTest, NoHarExtension) { + const std::string har_path = + "har://hdfs-root/user/j.doe/my_archive/dir0/dir1/file.txt"; + std::string scheme, namenode, path; + ParseHadoopPath(har_path, &scheme, &namenode, &path); + EXPECT_EQ("har", scheme); + EXPECT_EQ("hdfs-root", namenode); + 
EXPECT_EQ("/user/j.doe/my_archive/dir0/dir1/file.txt", path); + SplitArchiveNameAndPath(&path, &namenode, status_); + EXPECT_EQ(TF_GetCode(status_), TF_INVALID_ARGUMENT) << TF_Message(status_); +} + +TEST_F(HadoopFileSystemTest, HarRootPath) { + const std::string har_path = "har://hdfs-root/user/j.doe/my_archive.har"; + std::string scheme, namenode, path; + ParseHadoopPath(har_path, &scheme, &namenode, &path); + EXPECT_EQ("har", scheme); + EXPECT_EQ("hdfs-root", namenode); + EXPECT_EQ("/user/j.doe/my_archive.har", path); + SplitArchiveNameAndPath(&path, &namenode, status_); + EXPECT_TF_OK(status_); + EXPECT_EQ("har://hdfs-root/user/j.doe/my_archive.har", namenode); + EXPECT_EQ("/", path); +} + +TEST_F(HadoopFileSystemTest, WriteLargeFile) { + if (std::getenv("HADOOP_TEST_LARGE_FILE") != "1") GTEST_SKIP(); + const std::string path = TmpDir("WriteLargeFile"); + const size_t file_size = + static_cast(std::numeric_limits::max()) + 1024; + // Fake a test string. + std::string source(file_size, {}); + for (size_t i = 0; i < file_size; ++i) source[i] = (i % 128); + WriteString(path, source); + ASSERT_TF_OK(status_); + auto result = ReadAll(path); + EXPECT_TF_OK(status_); + EXPECT_EQ(source, result); +} +// NewAppendableFile() is not testable. Local filesystem maps to +// ChecksumFileSystem in Hadoop, where appending is an unsupported operation. + +} // namespace +} // namespace tensorflow + +GTEST_API_ int main(int argc, char** argv) { + tensorflow::testing::InstallStacktraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/c/experimental/filesystem/plugins/posix/BUILD b/tensorflow/c/experimental/filesystem/plugins/posix/BUILD index 3afe114b5a6..b87dddb96a1 100644 --- a/tensorflow/c/experimental/filesystem/plugins/posix/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/posix/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental posix filesystem plugin. load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") diff --git a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD index b845d1e3616..02f7b5ba706 100644 --- a/tensorflow/c/experimental/filesystem/plugins/windows/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/windows/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental windows filesystem plugin. load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object") diff --git a/tensorflow/c/experimental/gradients/BUILD b/tensorflow/c/experimental/gradients/BUILD index 36a3251def7..e8a50e32216 100644 --- a/tensorflow/c/experimental/gradients/BUILD +++ b/tensorflow/c/experimental/gradients/BUILD @@ -1,3 +1,6 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Library of gradient functions. 
package( licenses = ["notice"], # Apache 2.0 @@ -16,7 +19,7 @@ cc_library( "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/c/eager:gradients", + "//tensorflow/c/eager:gradients_internal", "//tensorflow/core/lib/llvm_rtti", ], ) @@ -31,14 +34,11 @@ cc_library( "//tensorflow:internal", ], deps = [ - "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", - "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/c/eager:gradients", + "//tensorflow/c/eager:gradients_internal", "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/c/experimental/ops:math_ops", "//tensorflow/c/experimental/ops:nn_ops", - "//tensorflow/core/lib/llvm_rtti", ], ) @@ -52,13 +52,46 @@ cc_library( "//tensorflow:internal", ], deps = [ - "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", - "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/c/eager:gradients", + "//tensorflow/c/eager:gradients_internal", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/c/experimental/ops:array_ops", "//tensorflow/c/experimental/ops:math_ops", "//tensorflow/c/experimental/ops:nn_ops", "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/core/platform:errors", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "gradients", + hdrs = [ + "array_grad.h", + "math_grad.h", + "nn_grad.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":array_grad", + ":math_grad", + ":nn_grad", + "//tensorflow/c/eager:gradients_internal", + ], +) + +filegroup( + name = "pywrap_required_hdrs", + srcs = [ + "array_grad.h", + "math_grad.h", + "nn_grad.h", + ], + visibility = [ + "//tensorflow/core:__pkg__", + "//tensorflow/python:__pkg__", ], ) diff --git a/tensorflow/c/experimental/gradients/math_grad.cc b/tensorflow/c/experimental/gradients/math_grad.cc index 3537b30c597..5551642127d 100644 --- a/tensorflow/c/experimental/gradients/math_grad.cc +++ b/tensorflow/c/experimental/gradients/math_grad.cc @@ -22,10 +22,10 @@ limitations under the License. using std::vector; using tensorflow::ops::Conj; -using tensorflow::ops::Identity; using tensorflow::ops::MatMul; using tensorflow::ops::Mul; -using tensorflow::ops::ZerosLike; +using tensorflow::ops::Neg; +using tensorflow::ops::SqrtGrad; namespace tensorflow { namespace gradients { @@ -36,21 +36,14 @@ class AddGradientFunction : public GradientFunction { Status Compute(Context* ctx, const IncomingGradients& grad_inputs, vector* grad_outputs) override { grad_outputs->resize(2); - vector identity_outputs(1); - // TODO(b/145674566): Handle name unification in tracing code. // TODO(b/161805092): Support broadcasting. 
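Editor's note on the Add gradient rewrite just below: grad_inputs handles are only borrowed by Compute, while anything placed into grad_outputs is handed back to the caller, which will eventually Unref it. Forwarding the upstream gradient directly therefore requires taking an extra reference per returned slot, as the new code does:

  // Pass-through gradient of Add: both inputs receive the upstream gradient.
  (*grad_outputs)[0] = grad_inputs[0];
  (*grad_outputs)[1] = grad_inputs[0];
  (*grad_outputs)[0]->Ref();  // balance the caller's eventual Unref
  (*grad_outputs)[1]->Ref();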
- std::string name = "Identity_A"; - TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, - absl::MakeSpan(identity_outputs), - name.c_str())); - (*grad_outputs)[0] = identity_outputs[0]; + DCHECK(grad_inputs[0]); + (*grad_outputs)[0] = grad_inputs[0]; + (*grad_outputs)[1] = grad_inputs[0]; - name = "Identity_B"; - TF_RETURN_IF_ERROR(ops::Identity(ctx->ctx, {grad_inputs[0]}, - absl::MakeSpan(identity_outputs), - name.c_str())); - (*grad_outputs)[1] = identity_outputs[0]; + (*grad_outputs)[0]->Ref(); + (*grad_outputs)[1]->Ref(); return Status::OK(); } ~AddGradientFunction() override {} @@ -81,6 +74,25 @@ class ExpGradientFunction : public GradientFunction { AbstractTensorHandlePtr exp_; }; +class SqrtGradientFunction : public GradientFunction { + public: + explicit SqrtGradientFunction(AbstractTensorHandle* sqrt) : sqrt_(sqrt) { + sqrt->Ref(); + } + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + std::string name = "Sqrt_Grad"; + grad_outputs->resize(1); + TF_RETURN_IF_ERROR(SqrtGrad(ctx->ctx, {sqrt_.get(), grad_inputs[0]}, + absl::MakeSpan(*grad_outputs), name.c_str())); + return Status::OK(); + } + ~SqrtGradientFunction() override {} + + private: + AbstractTensorHandlePtr sqrt_; +}; + class MatMulGradientFunction : public GradientFunction { public: explicit MatMulGradientFunction(vector f_inputs, @@ -190,6 +202,56 @@ class MatMulGradientFunction : public GradientFunction { AttrBuilder forward_attrs; }; +class NegGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a Neg op Y = -X, the gradients are: + * + * dX = -U + * + */ + + grad_outputs->resize(1); + std::string name = "Neg_Grad"; + TF_RETURN_IF_ERROR(ops::Neg(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(*grad_outputs), name.c_str())); + return Status::OK(); + } + ~NegGradientFunction() override {} +}; + +class SubGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a Sub op A-B, the gradients are: + * + * dA = U + * dB = -U + * + */ + + grad_outputs->resize(2); + + // Grad for A + DCHECK(grad_inputs[0]); + (*grad_outputs)[0] = grad_inputs[0]; + (*grad_outputs)[0]->Ref(); + + // Grad for B + // negate the upstream grad + std::vector neg_outputs(1); + std::string name = "Neg_Sub_Grad_B"; + TF_RETURN_IF_ERROR(ops::Neg(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(neg_outputs), name.c_str())); + (*grad_outputs)[1] = neg_outputs[0]; + + return Status::OK(); + } + ~SubGradientFunction() override {} +}; + } // namespace BackwardFunction* AddRegisterer(const ForwardOperation& op) { @@ -219,5 +281,32 @@ BackwardFunction* MatMulRegisterer(const ForwardOperation& op) { return new BackwardFunction(gradient_function, default_gradients); } +BackwardFunction* SqrtRegisterer(const ForwardOperation& op) { + auto gradient_function = new SqrtGradientFunction(op.outputs[0]); + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. 
+ auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* NegRegisterer(const ForwardOperation& op) { + auto gradient_function = new NegGradientFunction; + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + +BackwardFunction* SubRegisterer(const ForwardOperation& op) { + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto gradient_function = new SubGradientFunction; + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/math_grad.h b/tensorflow/c/experimental/gradients/math_grad.h index 205419e1201..756c5f84153 100644 --- a/tensorflow/c/experimental/gradients/math_grad.h +++ b/tensorflow/c/experimental/gradients/math_grad.h @@ -19,10 +19,15 @@ limitations under the License. namespace tensorflow { namespace gradients { + BackwardFunction* AddRegisterer(const ForwardOperation& op); BackwardFunction* ExpRegisterer(const ForwardOperation& op); BackwardFunction* MatMulRegisterer(const ForwardOperation& op); +BackwardFunction* SqrtRegisterer(const ForwardOperation& op); +BackwardFunction* NegRegisterer(const ForwardOperation& op); +BackwardFunction* SubRegisterer(const ForwardOperation& op); + } // namespace gradients } // namespace tensorflow -#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ \ No newline at end of file +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_MATH_GRAD_H_ diff --git a/tensorflow/c/experimental/gradients/nn_grad.cc b/tensorflow/c/experimental/gradients/nn_grad.cc index 3da1e0dc153..64532c8ffc0 100644 --- a/tensorflow/c/experimental/gradients/nn_grad.cc +++ b/tensorflow/c/experimental/gradients/nn_grad.cc @@ -14,17 +14,19 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/gradients/nn_grad.h" +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/c/experimental/ops/math_ops.h" #include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" using std::vector; -using tensorflow::ops::Conj; -using tensorflow::ops::Identity; using tensorflow::ops::Mul; using tensorflow::ops::ReluGrad; -using tensorflow::ops::SparseSoftmaxCrossEntropyLoss; -using tensorflow::ops::ZerosLike; namespace tensorflow { namespace gradients { @@ -58,9 +60,31 @@ class ReluGradientFunction : public GradientFunction { vector forward_outputs; }; -class SparseSoftmaxCrossEntropyLossGradientFunction : public GradientFunction { +Status BroadcastMul(AbstractContext* ctx, AbstractTensorHandle* vec, + AbstractTensorHandle* mat, + absl::Span outputs) { + if (!isa(ctx)) { + // TODO(b/168850692): Fix this. 
+ return errors::Unimplemented( + "BroadcastMul is not supported in tracing mode yet."); + } + auto imm_ctx = dyn_cast(ctx); + AbstractTensorPtr minus_1(imm_ctx->CreateInt32Scalar(-1)); + ImmediateTensorHandlePtr dim(imm_ctx->CreateLocalHandle(minus_1.get())); + vector expand_dims_outputs(1); + TF_RETURN_IF_ERROR(ops::ExpandDims(ctx, {vec, dim.get()}, + absl::MakeSpan(expand_dims_outputs), + "ExpandDims")); + TF_RETURN_IF_ERROR( + ops::Mul(ctx, {expand_dims_outputs[0], mat}, outputs, "Mul")); + expand_dims_outputs[0]->Unref(); + return Status::OK(); +} + +class SparseSoftmaxCrossEntropyWithLogitsGradientFunction + : public GradientFunction { public: - explicit SparseSoftmaxCrossEntropyLossGradientFunction( + explicit SparseSoftmaxCrossEntropyWithLogitsGradientFunction( vector f_outputs) : forward_outputs(f_outputs) {} @@ -69,12 +93,10 @@ class SparseSoftmaxCrossEntropyLossGradientFunction : public GradientFunction { grad_outputs->resize(2); // Grad for Softmax Input - std::string name = "Mul_Softmax_Grad"; vector mul_outputs(1); - TF_RETURN_IF_ERROR( - ops::Mul(ctx->ctx, {grad_inputs[0], forward_outputs[1]}, - absl::MakeSpan(mul_outputs), - name.c_str())); // upstream_grad * local softmax grad + TF_RETURN_IF_ERROR(BroadcastMul( + ctx->ctx, grad_inputs[0], forward_outputs[1], + absl::MakeSpan(mul_outputs))); // upstream_grad * local softmax grad (*grad_outputs)[0] = mul_outputs[0]; // Grad for labels is null @@ -82,7 +104,7 @@ class SparseSoftmaxCrossEntropyLossGradientFunction : public GradientFunction { return Status::OK(); } - ~SparseSoftmaxCrossEntropyLossGradientFunction() override {} + ~SparseSoftmaxCrossEntropyWithLogitsGradientFunction() override {} private: vector forward_outputs; @@ -99,10 +121,10 @@ BackwardFunction* ReluRegisterer(const ForwardOperation& op) { return new BackwardFunction(gradient_function, default_gradients); } -BackwardFunction* SparseSoftmaxCrossEntropyLossRegisterer( +BackwardFunction* SparseSoftmaxCrossEntropyWithLogitsRegisterer( const ForwardOperation& op) { auto gradient_function = - new SparseSoftmaxCrossEntropyLossGradientFunction(op.outputs); + new SparseSoftmaxCrossEntropyWithLogitsGradientFunction(op.outputs); auto default_gradients = new PassThroughDefaultGradients(op); return new BackwardFunction(gradient_function, default_gradients); } diff --git a/tensorflow/c/experimental/gradients/nn_grad.h b/tensorflow/c/experimental/gradients/nn_grad.h index d002725847f..034f20d7325 100644 --- a/tensorflow/c/experimental/gradients/nn_grad.h +++ b/tensorflow/c/experimental/gradients/nn_grad.h @@ -20,9 +20,9 @@ limitations under the License. namespace tensorflow { namespace gradients { BackwardFunction* ReluRegisterer(const ForwardOperation& op); -BackwardFunction* SparseSoftmaxCrossEntropyLossRegisterer( +BackwardFunction* SparseSoftmaxCrossEntropyWithLogitsRegisterer( const ForwardOperation& op); } // namespace gradients } // namespace tensorflow -#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ \ No newline at end of file +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_NN_GRAD_H_ diff --git a/tensorflow/c/experimental/gradients/tape/BUILD b/tensorflow/c/experimental/gradients/tape/BUILD new file mode 100644 index 00000000000..bada49ea669 --- /dev/null +++ b/tensorflow/c/experimental/gradients/tape/BUILD @@ -0,0 +1,66 @@ +# A tape built on top of unified execution APIs. 
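Editor's note before the tape targets: the registerers added above in math_grad.cc and nn_grad.cc are meant to be installed into a GradientRegistry keyed by the forward op name the tape records. A usage sketch, assuming the GradientRegistry::Register(op_name, registerer) API from tensorflow/c/eager/gradients.h and op-name strings matching the forward ops ("AddV2" for ops::Add, etc.):

Status RegisterGradients(GradientRegistry* registry) {
  TF_RETURN_IF_ERROR(registry->Register("AddV2", AddRegisterer));
  TF_RETURN_IF_ERROR(registry->Register("Sqrt", SqrtRegisterer));
  TF_RETURN_IF_ERROR(registry->Register("Neg", NegRegisterer));
  TF_RETURN_IF_ERROR(registry->Register("Sub", SubRegisterer));
  TF_RETURN_IF_ERROR(registry->Register(
      "SparseSoftmaxCrossEntropyWithLogits",
      SparseSoftmaxCrossEntropyWithLogitsRegisterer));
  return Status::OK();
}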
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "tape_context", + srcs = ["tape_context.cc"], + hdrs = [ + "tape_context.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":tape_operation", + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_function", + "//tensorflow/c/eager:abstract_operation", + ], +) + +cc_library( + name = "tape_operation", + srcs = ["tape_operation.cc"], + hdrs = [ + "tape_operation.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_function", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:gradients_internal", + ], +) + +cc_library( + name = "tape", + hdrs = [ + "tape_context.h", + "tape_operation.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":tape_context", + ":tape_operation", + ], +) + +filegroup( + name = "pywrap_required_hdrs", + srcs = [ + "tape_context.h", + "tape_operation.h", + ], + visibility = [ + "//tensorflow:internal", + ], +) diff --git a/tensorflow/c/experimental/gradients/tape/tape_context.cc b/tensorflow/c/experimental/gradients/tape/tape_context.cc new file mode 100644 index 00000000000..1fa1a3f24f1 --- /dev/null +++ b/tensorflow/c/experimental/gradients/tape/tape_context.cc @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/gradients/tape/tape_context.h" + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/experimental/gradients/tape/tape_operation.h" + +namespace tensorflow { +namespace gradients { +TapeContext::TapeContext(AbstractContext* c, Tape* tape, + const GradientRegistry& registry) + : AbstractContext(kTape), parent_ctx_(c), tape_(tape), registry_(registry) { + // TODO(srbs): Make AbstractContext ref counted. + // parent_ctx_->Ref(); +} +void TapeContext::Release() { + // TODO(srbs): Change to Unref() + delete this; +} +TapeContext::~TapeContext() { + // TODO(srbs): Make AbstractContext ref counted. + // parent_ctx_->Unref(); +} +TapeOperation* TapeContext::CreateOperation() { + return new TapeOperation(parent_ctx_->CreateOperation(), tape_, registry_); +} +Status TapeContext::RegisterFunction(AbstractFunction* f) { + return parent_ctx_->RegisterFunction(f); +} +Status TapeContext::RemoveFunction(const string& func) { + return parent_ctx_->RemoveFunction(func); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/tape/tape_context.h b/tensorflow/c/experimental/gradients/tape/tape_context.h new file mode 100644 index 00000000000..291053226fb --- /dev/null +++ b/tensorflow/c/experimental/gradients/tape/tape_context.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_CONTEXT_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_CONTEXT_H_ + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/experimental/gradients/tape/tape_operation.h" + +namespace tensorflow { +namespace gradients { +class TapeContext : public AbstractContext { + public: + explicit TapeContext(AbstractContext*, Tape*, const GradientRegistry&); + void Release() override; + TapeOperation* CreateOperation() override; + Status RegisterFunction(AbstractFunction*) override; + Status RemoveFunction(const string& func) override; + // For LLVM style RTTI. + static bool classof(const AbstractContext* ptr) { + return ptr->getKind() == kTape; + } + ~TapeContext() override; + + private: + AbstractContext* parent_ctx_; // Not owned. + Tape* tape_; + const GradientRegistry& registry_; +}; +} // namespace gradients +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_CONTEXT_H_ diff --git a/tensorflow/c/experimental/gradients/tape/tape_operation.cc b/tensorflow/c/experimental/gradients/tape/tape_operation.cc new file mode 100644 index 00000000000..0b247d08f6c --- /dev/null +++ b/tensorflow/c/experimental/gradients/tape/tape_operation.cc @@ -0,0 +1,238 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/c/experimental/gradients/tape/tape_operation.h" + +#include "tensorflow/c/eager/abstract_context.h" +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +TapeOperation::TapeOperation(AbstractOperation* parent_op, Tape* tape, + const GradientRegistry& registry) + : AbstractOperation(kTape), + parent_op_(parent_op), + tape_(tape), + registry_(registry) { + // TODO(srbs): Make AbstractOperation RefCounted. + // parent_op_->Ref(); +} +void TapeOperation::Release() { + // TODO(srbs): Change to Unref(). + delete this; +} +TapeOperation::~TapeOperation() { + // TODO(srbs): Make AbstractOperation RefCounted. 
+ // parent_op->Unref(); +} +Status TapeOperation::Reset(const char* op, const char* raw_device_name) { + forward_op_.op_name = op; + forward_op_.attrs.Reset(op); + forward_op_.inputs.clear(); + forward_op_.outputs.clear(); + return parent_op_->Reset(op, raw_device_name); +} +const string& TapeOperation::Name() const { return parent_op_->Name(); } +const string& TapeOperation::DeviceName() const { + return parent_op_->DeviceName(); +} +Status TapeOperation::SetDeviceName(const char* name) { + return parent_op_->SetDeviceName(name); +} +Status TapeOperation::AddInput(AbstractTensorHandle* input) { + TF_RETURN_IF_ERROR(parent_op_->AddInput(input)); + forward_op_.inputs.push_back(input); + return Status::OK(); +} +Status TapeOperation::AddInputList( + absl::Span inputs) { + TF_RETURN_IF_ERROR(parent_op_->AddInputList(inputs)); + for (auto input : inputs) { + forward_op_.inputs.push_back(input); + } + return Status::OK(); +} +Status TapeOperation::SetAttrString(const char* attr_name, const char* data, + size_t length) { + forward_op_.attrs.Set(attr_name, StringPiece(data, length)); + return parent_op_->SetAttrString(attr_name, data, length); +} +Status TapeOperation::SetAttrInt(const char* attr_name, int64_t value) { + forward_op_.attrs.Set(attr_name, static_cast(value)); + return parent_op_->SetAttrInt(attr_name, value); +} +Status TapeOperation::SetAttrFloat(const char* attr_name, float value) { + forward_op_.attrs.Set(attr_name, value); + return parent_op_->SetAttrFloat(attr_name, value); +} +Status TapeOperation::SetAttrBool(const char* attr_name, bool value) { + forward_op_.attrs.Set(attr_name, value); + return parent_op_->SetAttrBool(attr_name, value); +} +Status TapeOperation::SetAttrType(const char* attr_name, DataType value) { + forward_op_.attrs.Set(attr_name, value); + return parent_op_->SetAttrType(attr_name, value); +} +Status TapeOperation::SetAttrShape(const char* attr_name, const int64_t* dims, + const int num_dims) { + if (num_dims > TensorShape::MaxDimensions()) { + return errors::InvalidArgument("Value specified for `", attr_name, "` has ", + num_dims, + " dimensions which is over the limit of ", + TensorShape::MaxDimensions(), "."); + } + TensorShapeProto proto; + if (num_dims < 0) { + proto.set_unknown_rank(true); + } else { + for (int d = 0; d < num_dims; ++d) { + proto.add_dim()->set_size(dims[d]); + } + } + + forward_op_.attrs.Set(attr_name, proto); + return parent_op_->SetAttrShape(attr_name, dims, num_dims); +} +Status TapeOperation::SetAttrFunction(const char* attr_name, + const AbstractOperation* value) { + return tensorflow::errors::Unimplemented( + "SetAttrFunction has not been implemented yet."); +} +Status TapeOperation::SetAttrFunctionName(const char* attr_name, + const char* value, size_t length) { + return tensorflow::errors::Unimplemented( + "SetAttrFunctionName has not been implemented " + "yet."); +} +Status TapeOperation::SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) { + return tensorflow::errors::Unimplemented( + "SetAttrTensor has not been implemented yet."); +} +Status TapeOperation::SetAttrStringList(const char* attr_name, + const void* const* values, + const size_t* lengths, int num_values) { + std::vector v(num_values); + for (int i = 0; i < num_values; ++i) { + v[i] = StringPiece(static_cast(values[i]), lengths[i]); + } + forward_op_.attrs.Set(attr_name, v); + return parent_op_->SetAttrStringList(attr_name, values, lengths, num_values); +} +Status TapeOperation::SetAttrFloatList(const char* attr_name, + const float* 
values, int num_values) { + forward_op_.attrs.Set(attr_name, + gtl::ArraySlice(values, num_values)); + return parent_op_->SetAttrFloatList(attr_name, values, num_values); +} +Status TapeOperation::SetAttrIntList(const char* attr_name, + const int64_t* values, int num_values) { + forward_op_.attrs.Set( + attr_name, gtl::ArraySlice( + reinterpret_cast(values), num_values)); + return parent_op_->SetAttrIntList(attr_name, values, num_values); +} +Status TapeOperation::SetAttrTypeList(const char* attr_name, + const DataType* values, int num_values) { + forward_op_.attrs.Set(attr_name, + gtl::ArraySlice(values, num_values)); + return parent_op_->SetAttrTypeList(attr_name, values, num_values); +} +Status TapeOperation::SetAttrBoolList(const char* attr_name, + const unsigned char* values, + int num_values) { + std::unique_ptr b(new bool[num_values]); + for (int i = 0; i < num_values; ++i) { + b[i] = values[i]; + } + forward_op_.attrs.Set(attr_name, + gtl::ArraySlice(b.get(), num_values)); + return parent_op_->SetAttrBoolList(attr_name, values, num_values); +} +Status TapeOperation::SetAttrShapeList(const char* attr_name, + const int64_t** dims, + const int* num_dims, int num_values) { + std::unique_ptr proto(new TensorShapeProto[num_values]); + for (int i = 0; i < num_values; ++i) { + const auto num_dims_i = num_dims[i]; + + if (num_dims_i > TensorShape::MaxDimensions()) { + return errors::InvalidArgument( + strings::StrCat("Value specified for `", attr_name, "` has ", + num_dims_i, " dimensions which is over the limit of ", + TensorShape::MaxDimensions(), ".")); + } + if (num_dims_i < 0) { + proto[i].set_unknown_rank(true); + } else { + const int64_t* dims_i = dims[i]; + auto proto_i = &proto[i]; + for (int d = 0; d < num_dims_i; ++d) { + proto_i->add_dim()->set_size(dims_i[d]); + } + } + } + forward_op_.attrs.Set( + attr_name, gtl::ArraySlice(proto.get(), num_values)); + return parent_op_->SetAttrShapeList(attr_name, dims, num_dims, num_values); +} +Status TapeOperation::SetAttrFunctionList( + const char* attr_name, absl::Span values) { + return tensorflow::errors::Unimplemented( + "SetAttrFunctionList has not been " + "implemented yet."); +} +AbstractOperation* TapeOperation::GetBackingOperation() { return parent_op_; } +Status TapeOperation::Execute(absl::Span retvals, + int* num_retvals) { + TF_RETURN_IF_ERROR(parent_op_->Execute(retvals, num_retvals)); + std::vector input_ids(forward_op_.inputs.size()); + std::vector input_dtypes(forward_op_.inputs.size()); + for (int i = 0; i < forward_op_.inputs.size(); i++) { + input_ids[i] = ToId(forward_op_.inputs[i]); + input_dtypes[i] = forward_op_.inputs[i]->DataType(); + } + for (int i = 0; i < *num_retvals; i++) { + // TODO(srbs): Manage refcount of ForwardOperation's inputs/outputs. + forward_op_.outputs.push_back(retvals[i]); + } + // TODO(b/166669239): This is needed to support AttrBuilder::Get for string + // attributes. Number type attrs and DataType attrs work fine without this. + // Consider getting rid of this and making the behavior between number types + // and string consistent. 
+ forward_op_.attrs.BuildNodeDef(); + std::vector tape_tensors; + for (auto t : retvals) { + tape_tensors.push_back(TapeTensor(t)); + } + tape_->RecordOperation( + parent_op_->Name(), tape_tensors, input_ids, input_dtypes, + [this]() -> BackwardFunction* { + std::unique_ptr backward_fn; + Status s = registry_.Lookup(forward_op_, &backward_fn); + if (!s.ok()) { + return nullptr; + } + return backward_fn.release(); + }, + [](BackwardFunction* ptr) { + if (ptr) { + delete ptr; + } + }); + return Status::OK(); +} + +} // namespace gradients +} // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/tape/tape_operation.h b/tensorflow/c/experimental/gradients/tape/tape_operation.h new file mode 100644 index 00000000000..b971176d9e7 --- /dev/null +++ b/tensorflow/c/experimental/gradients/tape/tape_operation.h @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_OPERATION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_OPERATION_H_ + +#include "tensorflow/c/eager/abstract_operation.h" +#include "tensorflow/c/eager/gradients.h" + +namespace tensorflow { +namespace gradients { +class TapeOperation : public AbstractOperation { + public: + explicit TapeOperation(AbstractOperation*, Tape*, const GradientRegistry&); + void Release() override; + Status Reset(const char* op, const char* raw_device_name) override; + const string& Name() const override; + const string& DeviceName() const override; + Status SetDeviceName(const char* name) override; + Status AddInput(AbstractTensorHandle* input) override; + Status AddInputList(absl::Span inputs) override; + Status Execute(absl::Span retvals, + int* num_retvals) override; + Status SetAttrString(const char* attr_name, const char* data, + size_t length) override; + Status SetAttrInt(const char* attr_name, int64_t value) override; + Status SetAttrFloat(const char* attr_name, float value) override; + Status SetAttrBool(const char* attr_name, bool value) override; + Status SetAttrType(const char* attr_name, DataType value) override; + Status SetAttrShape(const char* attr_name, const int64_t* dims, + const int num_dims) override; + Status SetAttrFunction(const char* attr_name, + const AbstractOperation* value) override; + Status SetAttrFunctionName(const char* attr_name, const char* value, + size_t length) override; + Status SetAttrTensor(const char* attr_name, + AbstractTensorInterface* tensor) override; + Status SetAttrStringList(const char* attr_name, const void* const* values, + const size_t* lengths, int num_values) override; + Status SetAttrFloatList(const char* attr_name, const float* values, + int num_values) override; + Status SetAttrIntList(const char* attr_name, const int64_t* values, + int num_values) override; + Status SetAttrTypeList(const char* attr_name, const DataType* values, + int num_values) override; + Status SetAttrBoolList(const 
char* attr_name, const unsigned char* values, + int num_values) override; + Status SetAttrShapeList(const char* attr_name, const int64_t** dims, + const int* num_dims, int num_values) override; + Status SetAttrFunctionList( + const char* attr_name, + absl::Span values) override; + AbstractOperation* GetBackingOperation(); + // For LLVM style RTTI. + static bool classof(const AbstractOperation* ptr) { + return ptr->getKind() == kTape; + } + ~TapeOperation() override; + + private: + AbstractOperation* parent_op_; + ForwardOperation forward_op_; + Tape* tape_; + const GradientRegistry& registry_; +}; + +} // namespace gradients +} // namespace tensorflow +#endif // TENSORFLOW_C_EXPERIMENTAL_GRADIENTS_TAPE_TAPE_OPERATION_H_ diff --git a/tensorflow/c/experimental/ops/BUILD b/tensorflow/c/experimental/ops/BUILD index c5810bffa48..d311a0c26db 100644 --- a/tensorflow/c/experimental/ops/BUILD +++ b/tensorflow/c/experimental/ops/BUILD @@ -1,3 +1,6 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental ops. These will eventually be replaced by machine-generated versions. package( licenses = ["notice"], # Apache 2.0 @@ -19,7 +22,7 @@ cc_library( "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/c/eager:tracing_utils", "//tensorflow/core/platform:errors", ], ) @@ -40,8 +43,8 @@ cc_library( "//tensorflow/c/eager:abstract_context", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/c/eager:tracing_utils", "//tensorflow/core:framework", - "//tensorflow/core/lib/llvm_rtti", "//tensorflow/core/platform:errors", ], ) @@ -61,7 +64,41 @@ cc_library( "//tensorflow/c/eager:abstract_operation", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:c_api_unified_internal", - "//tensorflow/core/lib/llvm_rtti", + "//tensorflow/c/eager:tracing_utils", "//tensorflow/core/platform:errors", ], ) + +cc_library( + name = "ops", + hdrs = [ + "array_ops.h", + "math_ops.h", + "nn_ops.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":array_ops", + ":math_ops", + ":nn_ops", + "//tensorflow/c/eager:abstract_context", + "//tensorflow/c/eager:abstract_operation", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:c_api_unified_internal", + ], +) + +filegroup( + name = "pywrap_required_hdrs", + srcs = [ + "array_ops.h", + "math_ops.h", + "nn_ops.h", + ], + visibility = [ + "//tensorflow/core:__pkg__", + "//tensorflow/python:__pkg__", + ], +) diff --git a/tensorflow/c/experimental/ops/array_ops.cc b/tensorflow/c/experimental/ops/array_ops.cc index df0f4639fbd..debeba18edf 100644 --- a/tensorflow/c/experimental/ops/array_ops.cc +++ b/tensorflow/c/experimental/ops/array_ops.cc @@ -14,9 +14,11 @@ limitations under the License. 
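Editor's sketch of how the TapeContext/TapeOperation wrappers above combine with the ops:: helpers changed below: wrap an existing context so that every op built through it is forwarded to the parent context and recorded on a Tape. Construction details (Tape's constructor, AbstractContextPtr) follow tensorflow/c/eager/gradients.h and abstract_context.h and should be treated as assumptions:

Status RunAddUnderTape(AbstractContext* ctx, const GradientRegistry& registry,
                       AbstractTensorHandle* a, AbstractTensorHandle* b,
                       absl::Span<AbstractTensorHandle*> outputs) {
  Tape tape(/*persistent=*/false);
  AbstractContextPtr tape_ctx(new TapeContext(ctx, &tape, registry));
  // ops::Add calls tape_ctx->CreateOperation(), so it receives a TapeOperation
  // whose Execute() records the forward op and its inputs on `tape`.
  return ops::Add(tape_ctx.get(), {a, b}, outputs, "Add");
}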
==============================================================================*/ #include "tensorflow/c/experimental/ops/array_ops.h" -#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/tracing_utils.h" #include "tensorflow/core/platform/errors.h" +using tensorflow::tracing::MaybeSetOpName; + namespace tensorflow { namespace ops { @@ -26,28 +28,58 @@ Status Identity(AbstractContext* ctx, AbstractOperationPtr identity_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR( identity_op->Reset("Identity", /*raw_device_name=*/nullptr)); - if (isa(identity_op.get())) { - TF_RETURN_IF_ERROR(dyn_cast(identity_op.get()) - ->SetOpName(name)); - } + TF_RETURN_IF_ERROR(MaybeSetOpName(identity_op.get(), name)); TF_RETURN_IF_ERROR(identity_op->AddInput(inputs[0])); int num_retvals = 1; return identity_op->Execute(outputs, &num_retvals); } +Status IdentityN(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr identity_n_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR( + identity_n_op->Reset("IdentityN", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(identity_n_op.get(), name)); + TF_RETURN_IF_ERROR(identity_n_op->AddInputList(inputs)); + int num_retvals = inputs.size(); + return identity_n_op->Execute(outputs, &num_retvals); +} + Status ZerosLike(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { AbstractOperationPtr z_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(z_op->Reset("ZerosLike", /*raw_device_name=*/nullptr)); - if (isa(z_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(z_op.get())->SetOpName(name)); - } + TF_RETURN_IF_ERROR(MaybeSetOpName(z_op.get(), name)); TF_RETURN_IF_ERROR(z_op->AddInput(inputs[0])); int num_retvals = 1; return z_op->Execute(outputs, &num_retvals); } +Status Shape(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr shape_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(shape_op->Reset("Shape", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(shape_op.get(), name)); + TF_RETURN_IF_ERROR(shape_op->AddInput(inputs[0])); // input + int num_retvals = 1; + TF_RETURN_IF_ERROR(shape_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status ExpandDims(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(op->Reset("ExpandDims", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(op.get(), name)); + TF_RETURN_IF_ERROR(op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(op->AddInput(inputs[1])); + int num_retvals = 1; + return op->Execute(outputs, &num_retvals); +} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/array_ops.h b/tensorflow/c/experimental/ops/array_ops.h index 8dc68db673f..f63412ed248 100644 --- a/tensorflow/c/experimental/ops/array_ops.h +++ b/tensorflow/c/experimental/ops/array_ops.h @@ -18,7 +18,6 @@ limitations under the License. 
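Editor's note on the repeated change above: MaybeSetOpName replaces the per-op isa<TracingOperation>/dyn_cast boilerplate that this patch deletes. A sketch of the shared helper, inferred from the removed blocks (the real definition lives behind //tensorflow/c/eager:tracing_utils, added to the deps above):

Status MaybeSetOpName(AbstractOperation* op, const char* op_name) {
  // Only tracing (graph-building) operations carry user-visible node names;
  // eager operations simply ignore the request.
  if (isa<tracing::TracingOperation>(op)) {
    TF_RETURN_IF_ERROR(
        dyn_cast<tracing::TracingOperation>(op)->SetOpName(op_name));
  }
  return Status::OK();
}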
#include "tensorflow/c/eager/abstract_context.h" #include "tensorflow/c/eager/abstract_operation.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" -#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" namespace tensorflow { namespace ops { @@ -27,10 +26,22 @@ Status Identity(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); +Status IdentityN(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + Status ZerosLike(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); +Status Shape(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status ExpandDims(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.cc b/tensorflow/c/experimental/ops/math_ops.cc index 82c2f0e8169..20aab8a77d3 100644 --- a/tensorflow/c/experimental/ops/math_ops.cc +++ b/tensorflow/c/experimental/ops/math_ops.cc @@ -16,22 +16,21 @@ limitations under the License. #include "tensorflow/c/eager/abstract_context.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" -#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/eager/tracing_utils.h" #include "tensorflow/c/experimental/ops/array_ops.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/errors.h" + +using tensorflow::tracing::MaybeSetOpName; + namespace tensorflow { namespace ops { -using tensorflow::tracing::TracingOperation; Status Mul(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { AbstractOperationPtr mul_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(mul_op->Reset("Mul", /*raw_device_name=*/nullptr)); - if (isa(mul_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(mul_op.get())->SetOpName(name)); - } + TF_RETURN_IF_ERROR(MaybeSetOpName(mul_op.get(), name)); TF_RETURN_IF_ERROR(mul_op->AddInput(inputs[0])); TF_RETURN_IF_ERROR(mul_op->AddInput(inputs[1])); int num_retvals = 1; @@ -55,12 +54,7 @@ Status Add(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { AbstractOperationPtr add_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(add_op->Reset("AddV2", /*raw_device_name=*/nullptr)); - - if (isa(add_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(add_op.get())->SetOpName(name)); - } - + TF_RETURN_IF_ERROR(MaybeSetOpName(add_op.get(), name)); TF_RETURN_IF_ERROR(add_op->AddInput(inputs[0])); TF_RETURN_IF_ERROR(add_op->AddInput(inputs[1])); @@ -69,18 +63,26 @@ Status Add(AbstractContext* ctx, absl::Span inputs, return Status::OK(); } +Status Sub(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sub_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sub_op->Reset("Sub", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(sub_op.get(), name)); + TF_RETURN_IF_ERROR(sub_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(sub_op->AddInput(inputs[1])); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(sub_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + Status MatMul(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name, bool transpose_a = false, bool transpose_b = false) { AbstractOperationPtr matmul_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(matmul_op->Reset("MatMul", /*raw_device_name=*/nullptr)); - - if (isa(matmul_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(matmul_op.get())->SetOpName(name)); - 
} - + TF_RETURN_IF_ERROR(MaybeSetOpName(matmul_op.get(), name)); TF_RETURN_IF_ERROR(matmul_op->AddInput(inputs[0])); TF_RETURN_IF_ERROR(matmul_op->AddInput(inputs[1])); @@ -96,15 +98,79 @@ Status Neg(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { AbstractOperationPtr neg_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(neg_op->Reset("Neg", /*raw_device_name=*/nullptr)); - if (isa(neg_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(neg_op.get())->SetOpName(name)); - } + TF_RETURN_IF_ERROR(MaybeSetOpName(neg_op.get(), name)); TF_RETURN_IF_ERROR(neg_op->AddInput(inputs[0])); int num_retvals = 1; return neg_op->Execute(outputs, &num_retvals); } +Status Sum(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sum_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sum_op->Reset("Sum", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(sum_op.get(), name)); + TF_RETURN_IF_ERROR(sum_op->AddInput(inputs[0])); // input_vals + TF_RETURN_IF_ERROR(sum_op->AddInput(inputs[1])); // reduction_indices + + int num_retvals = 1; + TF_RETURN_IF_ERROR(sum_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + +Status DivNoNan(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr div_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(div_op->Reset("DivNoNan", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(div_op.get(), name)); + TF_RETURN_IF_ERROR(div_op->AddInput(inputs[0])); // x + TF_RETURN_IF_ERROR(div_op->AddInput(inputs[1])); // y + + int num_retvals = 1; + TF_RETURN_IF_ERROR(div_op->Execute( + outputs, &num_retvals)); // z = x / y, (z_i = 0 if y_i = 0) + return Status::OK(); +} + +Status Exp(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr exp_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(exp_op->Reset("Exp", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(exp_op.get(), name)); + TF_RETURN_IF_ERROR(exp_op->AddInput(inputs[0])); + + int num_retvals = 1; + return exp_op->Execute(outputs, &num_retvals); +} + +Status Sqrt(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sqrt_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(sqrt_op->Reset("Sqrt", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(sqrt_op.get(), name)); + TF_RETURN_IF_ERROR(sqrt_op->AddInput(inputs[0])); + + int num_retvals = 1; + Status s = sqrt_op->Execute(outputs, &num_retvals); + return s; +} + +Status SqrtGrad(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr sqrt_grad_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR( + sqrt_grad_op->Reset("SqrtGrad", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(sqrt_grad_op.get(), name)); + TF_RETURN_IF_ERROR(sqrt_grad_op->AddInput(inputs[0])); + TF_RETURN_IF_ERROR(sqrt_grad_op->AddInput(inputs[1])); + + int num_retvals = 1; + Status s = sqrt_grad_op->Execute(outputs, &num_retvals); + return s; +} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/math_ops.h b/tensorflow/c/experimental/ops/math_ops.h index ed1e6c5b3d6..7051e38656f 100644 --- a/tensorflow/c/experimental/ops/math_ops.h +++ b/tensorflow/c/experimental/ops/math_ops.h @@ -22,18 +22,43 @@ namespace tensorflow { namespace ops { Status Mul(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, 
const char* name); + Status Conj(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); + Status Add(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); + Status MatMul(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name, bool transpose_a, bool transpose_b); + Status Neg(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); +Status Sum(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status Sub(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status DivNoNan(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status Exp(AbstractContext* ctx, absl::Span inputs, + absl::Span outputs, const char* name); + +Status Sqrt(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + +Status SqrtGrad(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/nn_ops.cc b/tensorflow/c/experimental/ops/nn_ops.cc index 8f5f550bb8b..6a97dbf0939 100644 --- a/tensorflow/c/experimental/ops/nn_ops.cc +++ b/tensorflow/c/experimental/ops/nn_ops.cc @@ -15,24 +15,22 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/c/experimental/ops/nn_ops.h" +#include "tensorflow/c/eager/tracing_utils.h" #include "tensorflow/core/platform/errors.h" +using tensorflow::tracing::MaybeSetOpName; + namespace tensorflow { namespace ops { // Softmax Loss given scores and labels, used by the SoftMaxLossGradient -Status SparseSoftmaxCrossEntropyLoss( +Status SparseSoftmaxCrossEntropyWithLogits( AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name) { AbstractOperationPtr sm_loss_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR(sm_loss_op->Reset("SparseSoftmaxCrossEntropyWithLogits", /*raw_device_name=*/nullptr)); - - if (isa(sm_loss_op.get())) { - TF_RETURN_IF_ERROR( - dyn_cast(sm_loss_op.get())->SetOpName(name)); - } - + TF_RETURN_IF_ERROR(MaybeSetOpName(sm_loss_op.get(), name)); TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[0])); // input scores TF_RETURN_IF_ERROR(sm_loss_op->AddInput(inputs[1])); // labels @@ -49,12 +47,7 @@ Status ReluGrad(AbstractContext* ctx, AbstractOperationPtr relugrad_op(ctx->CreateOperation()); TF_RETURN_IF_ERROR( relugrad_op->Reset("ReluGrad", /*raw_device_name=*/nullptr)); - - if (isa(relugrad_op.get())) { - TF_RETURN_IF_ERROR(dyn_cast(relugrad_op.get()) - ->SetOpName(name)); - } - + TF_RETURN_IF_ERROR(MaybeSetOpName(relugrad_op.get(), name)); TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[0])); // upstream grads TF_RETURN_IF_ERROR(relugrad_op->AddInput(inputs[1])); // relu inputs @@ -63,5 +56,18 @@ Status ReluGrad(AbstractContext* ctx, return Status::OK(); } +Status Relu(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name) { + AbstractOperationPtr relu_op(ctx->CreateOperation()); + TF_RETURN_IF_ERROR(relu_op->Reset("Relu", /*raw_device_name=*/nullptr)); + TF_RETURN_IF_ERROR(MaybeSetOpName(relu_op.get(), name)); + TF_RETURN_IF_ERROR(relu_op->AddInput(inputs[0])); + + int num_retvals = 1; + TF_RETURN_IF_ERROR(relu_op->Execute(outputs, &num_retvals)); + return Status::OK(); +} + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/ops/nn_ops.h b/tensorflow/c/experimental/ops/nn_ops.h index 
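As a usage sketch (editorial, not part of this change): these wrappers are thin builders around a single eager op, and callers drive them by supplying an output span and propagating the returned Status. The helper below is hypothetical and assumes the AbstractTensorHandle span convention and the includes already used elsewhere in this API:

Status RunRelu(AbstractContext* ctx, AbstractTensorHandle* features,
               AbstractTensorHandle** activations) {
  // Relu produces a single output handle.
  std::vector<AbstractTensorHandle*> outputs(1);
  TF_RETURN_IF_ERROR(ops::Relu(ctx, /*inputs=*/{features},
                               absl::MakeSpan(outputs), "relu_example"));
  *activations = outputs[0];
  return Status::OK();
}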
3e618b00869..3c0e04579a1 100644 --- a/tensorflow/c/experimental/ops/nn_ops.h +++ b/tensorflow/c/experimental/ops/nn_ops.h @@ -18,12 +18,11 @@ limitations under the License. #include "tensorflow/c/eager/abstract_operation.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" -#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" namespace tensorflow { namespace ops { -Status SparseSoftmaxCrossEntropyLoss( +Status SparseSoftmaxCrossEntropyWithLogits( AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); @@ -31,6 +30,10 @@ Status ReluGrad(AbstractContext* ctx, absl::Span inputs, absl::Span outputs, const char* name); +Status Relu(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, const char* name); + } // namespace ops } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/BUILD b/tensorflow/c/experimental/saved_model/core/BUILD index 2feb7c1b33e..4cf868e4714 100644 --- a/tensorflow/c/experimental/saved_model/core/BUILD +++ b/tensorflow/c/experimental/saved_model/core/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Experimental SavedModel C APIs for TensorFlow. See RFC # https://github.com/tensorflow/community/pull/207 # Targets in this directory are pure C++ "Classes" underlying the C API types @@ -62,13 +64,21 @@ cc_library( ":function_metadata", "//tensorflow/c:tf_tensor_internal", "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/experimental/saved_model/core/revived_types:asset", "//tensorflow/c/experimental/saved_model/core/revived_types:constant", + "//tensorflow/c/experimental/saved_model/core/revived_types:partially_revived_objects", + "//tensorflow/c/experimental/saved_model/core/revived_types:restored_resource_revival_state", "//tensorflow/c/experimental/saved_model/core/revived_types:tf_concrete_function", + "//tensorflow/c/experimental/saved_model/core/revived_types:tf_concrete_function_revival_state", + "//tensorflow/c/experimental/saved_model/core/revived_types:tf_signature_def_function_revival_state", "//tensorflow/c/experimental/saved_model/core/revived_types:variable", + "//tensorflow/cc/saved_model:loader_util", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", ], ) @@ -81,15 +91,24 @@ cc_library( ":signature_def_function_metadata", "//tensorflow/c/eager:immediate_execution_operation", "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/types:span", ], ) cc_library( name = "signature_def_function_metadata", + srcs = [ + "signature_def_function_metadata.cc", + ], hdrs = [ "signature_def_function_metadata.h", ], + deps = [ + ":tensor_spec", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], ) cc_library( @@ -138,11 +157,13 @@ cc_library( ":saved_model_api", ":saved_model_utils", ":signature_def_function", - "//tensorflow/c:tensor_interface", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_tensor_handle", "//tensorflow/c/experimental/saved_model/core/ops:restore_ops", "//tensorflow/c/experimental/saved_model/core/revived_types:constant", + "//tensorflow/c/experimental/saved_model/core/revived_types:flat_tensor_function", + 
"//tensorflow/c/experimental/saved_model/core/revived_types:partially_revived_objects", + "//tensorflow/c/experimental/saved_model/core/revived_types:revived_objects", "//tensorflow/c/experimental/saved_model/core/revived_types:tensorhandle_convertible", "//tensorflow/c/experimental/saved_model/core/revived_types:tf_concrete_function", "//tensorflow/c/experimental/saved_model/core/revived_types:variable", @@ -151,7 +172,6 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/common_runtime/eager:tensor_handle", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -213,6 +233,7 @@ tf_cc_test( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:core", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -256,6 +277,20 @@ tf_cc_test( ], ) +cc_library( + name = "tensor_spec", + srcs = [ + "tensor_spec.cc", + ], + hdrs = [ + "tensor_spec.h", + ], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + ], +) + tf_cc_test( name = "tf_concrete_function_loading_test", srcs = [ diff --git a/tensorflow/c/experimental/saved_model/core/concrete_function.h b/tensorflow/c/experimental/saved_model/core/concrete_function.h index 934fa6d2bda..48a20ef7768 100644 --- a/tensorflow/c/experimental/saved_model/core/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/concrete_function.h @@ -43,8 +43,8 @@ class ConcreteFunction { virtual ~ConcreteFunction() = default; // This method returns the "Call" Op used to execute the function. - virtual Status GetCallOp(absl::Span inputs, - ImmediateOpPtr* out) = 0; + virtual Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const = 0; virtual const FunctionMetadata& GetFunctionMetadata() const = 0; }; diff --git a/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc b/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc index 1c70d40cada..d179d0de6b7 100644 --- a/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc +++ b/tensorflow/c/experimental/saved_model/core/object_graph_traversal_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_utils.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stringpiece.h" @@ -300,80 +301,70 @@ nodes { TEST(ObjectGraphTraversalTest, Success) { SavedObjectGraph object_graph = ParseSavedObjectGraph(kSingleChildFoo); - const SavedObject* obj = internal::FindNodeAtPath("foo", object_graph); - ASSERT_NE(nullptr, obj); - EXPECT_EQ(obj->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(obj->user_object().identifier(), "_generic_user_object"); + absl::optional node = internal::FindNodeAtPath("foo", object_graph); + ASSERT_TRUE(node.has_value()); + EXPECT_EQ(*node, 1); } TEST(ObjectGraphTraversalTest, ObjectNotFound) { SavedObjectGraph object_graph = ParseSavedObjectGraph(kSingleChildFoo); - const SavedObject* obj = internal::FindNodeAtPath("bar", object_graph); - EXPECT_EQ(nullptr, obj); + absl::optional node = internal::FindNodeAtPath("bar", object_graph); + EXPECT_FALSE(node.has_value()); } TEST(ObjectGraphTraversalTest, CaseSensitiveMismatch) { SavedObjectGraph object_graph = ParseSavedObjectGraph(kSingleChildFoo); - const SavedObject* obj = internal::FindNodeAtPath("FOO", object_graph); - EXPECT_EQ(nullptr, obj); + absl::optional node = internal::FindNodeAtPath("FOO", object_graph); + EXPECT_FALSE(node.has_value()); } TEST(ObjectGraphTraversalTest, NestedObjectFound) { SavedObjectGraph object_graph = ParseSavedObjectGraph(kSingleChildFooWithFuncBar); - const SavedObject* obj = internal::FindNodeAtPath("foo.bar", object_graph); - ASSERT_NE(nullptr, obj); - EXPECT_EQ(obj->kind_case(), SavedObject::kFunction); - EXPECT_EQ(obj->function().concrete_functions_size(), 1); - EXPECT_EQ(obj->function().concrete_functions(0), "__inference_my_func_5"); + absl::optional node = internal::FindNodeAtPath("foo.bar", object_graph); + ASSERT_TRUE(node.has_value()); + EXPECT_EQ(*node, 2); } TEST(ObjectGraphTraversalTest, MultiplePathsAliasSameObject) { SavedObjectGraph object_graph = ParseSavedObjectGraph(kMultiplePathsToChild); - const SavedObject* foo_baz = + absl::optional foo_baz_node = internal::FindNodeAtPath("foo.baz", object_graph); - ASSERT_NE(nullptr, foo_baz); - EXPECT_EQ(foo_baz->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(foo_baz->user_object().identifier(), "_generic_user_object"); + ASSERT_TRUE(foo_baz_node.has_value()); + EXPECT_EQ(*foo_baz_node, 4); - const SavedObject* bar_wombat = + absl::optional bar_wombat_node = internal::FindNodeAtPath("bar.wombat", object_graph); - ASSERT_NE(nullptr, bar_wombat); - EXPECT_EQ(bar_wombat->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(bar_wombat->user_object().identifier(), "_generic_user_object"); + ASSERT_TRUE(bar_wombat_node.has_value()); + EXPECT_EQ(*bar_wombat_node, 4); - EXPECT_EQ(foo_baz, bar_wombat); + EXPECT_EQ(*foo_baz_node, *bar_wombat_node); } TEST(ObjectGraphTraversalTest, CyclesAreOK) { SavedObjectGraph object_graph = ParseSavedObjectGraph(kCycleBetweenParentAndChild); - const SavedObject* foo = internal::FindNodeAtPath("foo", object_graph); - ASSERT_NE(nullptr, foo); - EXPECT_EQ(foo->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(foo->user_object().identifier(), "_generic_user_object"); + absl::optional foo = internal::FindNodeAtPath("foo", object_graph); + ASSERT_TRUE(foo.has_value()); + EXPECT_EQ(*foo, 1); - const SavedObject* foo_bar = + absl::optional foo_bar = 
internal::FindNodeAtPath("foo.bar", object_graph); - ASSERT_NE(nullptr, foo_bar); - EXPECT_EQ(foo_bar->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(foo_bar->user_object().identifier(), "_generic_user_object"); + ASSERT_TRUE(foo_bar.has_value()); + EXPECT_EQ(*foo_bar, 3); - const SavedObject* foo_bar_parent = + absl::optional foo_bar_parent = internal::FindNodeAtPath("foo.bar.parent", object_graph); - ASSERT_NE(nullptr, foo_bar_parent); - EXPECT_EQ(foo_bar_parent->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(foo_bar_parent->user_object().identifier(), "_generic_user_object"); + ASSERT_TRUE(foo_bar_parent.has_value()); + EXPECT_EQ(*foo_bar_parent, 1); - const SavedObject* foo_bar_parent_bar = + absl::optional foo_bar_parent_bar = internal::FindNodeAtPath("foo.bar.parent.bar", object_graph); - ASSERT_NE(nullptr, foo_bar_parent_bar); - EXPECT_EQ(foo_bar_parent_bar->kind_case(), SavedObject::kUserObject); - EXPECT_EQ(foo_bar_parent_bar->user_object().identifier(), - "_generic_user_object"); + ASSERT_TRUE(foo_bar_parent_bar.has_value()); + EXPECT_EQ(*foo_bar_parent_bar, 3); - EXPECT_EQ(foo, foo_bar_parent); - EXPECT_EQ(foo_bar, foo_bar_parent_bar); + EXPECT_EQ(*foo, *foo_bar_parent); + EXPECT_EQ(*foo_bar, *foo_bar_parent_bar); } } // namespace diff --git a/tensorflow/c/experimental/saved_model/core/ops/BUILD b/tensorflow/c/experimental/saved_model/core/ops/BUILD index 673ea1a80e2..549980b03e9 100644 --- a/tensorflow/c/experimental/saved_model/core/ops/BUILD +++ b/tensorflow/c/experimental/saved_model/core/ops/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # This package contains written convenience helpers for Eager Operations # used by SavedModel. Once we autogenerate C++ Eager Op wrappers, we can remove these. load( diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD index 2b883618c87..ac168830a0e 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD +++ b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # This package contains classes corresponding to Revived SavedObjectGraph types # used by SavedModel. 
See https://cs.opensource.google/tensorflow/tensorflow/+/c575e2ba93c442121d98d3f125d83fed1339924d:tensorflow/core/protobuf/saved_object_graph.proto;l=56-62 package( @@ -8,6 +10,25 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "asset", + srcs = [ + "asset.cc", + ], + hdrs = [ + "asset.h", + ], + deps = [ + ":tensorhandle_convertible", + "//tensorflow/c:tensor_interface", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/cc/saved_model:constants", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + cc_library( name = "constant", srcs = [ @@ -28,6 +49,106 @@ cc_library( ], ) +cc_library( + name = "flat_tensor_function", + srcs = [ + "flat_tensor_function.cc", + ], + hdrs = [ + "flat_tensor_function.h", + ], + deps = [ + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:context", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "partially_revived_objects", + srcs = [ + "partially_revived_objects.cc", + ], + hdrs = [ + "partially_revived_objects.h", + ], + deps = [ + ":asset", + ":constant", + ":restored_resource", + ":restored_resource_revival_state", + ":revived_objects", + ":tf_concrete_function", + ":tf_concrete_function_revival_state", + ":tf_signature_def_function", + ":tf_signature_def_function_revival_state", + ":variable", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + "//tensorflow/c/experimental/saved_model/core:tensor_spec", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "restored_resource", + srcs = [ + "restored_resource.cc", + ], + hdrs = [ + "restored_resource.h", + ], + deps = [ + ":tensorhandle_convertible", + ":tf_concrete_function", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/core:lib", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "restored_resource_revival_state", + hdrs = [ + "restored_resource_revival_state.h", + ], + deps = [ + ":tf_concrete_function_revival_state", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + ], +) + +cc_library( + name = "revived_objects", + hdrs = [ + "revived_objects.h", + ], + deps = [ + ":asset", + ":constant", + ":restored_resource", + ":tf_concrete_function", + ":tf_signature_def_function", + ":variable", + "//tensorflow/core:lib", + ], +) + cc_library( name = "variable", srcs = [ @@ -45,6 +166,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/common_runtime/eager:tensor_handle", + "//tensorflow/core/lib/llvm_rtti", "@com_google_absl//absl/types:optional", ], ) @@ -68,7 +191,7 @@ 
cc_library( "tf_concrete_function.h", ], deps = [ - ":tensorhandle_convertible", + ":flat_tensor_function", "//tensorflow/c/eager:abstract_tensor_handle", "//tensorflow/c/eager:immediate_execution_context", "//tensorflow/c/eager:immediate_execution_operation", @@ -81,3 +204,55 @@ cc_library( "@com_google_absl//absl/types:span", ], ) + +cc_library( + name = "tf_concrete_function_revival_state", + hdrs = [ + "tf_concrete_function_revival_state.h", + ], + deps = [ + ":tf_concrete_function", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "tf_signature_def_function", + srcs = [ + "tf_signature_def_function.cc", + ], + hdrs = [ + "tf_signature_def_function.h", + ], + deps = [ + ":flat_tensor_function", + "//tensorflow/c/eager:abstract_tensor_handle", + "//tensorflow/c/eager:immediate_execution_context", + "//tensorflow/c/eager:immediate_execution_operation", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/saved_model/core:signature_def_function", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/common_runtime/eager:context", + "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "tf_signature_def_function_revival_state", + hdrs = [ + "tf_signature_def_function_revival_state.h", + ], + deps = [ + ":tf_signature_def_function", + "//tensorflow/c/eager:immediate_execution_tensor_handle", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/types:optional", + ], +) diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc b/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc new file mode 100644 index 00000000000..5cc14d615f5 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" + +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/path.h" + +namespace tensorflow { + +Asset::Asset(ImmediateTensorHandlePtr handle) + : TensorHandleConvertible(std::move(handle)) {} + +Status Asset::Create(ImmediateExecutionContext* ctx, + const std::string& saved_model_dir, + const std::string& asset_filename, + std::unique_ptr* output) { + std::string abs_path = + io::JoinPath(saved_model_dir, kSavedModelAssetsDirectory, asset_filename); + AbstractTensorPtr tensor(ctx->CreateStringScalar(abs_path)); + if (tensor.get() == nullptr) { + return errors::Internal( + "Failed to create scalar string tensor for Asset at path ", abs_path); + } + + ImmediateTensorHandlePtr handle(ctx->CreateLocalHandle(tensor.get())); + output->reset(new Asset(std::move(handle))); + return Status(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/asset.h b/tensorflow/c/experimental/saved_model/core/revived_types/asset.h new file mode 100644 index 00000000000..c98bd9b5628 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/asset.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_ASSET_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_ASSET_H_ + +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/tensor_interface.h" +#include "tensorflow/core/framework/tensor.pb.h" + +namespace tensorflow { + +class Asset : public TensorHandleConvertible { + public: + static Status Create(ImmediateExecutionContext* ctx, + const std::string& saved_model_dir, + const std::string& asset_filename, + std::unique_ptr* output); + + // Asset is movable, but not copyable. 
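  //
  // Usage sketch (illustrative only; "vocab.txt" is a hypothetical asset
  // filename): the loader creates an Asset per SavedModel asset file and can
  // read the resulting string scalar back through the inherited
  // TensorHandleConvertible::handle() accessor:
  //
  //   std::unique_ptr<Asset> asset;
  //   TF_RETURN_IF_ERROR(
  //       Asset::Create(ctx, saved_model_dir, "vocab.txt", &asset));
  //   ImmediateExecutionTensorHandle* path_tensor = asset->handle();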
+ Asset(Asset&& other) = default; + Asset& operator=(Asset&& other) = default; + + ~Asset() override = default; + + private: + explicit Asset(ImmediateTensorHandlePtr handle); + Asset(const Asset&) = delete; + Asset& operator=(const Asset&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_ASSET_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc new file mode 100644 index 00000000000..59f7306fedc --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.cc @@ -0,0 +1,91 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +FlatTensorFunction::FlatTensorFunction( + const std::string& name, std::vector captures, + ImmediateExecutionContext* ctx) + : name_(name), captures_(std::move(captures)), ctx_(ctx) {} + +FlatTensorFunction::~FlatTensorFunction() { + Status status = ctx_->RemoveFunction(name_); + if (!status.ok()) { + LOG(ERROR) << "Failed to remove functiondef " << name_ << ". 
" + << status.error_message(); + } +} + +Status FlatTensorFunction::Create( + const FunctionDef* function_def, + std::vector captures, + ImmediateExecutionContext* ctx, std::unique_ptr* out) { + TF_RETURN_IF_ERROR(ctx->AddFunctionDef(*function_def)); + std::vector owned_captures; + owned_captures.reserve(captures.size()); + for (ImmediateExecutionTensorHandle* capture : captures) { + capture->Ref(); + owned_captures.push_back(ImmediateTensorHandlePtr(capture)); + } + + out->reset(new FlatTensorFunction(function_def->signature().name(), + std::move(owned_captures), ctx)); + return Status(); +} + +Status FlatTensorFunction::MakeCallOp( + absl::Span inputs, ImmediateOpPtr* out) const { + out->reset(ctx_->CreateOperation()); + // In eager mode, TF2 python executes functions by constructing an op with + // the name of the functiondef: + // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L545 + // In graph mode, we create a PartitionedCallOp instead: + // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L573 + + // TODO(bmzhao): After discussing with Allen, we should execute this via a + // PartitionedCallOp for compatibility with "tooling that assumes functions in + // graphs are PartitionedCallOps". + TF_RETURN_IF_ERROR((*out)->Reset(name_.c_str(), nullptr)); + + // Adding the user-provided inputs to the function. + TF_RETURN_IF_ERROR((*out)->AddInputList(inputs)); + + absl::Span captures( + reinterpret_cast(captures_.data()), + captures_.size()); + + // Adding the captures of the function. + TF_RETURN_IF_ERROR((*out)->AddInputList(captures)); + return Status(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h new file mode 100644 index 00000000000..a6769d323b4 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ + +#include +#include +#include +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// FlatTensorFunction models a TF2 eager runtime view of a callable function, +// taking + returning flat lists of tensors, including any captures. 
+// Effectively, it is a thin wrapper around a FunctionDef owned by the +// EagerContext, and any TensorHandle captures associated with the function. The +// MakeCallOp method handles the logic of marshaling captures after the user +// provided inputs automatically. +// Note(bmzhao): This class is mainly intended to house low-level reusable +// function logic between SignatureDefFunction and ConcreteFunction, which +// present higher level interfaces. This type does *not* hold any "function +// metadata". +class FlatTensorFunction { + public: + // Factory for creating a FlatTensorFunction. + // + // Params: + // function_def - The function_def associated with the created + // FlatTensorFunction. FlatTensorFunction will register this + // function_def with `ctx` on creation, and de-register it on + // destruction. function_def must be non-null, but + // otherwise has no lifetime requirements. + // captures - The captured TensorHandles associated with this + // FlatTensorFunction. FlatTensorFunction will participate in + // ownership of the handles (it explicitly increments the refcount + // of each handle, and will decrement them on destruction). + // ctx - A handle to the Tensorflow runtime. This MUST be non-null and + // outlive TFConcreteFunction. + // out - The output FlatTensorFunction. + static Status Create(const FunctionDef* function_def, + std::vector captures, + ImmediateExecutionContext* ctx, + std::unique_ptr* out); + + // This method creates a "Call" Op used to execute the function. + Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const; + + ~FlatTensorFunction(); + + private: + FlatTensorFunction(const std::string& name, + std::vector captures, + ImmediateExecutionContext* ctx); + + FlatTensorFunction(const FlatTensorFunction&) = delete; + FlatTensorFunction& operator=(const FlatTensorFunction&) = delete; + + // Name of the FunctionDef corresponding to this TFConcreteFunction + std::string name_; + std::vector captures_; + ImmediateExecutionContext* ctx_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_FLAT_TENSOR_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc new file mode 100644 index 00000000000..1c615405644 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc @@ -0,0 +1,543 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h" + +#include +#include +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +namespace { + +using StructuredValueDictEntry = + protobuf::MapPair; + +using NamedParamMap = + gtl::FlatMap; + +Status AssertAllCreateResourceFunctionsHaveNoCaptures( + const PartiallyRevivedObjects& objects) { + for (const auto& id_and_resource : objects.restored_resources) { + int node_id = id_and_resource.first; + const RestoredResourceRevivalState& resource = id_and_resource.second; + const TFConcreteFunctionRevivalState* create_resource_fn = + resource.create_resource; + if (create_resource_fn == nullptr) { + return errors::FailedPrecondition( + "Resource at node ", node_id, + " did not have a create_resource() function"); + } + const SavedConcreteFunction* saved_create_resource_fn = + create_resource_fn->saved_concrete_func; + if (!saved_create_resource_fn->bound_inputs().empty()) { + // TODO(b/124045874): Support loading resource functions via a top sort + return errors::Unimplemented( + "Create Resource functions with captures are currently unsupported."); + } + } + return Status(); +} + +// Retrieves the TensorHandle associated with `node_id` from `obj_graph`, and +// set `*handle` to point to it. 
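// For example (illustrative; node id 3 is hypothetical), if node 3 of the
// SavedObjectGraph is a variable that has already been revived into
// `objects.variables`, the call below points `capture` at that variable's
// backing tensor handle; constants, assets, and restored resources are
// resolved analogously from their respective maps:
//
//   ImmediateExecutionTensorHandle* capture = nullptr;
//   TF_RETURN_IF_ERROR(
//       TensorHandleFromNode(/*node_id=*/3, obj_graph, objects, &capture));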
+Status TensorHandleFromNode(int node_id, const SavedObjectGraph& obj_graph, + const PartiallyRevivedObjects& objects, + ImmediateExecutionTensorHandle** handle) { + const SavedObject& node = obj_graph.nodes(node_id); + SavedObject::KindCase kind = node.kind_case(); + switch (kind) { + case SavedObject::kVariable: { + const auto& variables_iter = objects.variables.find(node_id); + if (variables_iter == objects.variables.end()) { + return errors::FailedPrecondition( + "Tried to convert node id ", node_id, + " of type variable to tensor but the variable wasn't initialized"); + } + *handle = variables_iter->second->handle(); + return Status(); + } + case SavedObject::kConstant: { + const auto& constants_iter = objects.constants.find(node_id); + if (constants_iter == objects.constants.end()) { + return errors::FailedPrecondition("Tried to convert node id ", node_id, + " of type constant to tensor but the " + "constant wasn't initialized"); + } + *handle = constants_iter->second->handle(); + return Status(); + } + case SavedObject::kAsset: { + const auto& assets_iter = objects.assets.find(node_id); + if (assets_iter == objects.assets.end()) { + return errors::FailedPrecondition( + "Tried to convert node id ", node_id, + " of type asset to tensor but the asset wasn't initialized"); + } + *handle = assets_iter->second->handle(); + return Status(); + } + case SavedObject::kResource: { + const auto& resource_iter = objects.restored_resources.find(node_id); + if (resource_iter == objects.restored_resources.end()) { + return errors::FailedPrecondition( + "Tried to convert node id ", node_id, + " of type Resource to tensor but the Resource wasn't initialized"); + } + const RestoredResourceRevivalState& resource = resource_iter->second; + if (resource.resource_handle == nullptr) { + return errors::FailedPrecondition( + "Resource with node id ", node_id, + " should have its resource_handle created, but was nullptr."); + } + *handle = resource.resource_handle.get(); + return Status(); + } + default: { + return errors::FailedPrecondition( + "Only objects of type variable, constant, asset, and resources have " + "capturable tensorhandles. Encountered object of kind ", + node.kind_case(), " at node id: ", node_id); + } + } +} + +std::vector SignatureDefParamsFromNamedParamMap( + const NamedParamMap& params) { + // The underlying functiondef associated with the SignatureDef has + // nest.flattened inputs and outputs, which are sorted by string key. + std::vector result; + result.reserve(params.size()); + for (const auto& named_param : params) { + result.push_back(SignatureDefParam(std::string(named_param.first), + TensorSpec(*named_param.second))); + } + std::sort(result.begin(), result.end(), + [](const SignatureDefParam& x, const SignatureDefParam& y) { + return x.name() < y.name(); + }); + + return result; +} + +// SignatureDefArgsFromInputs takes the "canonicalized_input_signature" +// field of a SavedConcreteFunction, ensures it conforms to the structure of +// tuple(tuple(), dict()), and "returns" a list of +// SignatureDefParams of the SignatureDefFunction's arguments. +Status SignatureDefArgsFromInputs( + const StructuredValue& canonicalized_input_signature, + std::vector* out) { + // Note(bmzhao): canonicalized_input_signature should be a tuple of + // (args, kwargs), where args is an empty tuple, and kwargs is a dictionary of + // string keys to TensorSpecs. 
+ if (!canonicalized_input_signature.has_tuple_value()) { + return errors::FailedPrecondition( + "SignatureDefFunction's canonicalized_input_signature should be " + "of form tuple(tuple(), dict()), but was instead: \n", + canonicalized_input_signature.DebugString()); + } + + const TupleValue& args_kwargs_tuple = + canonicalized_input_signature.tuple_value(); + if (args_kwargs_tuple.values_size() != 2) { + return errors::FailedPrecondition( + "SignatureDefFunction's canonicalized_input_signature should be " + "a tuple of two elements (args, kwargs), but was instead: \n", + args_kwargs_tuple.DebugString()); + } + + const StructuredValue& args = args_kwargs_tuple.values(0); + if (!args.has_tuple_value() || !args.tuple_value().values().empty()) { + return errors::FailedPrecondition( + "SignatureDefFunction's canonicalized_input_signature's args" + "should be an empty tuple, but instead got: \n", + args.DebugString()); + } + + const StructuredValue& kwargs = args_kwargs_tuple.values(1); + if (!kwargs.has_dict_value()) { + return errors::FailedPrecondition( + "SignatureDefFunction's canonicalized_input_signature's kwargs" + "should be a dictionary, but instead got: \n", + kwargs.DebugString()); + } + + const DictValue& kwargs_dict = kwargs.dict_value(); + NamedParamMap result; + result.reserve(kwargs_dict.fields_size()); + + for (const auto& key_value : kwargs_dict.fields()) { + const std::string& key = key_value.first; + const StructuredValue& value = key_value.second; + if (!value.has_tensor_spec_value()) { + return errors::FailedPrecondition( + "SignatureDefFunction's canonicalized_input_signature's kwargs" + "dictionary contained a non-tensorspec value for key-value pair: \n", + "Key: ", key, "Value: \n", value.DebugString()); + } + result[key] = &value.tensor_spec_value(); + } + + *out = SignatureDefParamsFromNamedParamMap(result); + + return Status(); +} + +// SignatureDefReturnsFromOutputs takes the "output_signature" field of a +// SavedConcreteFunction, ensures it conforms to the structure of +// dict(), and "returns" a list of SignatureDefParams of the +// SignatureDefFunction's returns. 
+Status SignatureDefReturnsFromOutputs(const StructuredValue& output_signature, + std::vector* out) { + if (!output_signature.has_dict_value()) { + return errors::FailedPrecondition( + "SignatureDefFunction's output_signature must be a dictionary, but " + "instead got: ", + output_signature.DebugString()); + } + + const DictValue& output_dict = output_signature.dict_value(); + NamedParamMap result; + result.reserve(output_dict.fields_size()); + + for (const auto& key_value : output_dict.fields()) { + const std::string& key = key_value.first; + const StructuredValue& value = key_value.second; + if (!value.has_tensor_spec_value()) { + return errors::FailedPrecondition( + "SignatureDefFunction's output_signature dictionary contained a " + "non-tensorspec value for key-value pair: \n", + "Key: ", key, "Value: \n", value.DebugString()); + } + result[key] = &value.tensor_spec_value(); + } + *out = SignatureDefParamsFromNamedParamMap(result); + + return Status(); +} + +// The implementation takes advantage of the fact that SignatureDefFunction's +// "traced" Signature wrapper function always has inputs/outputs of dictionaries +// https://github.com/tensorflow/tensorflow/blob/53cdd5e87c423b195f33775753273286fd5a1a65/tensorflow/python/saved_model/signature_serialization.py#L119-L126 +// https://github.com/tensorflow/tensorflow/blob/53cdd5e87c423b195f33775753273286fd5a1a65/tensorflow/python/saved_model/signature_serialization.py#L153-L178 +// Additionally, we take advantage of the fact that the SignatureDefFunction's +// associated functiondef has lexicographically ordered inputs/outputs due to +// nest.flatten. +Status LoadSignatureDefFunctionMetadata( + const SavedConcreteFunction& saved_concrete_function, + SignatureDefFunctionMetadata* out) { + std::vector args; + TF_RETURN_IF_ERROR(SignatureDefArgsFromInputs( + saved_concrete_function.canonicalized_input_signature(), &args)); + + std::vector rets; + TF_RETURN_IF_ERROR(SignatureDefReturnsFromOutputs( + saved_concrete_function.output_signature(), &rets)); + + *out = SignatureDefFunctionMetadata(std::move(args), std::move(rets)); + return Status(); +} + +// This function finds the necessary captures, then forwards to the builder +// method +Status CreateConcreteFunction(ImmediateExecutionContext* ctx, + const TFConcreteFunctionRevivalState& builder, + const SavedObjectGraph& obj_graph, + const PartiallyRevivedObjects& objects, + std::unique_ptr* out) { + const auto& capture_node_ids = builder.saved_concrete_func->bound_inputs(); + std::vector captures; + captures.reserve(capture_node_ids.size()); + for (int capture_node_id : capture_node_ids) { + ImmediateExecutionTensorHandle* capture_handle; + TF_RETURN_IF_ERROR(TensorHandleFromNode(capture_node_id, obj_graph, objects, + &capture_handle)); + captures.push_back(capture_handle); + } + // TODO(bmzhao): Create Metadata here + return TFConcreteFunction::Create(/*function_def=*/builder.fdef, + /*captures=*/std::move(captures), + /*metadata=*/{}, + /*ctx=*/ctx, + /*out=*/out); +} + +Status CreateSignatureDefFunction( + ImmediateExecutionContext* ctx, + const TFSignatureDefFunctionRevivalState& builder, + const SavedObjectGraph& obj_graph, const PartiallyRevivedObjects& objects, + std::unique_ptr* out) { + const auto& capture_node_ids = builder.saved_concrete_func->bound_inputs(); + std::vector captures; + captures.reserve(capture_node_ids.size()); + for (int capture_node_id : capture_node_ids) { + ImmediateExecutionTensorHandle* capture_handle; + TF_RETURN_IF_ERROR(TensorHandleFromNode(capture_node_id, 
obj_graph, objects, + &capture_handle)); + captures.push_back(capture_handle); + } + + SignatureDefFunctionMetadata metadata; + TF_RETURN_IF_ERROR(LoadSignatureDefFunctionMetadata( + *builder.saved_concrete_func, &metadata)); + + return TFSignatureDefFunction::Create(/*function_def=*/builder.fdef, + /*captures=*/std::move(captures), + /*metadata=*/std::move(metadata), + /*ctx=*/ctx, + /*out=*/out); +} + +Status InitializeCreateResourceFunctions(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, + const PartiallyRevivedObjects& objects, + RevivedObjects* revived) { + for (const auto& id_and_resource : objects.restored_resources) { + const RestoredResourceRevivalState& resource = id_and_resource.second; + const TFConcreteFunctionRevivalState* create_resource_fn = + resource.create_resource; + + const SavedConcreteFunction* saved_create_resource_fn = + create_resource_fn->saved_concrete_func; + if (!saved_create_resource_fn->bound_inputs().empty()) { + // TODO(b/124045874): Load resource functions via a topological sort + return errors::Unimplemented( + "Create Resource functions with captures are currently unsupported."); + } + std::unique_ptr out; + TF_RETURN_IF_ERROR(CreateConcreteFunction(ctx, *create_resource_fn, + obj_graph, objects, &out)); + revived->concrete_functions[create_resource_fn->node_id] = std::move(out); + } + return Status(); +} + +Status InitializeAllFunctions(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, + const PartiallyRevivedObjects& objects, + RevivedObjects* revived) { + gtl::FlatMap>* destination_func_map = + &revived->concrete_functions; + gtl::FlatMap>* + destination_sig_map = &revived->signature_def_functions; + + for (const auto& id_and_func : objects.concrete_functions) { + int node_id = id_and_func.first; + const TFConcreteFunctionRevivalState& func = id_and_func.second; + + if (destination_func_map->find(node_id) != destination_func_map->end()) { + // The function has already been initialized in the destination_map, + // so we can skip this node. This can occur because we initialize + // CreateResource functions before calling this function. 
+ continue; + } + + std::unique_ptr out; + TF_RETURN_IF_ERROR( + CreateConcreteFunction(ctx, func, obj_graph, objects, &out)); + (*destination_func_map)[node_id] = std::move(out); + } + + for (const auto& id_and_func : objects.signature_def_functions) { + int node_id = id_and_func.first; + const TFSignatureDefFunctionRevivalState& func = id_and_func.second; + + if (destination_sig_map->find(node_id) != destination_sig_map->end()) { + continue; + } + + std::unique_ptr out; + TF_RETURN_IF_ERROR( + CreateSignatureDefFunction(ctx, func, obj_graph, objects, &out)); + (*destination_sig_map)[node_id] = std::move(out); + } + + return Status(); +} + +Status CreateAllResourceHandles(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, + PartiallyRevivedObjects* objects, + RevivedObjects* revived) { + for (auto& id_and_resource : objects->restored_resources) { + RestoredResourceRevivalState& resource = id_and_resource.second; + int create_resource_fn_node = resource.create_resource->node_id; + const gtl::FlatMap>& + revived_functions = revived->concrete_functions; + + const auto& revived_functions_iter = + revived_functions.find(create_resource_fn_node); + if (revived_functions_iter == revived_functions.end()) { + return errors::FailedPrecondition( + "ConcreteFunction at node ", create_resource_fn_node, + " should have been initialized prior to being called."); + } + const TFConcreteFunction& create_resource_fn = + *revived_functions_iter->second; + ImmediateOpPtr function_op; + TF_RETURN_IF_ERROR(create_resource_fn.MakeCallOp({}, &function_op)); + TF_RETURN_IF_ERROR(function_op->SetDeviceName(resource.device.c_str())); + + AbstractTensorHandle* resource_handle = nullptr; + int num_retvals = 1; + TF_RETURN_IF_ERROR(function_op->Execute( + absl::MakeSpan(&resource_handle, num_retvals), &num_retvals)); + AbstractTensorHandlePtr owned_resource_handle(resource_handle); + if (!tensorflow::isa( + owned_resource_handle.get())) { + return errors::Internal("Unexpected tensor handle kind."); + } + ImmediateTensorHandlePtr result( + reinterpret_cast( + owned_resource_handle.release())); + resource.resource_handle = std::move(result); + } + return Status(); +} + +// Finds a ConcreteFunction with node id `node` in `objects`, and sets *out to +// point to it. If node doesn't exist in `objects`, out is untouched, and an +// error status is returned. 
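// Typical use (as in BuildResources further below), where `node_id` is the
// node being resolved:
//
//   TFConcreteFunction* init_fn = nullptr;
//   TF_RETURN_IF_ERROR(FindConcreteFunction(node_id, revived, &init_fn));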
+Status FindConcreteFunction(int node, RevivedObjects* objects, + TFConcreteFunction** out) { + auto func_iter = objects->concrete_functions.find(node); + if (func_iter == objects->concrete_functions.end()) { + return errors::FailedPrecondition( + "Failed to find ConcreteFunction with node id ", node, + " in revived objects"); + } + *out = func_iter->second.get(); + return Status(); +} + +Status BuildResources(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, + PartiallyRevivedObjects* objects, + RevivedObjects* revived) { + for (auto& id_and_resource : objects->restored_resources) { + int node_id = id_and_resource.first; + RestoredResourceRevivalState& resource_revival_state = + id_and_resource.second; + + TFConcreteFunction* create_resource = nullptr; + + // Check all the functions associated with the resource have already been + // initialized in `revived` + if (resource_revival_state.create_resource != nullptr) { + TF_RETURN_IF_ERROR( + FindConcreteFunction(resource_revival_state.create_resource->node_id, + revived, &create_resource)); + } + + TFConcreteFunction* initialize = nullptr; + if (resource_revival_state.initialize != nullptr) { + TF_RETURN_IF_ERROR(FindConcreteFunction( + resource_revival_state.initialize->node_id, revived, &initialize)); + } + + TFConcreteFunction* destroy_resource = nullptr; + if (resource_revival_state.destroy_resource != nullptr) { + TF_RETURN_IF_ERROR( + FindConcreteFunction(resource_revival_state.destroy_resource->node_id, + revived, &destroy_resource)); + } + + if (resource_revival_state.resource_handle == nullptr) { + return errors::FailedPrecondition("Resource at node id ", node_id, + " does not have a resource handle."); + } + + revived->restored_resources.emplace( + node_id, RestoredResource( + /*device=*/resource_revival_state.device, + /*create_resource=*/create_resource, + /*initialize=*/initialize, + /*destroy_resource=*/destroy_resource, + /*resource_handle=*/ + std::move(resource_revival_state.resource_handle))); + } + return Status(); +} + +} // namespace + +Status PartiallyRevivedObjects::Build(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, + RevivedObjects* revived) { + // Step 1: We would like to initialize all functions; this requires setting up + // their captured tensorhandles, which may come from variables, assets, + // constants, or resources. The first three are trivial; However, + // tensorhandles that correspond to resources must be created by invoking + // their "create_resource" function. + // https://github.com/tensorflow/tensorflow/blob/f19c6efb4a8ba60e2492eedc98ef5375abb39dc7/tensorflow/python/saved_model/load.py#L240 + // https://github.com/tensorflow/tensorflow/blob/f19c6efb4a8ba60e2492eedc98ef5375abb39dc7/tensorflow/python/training/tracking/tracking.py#L233 + // For now, we assert that all create_resource functions must have no + // captures. This aligns with the current behavior in python. + // https://github.com/tensorflow/tensorflow/blob/50eac986bf7a0ad12594e080f083181f277e0b49/tensorflow/python/saved_model/load.py#L152-L155 + // TODO(bmzhao): We should do a topological sort instead. + + // 1a. Make sure all CreateResource functions have no captures. + TF_RETURN_IF_ERROR(AssertAllCreateResourceFunctionsHaveNoCaptures(*this)); + + // 1b. Initialize all CreateResource functions, storing them in `revived` + TF_RETURN_IF_ERROR( + InitializeCreateResourceFunctions(ctx, obj_graph, *this, revived)); + + // 1c. 
Invoke all "CreateResource" functions and store their ResourceHandles + // https://github.com/tensorflow/tensorflow/blob/3b6b41b68a95dc70c26dc816b29d359bfb88c116/tensorflow/python/training/tracking/tracking.py#L241-L247 + // in *this->resources. + // TODO(bmzhao): Maybe store them separately, not in *this? + TF_RETURN_IF_ERROR(CreateAllResourceHandles(ctx, obj_graph, this, revived)); + + // 2. Initialize all the rest of the functions + TF_RETURN_IF_ERROR(InitializeAllFunctions(ctx, obj_graph, *this, revived)); + + // 3a. Move over all non-function, non-resource objects + revived->variables = std::move(variables); + revived->assets = std::move(assets); + revived->constants = std::move(constants); + revived->signatures_map = std::move(signatures_map); + + // 3b. Move over resources. + TF_RETURN_IF_ERROR(BuildResources(ctx, obj_graph, this, revived)); + + return Status(); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h new file mode 100644 index 00000000000..78960b8c95f --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h @@ -0,0 +1,62 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_PARTIALLY_REVIVED_OBJECTS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_PARTIALLY_REVIVED_OBJECTS_H_ + +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// Container for objects during the revival step in SavedModel's loading. +// Notably, resources and functions can be in a state where they reference +// other resources/functions that have not been constructed yet. We collect +// *all* objects in a partially valid state here, then properly initialize +// resources and functions. Implementation-wise, PartiallyRevivedObjects +// contains maps keyed by the node number of the SavedObjectGraph, and map to an +// object of the corresponding type. 
So, if node 2 in the object graph is a +// variable, PartiallyRevivedObjects.variables[2] exists, and corresponds to a +// tensorflow::Variable object. The only exception to this is the +// "signatures_map", which is keyed by the "signature" key +// (https://github.com/tensorflow/tensorflow/blob/372918decee7f558b3c194b04f77c20dcc679a31/tensorflow/core/protobuf/meta_graph.proto#L89), +// and maps to the SignatureDefFunction node in the SavedObjectGraph. +struct PartiallyRevivedObjects { + gtl::FlatMap> variables; + gtl::FlatMap> assets; + gtl::FlatMap> constants; + gtl::FlatMap concrete_functions; + gtl::FlatMap signature_def_functions; + gtl::FlatMap restored_resources; + gtl::FlatMap signatures_map; + + Status Build(ImmediateExecutionContext* ctx, + const SavedObjectGraph& obj_graph, RevivedObjects* revived); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_PARTIALLY_REVIVED_OBJECTS_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc new file mode 100644 index 00000000000..47860ce8b39 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.cc @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h" + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +namespace { + +Status ExecuteNoArgDummyReturnFunction(TFConcreteFunction* func) { + ImmediateOpPtr function_op; + TF_RETURN_IF_ERROR(func->MakeCallOp({}, &function_op)); + + AbstractTensorHandle* dummy_output = nullptr; + int num_retvals = 1; + TF_RETURN_IF_ERROR(function_op->Execute( + absl::MakeSpan(&dummy_output, num_retvals), &num_retvals)); + AbstractTensorHandlePtr owned_dummy_output(dummy_output); + return Status(); +} + +} // namespace + +RestoredResource::RestoredResource(const std::string& device, + TFConcreteFunction* create_resource, + TFConcreteFunction* initialize, + TFConcreteFunction* destroy_resource, + ImmediateTensorHandlePtr resource_handle) + : TensorHandleConvertible(std::move(resource_handle)), + device_(device), + create_resource_(create_resource), + initialize_(initialize), + destroy_resource_(destroy_resource) {} + +Status RestoredResource::Initialize() const { + return ExecuteNoArgDummyReturnFunction(initialize_); +} + +RestoredResource::~RestoredResource() { + // Note(bmzhao): SavedModels saved before + // https://github.com/tensorflow/tensorflow/commit/3c806101f57768e479f8646e7518bbdff1632ca3 + // did not have their destroy_resource function saved, meaning they will + // leak resources. + if (destroy_resource_ != nullptr) { + Status status = ExecuteNoArgDummyReturnFunction(destroy_resource_); + if (!status.ok()) { + LOG(WARNING) + << "Failed executing destroy_resource function for RestoredResource: " + << status.error_message(); + } + } +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h new file mode 100644 index 00000000000..7adbd563a6b --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" + +namespace tensorflow { + +// RestoredResource represents a TF2 "Resource" object loaded from a savedmodel, +// analogous to the Python _RestoredResource object: +// https://github.com/tensorflow/tensorflow/blob/fda326e542ca67534e8411edb180e8760a4828b7/tensorflow/python/saved_model/load.py#L481 +// TF2 resource objects typically extend TrackableResource: +// https://github.com/tensorflow/tensorflow/blob/fda326e542ca67534e8411edb180e8760a4828b7/tensorflow/python/training/tracking/tracking.py#L285 +// and are expected to implement "_create_resource", "_initialize", and +// "_destroy_resource" functions: +// https://github.com/tensorflow/tensorflow/blob/139ba9c5284799beafdd1d7f895127cf00e7c48f/tensorflow/python/training/tracking/tracking.py#L262-L281 +class RestoredResource : TensorHandleConvertible { + public: + // Note(bmzhao): RestoredResource stores non-owning pointers to its associated + // functions because SavedModel internally owns all functions and objects in + // the RevivedObjects struct (which owns all functions). One alternative would + // be to have RevivedObjects store shared_ptr instead, and + // change RestoredResource's constructor take shared_ptr. + // To keep things simple, I've stuck to raw pointers for now. + // + // Params: + // device - The device string associated with the SavedResource + // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/saved_object_graph.proto#L182 + // Conceptually, this is the same device used in CapturableResource: + // https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/python/training/tracking/tracking.py#L222-L225 + // Implementation-wise, it is device used when invoking the + // create_resource function to produce the resource_handle + // associated with the object: + // https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/python/training/tracking/tracking.py#L246-L247 + // create_resource - Non owning pointer to the create_resource function + // associated with this object. Must be NON-NULL. + // initialize - Non owning pointer to the initialize function associated with + // this object. Must be NON-NULL. + // destroy_resource - Non owning pointer to the destroy_resource function + // associated with this object. Ideally this should be + // NON-NULL, but in order to support models saved prior to + // https://github.com/tensorflow/tensorflow/commit/3c806101f57768e479f8646e7518bbdff1632ca3 + // we allow null here. This will, however, leak resources. + RestoredResource(const std::string& device, + TFConcreteFunction* create_resource, + TFConcreteFunction* initialize, + TFConcreteFunction* destroy_resource, + ImmediateTensorHandlePtr resource_handle); + + Status Initialize() const; + + // RestoredResource is movable, but not copyable. 
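  // Illustrative lifecycle sketch (not part of this change): `create_fn`,
  // `init_fn`, `destroy_fn`, and `handle` are assumed to come from
  // RevivedObjects and from executing the create_resource function, as
  // BuildResources does in partially_revived_objects.cc:
  //
  //   RestoredResource resource(/*device=*/"", create_fn, init_fn, destroy_fn,
  //                             std::move(handle));
  //   TF_RETURN_IF_ERROR(resource.Initialize());  // runs the saved _initialize
  //   // On destruction, destroy_resource (when present) is executed; failures
  //   // there are only logged, not returned (see restored_resource.cc).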
+ RestoredResource(RestoredResource&& other) = default; + RestoredResource& operator=(RestoredResource&& other) = default; + + ~RestoredResource() override; + + private: + std::string device_; + TFConcreteFunction* create_resource_; + TFConcreteFunction* initialize_; + TFConcreteFunction* destroy_resource_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h new file mode 100644 index 00000000000..48d00308cc1 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_REVIVAL_STATE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_REVIVAL_STATE_H_ + +#include + +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h" + +namespace tensorflow { + +// All "Resources" should have these 3 saved functions: +// https://github.com/tensorflow/tensorflow/blob/86dc281333d7d277ddc1882f2bca4b17e7ec40e5/tensorflow/python/training/tracking/tracking.py#L277-L281 +struct RestoredResourceRevivalState { + std::string device; + TFConcreteFunctionRevivalState* create_resource = nullptr; + TFConcreteFunctionRevivalState* initialize = nullptr; + TFConcreteFunctionRevivalState* destroy_resource = nullptr; + ImmediateTensorHandlePtr resource_handle = nullptr; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_RESTORED_RESOURCE_REVIVAL_STATE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h b/tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h new file mode 100644 index 00000000000..cc9be0b937d --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_REVIVED_OBJECTS_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_REVIVED_OBJECTS_H_ + +#include +#include + +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace tensorflow { + +// RevivedObjects is mainly used as a container for all the "state" owned by +// SavedModel. It stores all non-"user object" nodes from a SavedModel +// (https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/core/protobuf/saved_object_graph.proto#L57-L62) +// in a "fully constructed" state. It is effectively a strongly typed map, where +// each member is a map from the node id in the SavedObjectGraph's nodes +// (https://github.com/tensorflow/tensorflow/blob/568e2bef00f24af1159a0846abf67c099ca78a21/tensorflow/core/protobuf/saved_object_graph.proto#L25-L29) +// to the revived object of the corresponding type. +struct RevivedObjects { + gtl::FlatMap> variables; + gtl::FlatMap> assets; + gtl::FlatMap> constants; + gtl::FlatMap> concrete_functions; + gtl::FlatMap> + signature_def_functions; + gtl::FlatMap restored_resources; + gtl::FlatMap signatures_map; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_REVIVED_OBJECTS_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc index f734f9eca66..d9773a4520f 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/immediate_execution_operation.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" -#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/platform/errors.h" @@ -33,32 +33,20 @@ limitations under the License. namespace tensorflow { -TFConcreteFunction::TFConcreteFunction( - const std::string& name, - std::vector captures, - FunctionMetadata metadata, ImmediateExecutionContext* ctx) - : name_(name), - captures_(std::move(captures)), - metadata_(std::move(metadata)), - ctx_(ctx) {} - -TFConcreteFunction::~TFConcreteFunction() { - Status status = ctx_->RemoveFunction(name_); - if (!status.ok()) { - LOG(ERROR) << "Failed to remove functiondef " << name_ << ". 
" - << status.error_message(); - } -} +TFConcreteFunction::TFConcreteFunction(std::unique_ptr func, + FunctionMetadata metadata) + : func_(std::move(func)), metadata_(std::move(metadata)) {} Status TFConcreteFunction::Create( const FunctionDef* function_def, std::vector captures, FunctionMetadata metadata, ImmediateExecutionContext* ctx, std::unique_ptr* out) { - TF_RETURN_IF_ERROR(ctx->AddFunctionDef(*function_def)); - out->reset(new TFConcreteFunction(function_def->signature().name(), - std::move(captures), std::move(metadata), - ctx)); + std::unique_ptr func; + TF_RETURN_IF_ERROR(FlatTensorFunction::Create( + function_def, std::move(captures), ctx, &func)); + + out->reset(new TFConcreteFunction(std::move(func), std::move(metadata))); return Status(); } @@ -66,30 +54,9 @@ const FunctionMetadata& TFConcreteFunction::GetFunctionMetadata() const { return metadata_; } -Status TFConcreteFunction::GetCallOp( - absl::Span inputs, ImmediateOpPtr* out) { - out->reset(ctx_->CreateOperation()); - // In eager mode, TF2 python executes functions by constructing an op with - // the name of the functiondef: - // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L545 - // In graph mode, we create a PartitionedCallOp instead: - // https://github.com/tensorflow/tensorflow/blob/66668ec0ca432e2f38a575b814f45b6d299d01ed/tensorflow/python/eager/function.py#L573 - - // TODO(bmzhao): After discussing with Allen, we should execute this via a - // PartitionedCallOp for compatibility with "tooling that assumes functions in - // graphs are PartitionedCallOps". - TF_RETURN_IF_ERROR((*out)->Reset(name_.c_str(), nullptr)); - - // Adding the user-provided inputs to the function. - TF_RETURN_IF_ERROR((*out)->AddInputList(inputs)); - - absl::Span captures( - reinterpret_cast(captures_.data()), - captures_.size()); - - // Adding the captures of the function. - TF_RETURN_IF_ERROR((*out)->AddInputList(captures)); - return Status(); +Status TFConcreteFunction::MakeCallOp( + absl::Span inputs, ImmediateOpPtr* out) const { + return func_->MakeCallOp(inputs, out); } } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h index d38f3546f91..edc26f4d5aa 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" -#include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" @@ -58,26 +58,22 @@ class TFConcreteFunction : public ConcreteFunction { std::unique_ptr* out); // This method returns the "Call" Op used to execute the function. 
- Status GetCallOp(absl::Span inputs, - ImmediateOpPtr* out) override; + Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const override; const FunctionMetadata& GetFunctionMetadata() const override; - ~TFConcreteFunction() override; + ~TFConcreteFunction() override = default; private: - TFConcreteFunction(const std::string& name, - std::vector captures, - FunctionMetadata metadata, ImmediateExecutionContext* ctx); + TFConcreteFunction(std::unique_ptr func, + FunctionMetadata metadata); TFConcreteFunction(const TFConcreteFunction&) = delete; TFConcreteFunction& operator=(const TFConcreteFunction&) = delete; - // Name of the FunctionDef corresponding to this TFConcreteFunction - std::string name_; - std::vector captures_; + std::unique_ptr func_; FunctionMetadata metadata_; - ImmediateExecutionContext* ctx_; }; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h new file mode 100644 index 00000000000..3dd7a6eecc4 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_REVIVAL_STATE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_REVIVAL_STATE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// TFConcreteFunctionRevivalState wraps the state needed for building a +// TF_ConcreteFunction. This is mainly used in PartiallyRevivedObjects, which +// wraps partially constructed Function and Resource objects. +struct TFConcreteFunctionRevivalState { + // Index of the node in the SavedObjectGraph it was loaded from. + int node_id; + + // Pointer to the original functiondef. fdef_ is guaranteed to be + // non-null. + const FunctionDef* fdef; + + // TensorHandle captures for this funtion + std::vector captures; + + // SavedConcreteFunction contains much of the metadata of the expected "types" + // of the inputs and outputs of a function. + // Note(bmzhao): saved_concrete_func_ is guaranteed to be non-null. + const SavedConcreteFunction* saved_concrete_func; + + // This field is only present on TF2 ConcreteFunctions, and is useful for + // determining the original argument *names* of the function, (since the + // "canonicalized_input_signature" may append extra uniquifying integers). 
+ // However, SavedBareConcreteFunctions do not have a FunctionSpec. + // Note(bmzhao): if function_spec_.has_value(), *function_spec_ is guaranteed + // to be non-null. + absl::optional function_spec; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_CONCRETE_FUNCTION_REVIVAL_STATE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.cc b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.cc new file mode 100644 index 00000000000..ab1745dcd47 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.cc @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" + +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/c/eager/abstract_tensor_handle.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +TFSignatureDefFunction::TFSignatureDefFunction( + std::unique_ptr func, + SignatureDefFunctionMetadata metadata) + : func_(std::move(func)), metadata_(std::move(metadata)) {} + +Status TFSignatureDefFunction::Create( + const FunctionDef* function_def, + std::vector captures, + SignatureDefFunctionMetadata metadata, ImmediateExecutionContext* ctx, + std::unique_ptr* out) { + std::unique_ptr func; + TF_RETURN_IF_ERROR(FlatTensorFunction::Create( + function_def, std::move(captures), ctx, &func)); + + out->reset(new TFSignatureDefFunction(std::move(func), std::move(metadata))); + return Status(); +} + +const SignatureDefFunctionMetadata& +TFSignatureDefFunction::GetFunctionMetadata() const { + return metadata_; +} + +Status TFSignatureDefFunction::MakeCallOp( + absl::Span inputs, ImmediateOpPtr* out) const { + return func_->MakeCallOp(inputs, out); +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h new file mode 100644 index 00000000000..7b564185b8b --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SIGNATURE_DEF_FUNCTION_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SIGNATURE_DEF_FUNCTION_H_ + +#include +#include +#include +#include +#include + +#include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/eager/immediate_execution_operation.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// This is the TF eager runtime implementation of SignatureDefFunction (separate +// from the TFRT implementation). The user-facing API of SignatureDefFunctions +// and their semantic differences from ConcreteFunction are described here: +// https://github.com/tensorflow/tensorflow/blob/e2db60c9d9598ebae0b7741587ce6f5d473584d9/tensorflow/cc/saved_model/experimental/public/signature_def_function.h#L30-L59 +// Additional implementation notes are available here: +// https://github.com/tensorflow/tensorflow/blob/e2db60c9d9598ebae0b7741587ce6f5d473584d9/tensorflow/c/experimental/saved_model/core/signature_def_function.h#L31-L48 +class TFSignatureDefFunction : public SignatureDefFunction { + public: + // Factory function for creating a TFSignatureDefFunction. + // + // Params: + // function_def - The function_def associated with the created + // TFSignatureDefFunction. TFSignatureDefFunction will + // register this function_def with `ctx` on creation, and + // de-register it on destruction. function_def must be + // non-null, but otherwise has no lifetime requirements. + // captures - The captured TensorHandles associated with this + // TFConcreteFunction. + // metadata - FunctionMetadata associated with this TFSignatureDefFunction. + // ctx - A handle to the Tensorflow runtime. This MUST be non-null and + // outlive TFSignatureDefFunction. + // out - The output TFSignatureDefFunction. + static Status Create(const FunctionDef* function_def, + std::vector captures, + SignatureDefFunctionMetadata metadata, + ImmediateExecutionContext* ctx, + std::unique_ptr* out); + + // This method creates a "Call" Op used to execute the function. 
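  // Callers would typically consult GetFunctionMetadata() for the named
  // argument/return specs before building the call op. Illustrative sketch
  // (`signature_fn` and `inputs` are assumed to exist in the caller):
  //
  //   const SignatureDefFunctionMetadata& meta =
  //       signature_fn->GetFunctionMetadata();
  //   for (const SignatureDefParam& param : meta.arguments()) {
  //     VLOG(1) << param.name() << " expects "
  //             << DataTypeString(param.spec().dtype());
  //   }
  //   ImmediateOpPtr call_op;
  //   TF_RETURN_IF_ERROR(signature_fn->MakeCallOp(inputs, &call_op));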
+ Status MakeCallOp(absl::Span inputs, + ImmediateOpPtr* out) const override; + + const SignatureDefFunctionMetadata& GetFunctionMetadata() const override; + + ~TFSignatureDefFunction() override = default; + + private: + TFSignatureDefFunction(std::unique_ptr func, + SignatureDefFunctionMetadata metadata); + + TFSignatureDefFunction(const TFSignatureDefFunction&) = delete; + TFSignatureDefFunction& operator=(const TFSignatureDefFunction&) = delete; + + std::unique_ptr func_; + SignatureDefFunctionMetadata metadata_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TF_SIGNATURE_DEF_FUNCTION_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h new file mode 100644 index 00000000000..ac1b20e474b --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_REVIVAL_STATE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_REVIVAL_STATE_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/c/eager/immediate_execution_tensor_handle.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/protobuf/saved_object_graph.pb.h" + +namespace tensorflow { + +// FunctionBuilder wraps the state needed for building a SignatureDefFunction. +// This is mainly used in PartiallyRevivedObjects, which wraps partially +// constructed Function and Resource objects. +struct TFSignatureDefFunctionRevivalState { + // Index of the node in the SavedObjectGraph it was loaded from. + int node_id = 0; + + // Pointer to the original functiondef. fdef_ is guaranteed to be + // non-null. + const FunctionDef* fdef = nullptr; + + // SavedConcreteFunction contains much of the metadata of the expected "types" + // of the inputs and outputs of a function. + // Note(bmzhao): saved_concrete_func_ is guaranteed to be non-null. + const SavedConcreteFunction* saved_concrete_func = nullptr; + + // The name of the SignatureDef key. 
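  // e.g. "serving_default" for the default serving signature
  // (tensorflow::kDefaultServingSignatureDefKey).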
+ std::string signature_key; + + // TensorHandle captures for this funtion + std::vector captures; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_REVIVED_TYPES_TF_SIGNATURE_DEF_FUNCTION_REVIVAL_STATE_H_ diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc b/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc index a212c25bd28..2ede228e4ed 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/variable.cc @@ -20,8 +20,10 @@ limitations under the License. #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/experimental/saved_model/core/ops/variable_ops.h" #include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/llvm_rtti/llvm_rtti.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" @@ -62,15 +64,53 @@ Status Variable::ReadValue(ImmediateTensorHandlePtr* out) { return internal::ReadVariable(ctx_, handle_.get(), dtype_, out); } -Status Variable::CreateUninitialized(ImmediateExecutionContext* ctx, - DataType dtype, TensorShape shape, - absl::optional name, - const char* raw_device_name, - std::unique_ptr* output) { +Status Variable::CreateUninitialized( + ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + absl::optional name, const char* raw_device_name, + const std::vector& component_devices, + std::unique_ptr* output) { ImmediateTensorHandlePtr handle; - TF_RETURN_IF_ERROR(internal::CreateUninitializedResourceVariable( - ctx, dtype, shape, raw_device_name, &handle)); + if (component_devices.empty()) { + TF_RETURN_IF_ERROR(internal::CreateUninitializedResourceVariable( + ctx, dtype, shape, raw_device_name, &handle)); + output->reset( + new Variable(ctx, dtype, shape, std::move(name), std::move(handle))); + return Status(); + } + + if (!tensorflow::isa(ctx)) { + return errors::InvalidArgument( + "Can only load distributed variables with EagerContext."); + } + + EagerContext* eager_ctx = reinterpret_cast(ctx); + + std::vector handles; + for (const auto& device : component_devices) { + ImmediateTensorHandlePtr handlePtr; + TF_RETURN_IF_ERROR(internal::CreateUninitializedResourceVariable( + ctx, dtype, shape, device.empty() ? nullptr : device.c_str(), + &handlePtr)); + if (!tensorflow::isa(handlePtr.get())) { + return errors::Internal("Returned replica handle has unsupported type."); + } + handles.push_back(reinterpret_cast(handlePtr.release())); + } + TensorHandle* packed_handle; + TF_RETURN_IF_ERROR(TensorHandle::CreatePackedHandle( + std::move(handles), eager_ctx, &packed_handle)); + // The call to `CreatePackedHandle` incremented the handles' reference count, + // which we must now decrement to make the packed handle the owner of those + // handles. We can't loop through the `handles` vector because it was + // `std::move`d in the call above. 
+ for (int i = 0; i != packed_handle->NumPackedHandles(); ++i) { + TensorHandle* component; + TF_RETURN_IF_ERROR(packed_handle->ExtractPackedHandle(i, &component)); + component->Unref(); + } + + handle.reset(packed_handle); output->reset( new Variable(ctx, dtype, shape, std::move(name), std::move(handle))); return Status(); diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/variable.h b/tensorflow/c/experimental/saved_model/core/revived_types/variable.h index 13f56fda5f3..6d630b54562 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/variable.h +++ b/tensorflow/c/experimental/saved_model/core/revived_types/variable.h @@ -34,11 +34,11 @@ class Variable : public TensorHandleConvertible { public: // Creates an uninitialized resource variable. Note that a caller must // call "assign" to associate a value with the variable. - static Status CreateUninitialized(ImmediateExecutionContext* ctx, - DataType dtype, TensorShape shape, - absl::optional name, - const char* raw_device_name, - std::unique_ptr* output); + static Status CreateUninitialized( + ImmediateExecutionContext* ctx, DataType dtype, TensorShape shape, + absl::optional name, const char* raw_device_name, + const std::vector& component_devices, + std::unique_ptr* output); // The dtype of the underlying variable. DataType dtype(); diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc index e79fd8d7001..2a4297e2b67 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.cc @@ -17,14 +17,22 @@ limitations under the License. #include #include +#include +#include #include "absl/strings/str_split.h" +#include "absl/types/optional.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/restored_resource_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function_revival_state.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/tf_signature_def_function_revival_state.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" #include "tensorflow/c/tf_tensor_internal.h" +#include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" @@ -40,6 +48,83 @@ namespace { using StructuredValueDictEntry = protobuf::MapPair; +// Maps from a Nodedef's name to its corresponding AttrValues, for a given +// Graphdef +using NodeAttrMap = + gtl::FlatMap; + +// Maps from a FunctionDef's name to FunctionDef, for a given FunctionDefLibrary +using FunctionDefMap = gtl::FlatMap; + +// Looks up a SavedConstant's associated tensorproto from the NodeAttrMap and +// returns a tensorflow::Constant. 
+Status ConstantFromSavedConstant( + ImmediateExecutionContext* ctx, + const tensorflow::SavedConstant& saved_constant, + const NodeAttrMap& node_attr_map, std::unique_ptr* output) { + const std::string& const_op_name = saved_constant.operation(); + const auto& node_name_and_attrs = node_attr_map.find(const_op_name); + if (node_name_and_attrs == node_attr_map.end()) { + return errors::FailedPrecondition( + "Unable to find Const operation with name'", const_op_name, + "' in SavedModel graphdef"); + } + const AttrValueMap* attrs = node_name_and_attrs->second; + const auto& attr_name_and_value = attrs->find("value"); + if (attr_name_and_value == attrs->end()) { + return errors::FailedPrecondition("Unable to find Const operation '", + const_op_name, "'s value attribute"); + } + const TensorProto& tensor_proto = attr_name_and_value->second.tensor(); + return internal::TensorProtoToConstant(ctx, tensor_proto, output); +} + +// Finds the "signatures" object in the object graph, and fills a mapping of +// each signature's name to the corresponding function's node in the object +// graph. +Status GetSignaturesMap(const SavedObjectGraph& saved_objects, + gtl::FlatMap* signatures_map) { + if (saved_objects.nodes().empty()) { + return errors::FailedPrecondition("Saved Object Graph was empty."); + } + const SavedObject& root = saved_objects.nodes(0); + const SavedObject* signatures = nullptr; + for (const auto& child : root.children()) { + if (child.local_name() == "signatures") { + if (child.node_id() >= saved_objects.nodes().size()) { + return errors::FailedPrecondition( + "Signature object had child node id ", child.node_id(), + " which exceeds the size of the set of nodes"); + } + signatures = &saved_objects.nodes(child.node_id()); + } + } + + // Some basic sanity checks that this object is actually our "signatures" map + if (signatures == nullptr) { + // This is where the "signatures" attribute is always set: + // https://github.com/tensorflow/tensorflow/blob/a2c542a0d83227568f9214a2af9a38ae3625976f/tensorflow/python/saved_model/save.py#L1106-L1109 + return errors::FailedPrecondition( + "SavedObjectGraph's root object must have a child 'signatures' object"); + } + if (signatures->kind_case() != SavedObject::kUserObject) { + return errors::FailedPrecondition( + "Signatures must be a SavedObject of type UserObject."); + } + if (signatures->user_object().identifier() != "signature_map") { + // This is where the string comes from: + // https://github.com/tensorflow/tensorflow/blob/c59af2913aaec235d883f50428efef1086f4c0e6/tensorflow/python/saved_model/signature_serialization.py#L220 + return errors::FailedPrecondition( + "Signatures SavedObject must have identifier 'signature_map'."); + } + + for (const auto& child : signatures->children()) { + (*signatures_map)[child.local_name()] = child.node_id(); + } + return Status(); +} + // Perform some basic sanity checks on SavedConcreteFunction's input and // output signatures with respect to the corresponding FunctionDef's input // and output args. @@ -98,8 +183,37 @@ Status ValidateSavedFunctionCompatibleWithFunctionDef( return Status(); } +Status ValidateSingleConcreteFunction(const SavedFunction& saved_function) { + // We only allow loading functions that have an annotated input signature, + // which means there is 1:1 correspondence between tf.function + // <=> SavedFunction <=> SavedConcreteFunction <=> FunctionDef. 
This is + // the same restriction that MLIR has: + // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2677-L2707 + if (saved_function.concrete_functions_size() != 1) { + return errors::FailedPrecondition( + "Only tf.functions annotated with an input signature are supported " + "by SavedModelAPI. This means that there should only be a single " + "ConcreteFunction per tf.function"); + } + return Status(); +} + } // namespace +Status LoadSavedAsset(ImmediateExecutionContext* ctx, const SavedAsset& asset, + const std::string& saved_model_dir, + absl::Span assets, + std::unique_ptr* output) { + int asset_index = asset.asset_file_def_index(); + if (asset_index >= assets.size()) { + return errors::FailedPrecondition( + "SavedAsset contained asset index ", asset_index, + " but AssetFileDef only contains ", assets.size(), " # of assets"); + } + const std::string& asset_filename = assets[asset_index].filename(); + return Asset::Create(ctx, saved_model_dir, asset_filename, output); +} + Status TensorProtoToConstant(ImmediateExecutionContext* ctx, const TensorProto& proto, std::unique_ptr* output) { @@ -121,10 +235,17 @@ Status LoadSavedVariable(ImmediateExecutionContext* ctx, const std::string& name = variable.name(); tensorflow::TensorShape shape(variable.shape()); tensorflow::DataType dtype = variable.dtype(); + std::vector component_devices; + + for (const auto& component : + variable.experimental_distributed_variable_components()) { + component_devices.push_back(component.device()); + } TF_RETURN_IF_ERROR(Variable::CreateUninitialized( ctx, dtype, shape, name, - variable.device().empty() ? nullptr : variable.device().c_str(), output)); + variable.device().empty() ? nullptr : variable.device().c_str(), + component_devices, output)); return Status(); } @@ -210,16 +331,17 @@ Status FlattenSignature(const StructuredValue& signature, } } -const SavedObject* FindNodeAtPath(StringPiece path, - const SavedObjectGraph& object_graph) { +absl::optional FindNodeAtPath(StringPiece path, + const SavedObjectGraph& object_graph) { const auto& nodes = object_graph.nodes(); if (nodes.empty()) { - return nullptr; + return absl::nullopt; } // Starting from the root, iterate through the saved object graph, matching // object names as we go. 
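  // Under the new signature the caller receives the node id rather than a
  // SavedObject pointer. Illustrative call (the path is hypothetical):
  //
  //   absl::optional<int> node_id =
  //       internal::FindNodeAtPath("foo.bar", object_graph);
  //   if (!node_id.has_value()) { /* no object at that path */ }
  //   const SavedObject& obj = object_graph.nodes(*node_id);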
- const SavedObject* current_node = &nodes.Get(0); + int node_id = 0; + const SavedObject* current_node = &nodes.Get(node_id); for (absl::string_view object_name : absl::StrSplit(path, '.')) { auto child_node_iter = std::find_if( @@ -229,29 +351,28 @@ const SavedObject* FindNodeAtPath(StringPiece path, return object_name == obj.local_name(); }); if (child_node_iter == current_node->children().end()) { - return nullptr; + return absl::nullopt; } - current_node = &nodes.Get(child_node_iter->node_id()); + + node_id = child_node_iter->node_id(); + current_node = &nodes.Get(node_id); } - return current_node; + return node_id; } -std::unordered_map -NodeToAttrMap(const tensorflow::GraphDef& graphdef) { - std::unordered_map - result; +gtl::FlatMap NodeToAttrMap( + const tensorflow::GraphDef& graphdef) { + gtl::FlatMap result; for (const tensorflow::NodeDef& node : graphdef.node()) { result[node.name()] = &node.attr(); } return result; } -std::unordered_map +gtl::FlatMap FunctionNameToFunctionDefMap(const FunctionDefLibrary& library) { - std::unordered_map + gtl::FlatMap result; for (const FunctionDef& function_def : library.function()) { result[function_def.signature().name()] = &function_def; @@ -259,5 +380,156 @@ FunctionNameToFunctionDefMap(const FunctionDefLibrary& library) { return result; } +Status PartiallyReviveSavedModelObjects(const MetaGraphDef& metagraph, + ImmediateExecutionContext* context, + const std::string& directory, + PartiallyRevivedObjects* objects) { + // This is needed to restore "Constant" nodes by looking up their + // "Value" attribute. + NodeAttrMap node_attr_map = NodeToAttrMap(metagraph.graph_def()); + + // These are needed for creating "Assets", by looking up their filenames. + std::vector assets; + TF_RETURN_IF_ERROR(GetAssetFileDefs(metagraph, &assets)); + + // Signatures are needed for determining whether a function is a + // SignatureDefFunction or not. + gtl::FlatMap signatures_map; + TF_RETURN_IF_ERROR( + GetSignaturesMap(metagraph.object_graph_def(), &signatures_map)); + + gtl::FlatMap reversed_signatures_map; + reversed_signatures_map.reserve(signatures_map.size()); + for (const auto& signature_key_and_node : signatures_map) { + reversed_signatures_map.emplace(signature_key_and_node.second, + signature_key_and_node.first); + } + + // FunctionDefs are needed to help construct + // TFConcreteFunction/SignatureDefFunctions + const FunctionDefMap function_def_map = + internal::FunctionNameToFunctionDefMap(metagraph.graph_def().library()); + + // Iterate through all the saved objects, restoring objects (if we can) as we + // go. For objects that dependencies on other objects (resources/functions), + // we partially initialize "builders" that correspond to their currently known + // state, and gradually fill them out in subsequent passes. 
+ for (int i = 0; i < metagraph.object_graph_def().nodes_size(); ++i) { + const SavedObject& node = metagraph.object_graph_def().nodes(i); + if (node.kind_case() == SavedObject::kVariable) { + std::unique_ptr variable; + TF_RETURN_IF_ERROR( + LoadSavedVariable(context, node.variable(), &variable)); + objects->variables[i] = std::move(variable); + } else if (node.kind_case() == SavedObject::kConstant) { + std::unique_ptr constant; + TF_RETURN_IF_ERROR(ConstantFromSavedConstant(context, node.constant(), + node_attr_map, &constant)); + objects->constants[i] = std::move(constant); + } else if (node.kind_case() == SavedObject::kAsset) { + std::unique_ptr asset; + TF_RETURN_IF_ERROR( + LoadSavedAsset(context, node.asset(), directory, assets, &asset)); + objects->assets[i] = std::move(asset); + } else if (node.kind_case() == SavedObject::kResource) { + RestoredResourceRevivalState resource_revival_state; + // We'll set the resource's functions in a subsequent pass, once we get + // all functions in a partially revived state. + resource_revival_state.device = node.resource().device(); + objects->restored_resources[i] = std::move(resource_revival_state); + } else if (node.kind_case() == SavedObject::kFunction) { + // Get the SavedFunction node and validate it has a single concrete func. + const SavedFunction& saved_function = node.function(); + TF_RETURN_IF_ERROR(ValidateSingleConcreteFunction(saved_function)); + + // Retrieve related function information. + const std::string& function_name = saved_function.concrete_functions(0); + const FunctionDef* function_def = function_def_map.at(function_name); + const SavedConcreteFunction& saved_concrete_func = + metagraph.object_graph_def().concrete_functions().at(function_name); + const FunctionSpec& function_spec = saved_function.function_spec(); + + // Construct either a SignatureDefFunctionBuilder or a + // ConcreteFunctionBuilder, depending on whether this node was a child + // of the "signatures" attribute from root object. + auto reverse_signature_iter = reversed_signatures_map.find(i); + if (reverse_signature_iter != reversed_signatures_map.end()) { + TFSignatureDefFunctionRevivalState func_revival_state; + func_revival_state.node_id = i; + func_revival_state.fdef = function_def; + func_revival_state.saved_concrete_func = &saved_concrete_func; + func_revival_state.signature_key = reverse_signature_iter->second; + objects->signature_def_functions[i] = std::move(func_revival_state); + } else { + TFConcreteFunctionRevivalState func_revival_state; + func_revival_state.node_id = i; + func_revival_state.fdef = function_def; + func_revival_state.saved_concrete_func = &saved_concrete_func; + func_revival_state.function_spec = &function_spec; + objects->concrete_functions[i] = std::move(func_revival_state); + } + } else if (node.kind_case() == SavedObject::kBareConcreteFunction) { + const SavedBareConcreteFunction& bare_cf = node.bare_concrete_function(); + + // Retrieve related function information. + const std::string& function_name = bare_cf.concrete_function_name(); + const FunctionDef* function_def = function_def_map.at(function_name); + const SavedConcreteFunction& saved_concrete_func = + metagraph.object_graph_def().concrete_functions().at(function_name); + + // Check whether this is a SignatureDefFunction, or not. 
+ auto reverse_signature_iter = reversed_signatures_map.find(i); + if (reverse_signature_iter != reversed_signatures_map.end()) { + TFSignatureDefFunctionRevivalState func_revival_state; + func_revival_state.node_id = i; + func_revival_state.fdef = function_def; + func_revival_state.saved_concrete_func = &saved_concrete_func; + func_revival_state.signature_key = reverse_signature_iter->second; + objects->signature_def_functions[i] = std::move(func_revival_state); + } else { + TFConcreteFunctionRevivalState func_revival_state; + func_revival_state.node_id = i; + func_revival_state.fdef = function_def; + func_revival_state.saved_concrete_func = &saved_concrete_func; + objects->concrete_functions[i] = std::move(func_revival_state); + } + } + } + + // Now that we've partially restored all functions, we can have resources + // point to them + for (auto& node_and_resource_revival_state : objects->restored_resources) { + int node_id = node_and_resource_revival_state.first; + const SavedObjectGraph& obj_graph = metagraph.object_graph_def(); + const SavedObject& node = obj_graph.nodes(node_id); + RestoredResourceRevivalState& resource = + node_and_resource_revival_state.second; + for (const TrackableObjectGraph::TrackableObject::ObjectReference& child : + node.children()) { + int child_node_id = child.node_id(); + // Note(bmzhao): The expected functions saved by a resource object are: + // "_create_resource", "_initialize", and "_destroy_resource". + // https://github.com/tensorflow/tensorflow/blob/ad66f588c1666ade8051feb42811fa27b285271c/tensorflow/python/training/tracking/tracking.py#L277-L281 + if (child.local_name() == "_create_resource" && + obj_graph.nodes(child.node_id()).kind_case() == + SavedObject::kFunction) { + resource.create_resource = &objects->concrete_functions[child_node_id]; + } else if (child.local_name() == "_initialize" && + obj_graph.nodes(child.node_id()).kind_case() == + SavedObject::kFunction) { + resource.initialize = &objects->concrete_functions[child_node_id]; + } else if (child.local_name() == "_destroy_resource" && + obj_graph.nodes(child.node_id()).kind_case() == + SavedObject::kFunction) { + resource.destroy_resource = &objects->concrete_functions[child_node_id]; + } + } + } + + objects->signatures_map = std::move(signatures_map); + + return Status(); +} + } // namespace internal } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h index 68bfbe32222..db45e28087f 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_model_utils.h +++ b/tensorflow/c/experimental/saved_model/core/saved_model_utils.h @@ -22,15 +22,21 @@ limitations under the License. 
#include #include +#include "absl/types/optional.h" +#include "absl/types/span.h" #include "tensorflow/c/eager/immediate_execution_context.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/asset.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" #include "tensorflow/core/protobuf/saved_object_graph.pb.h" #include "tensorflow/core/protobuf/struct.pb.h" @@ -52,6 +58,11 @@ Status LoadSavedVariable(ImmediateExecutionContext* ctx, const SavedVariable& variable, std::unique_ptr* output); +Status LoadSavedAsset(ImmediateExecutionContext* ctx, const SavedAsset& asset, + const std::string& saved_model_dir, + absl::Span assets, + std::unique_ptr* output); + // Creates a TFConcreteFunction from a SavedConcreteFunction. Status LoadTFConcreteFunction( const SavedConcreteFunction& saved_concrete_function, @@ -67,24 +78,30 @@ Status LoadTFConcreteFunction( Status FlattenSignature(const StructuredValue& signature, std::vector* flattened_specs); -// Find the SavedObject in `object_graph` at location `path`. `path` must be +// Find the node id in `object_graph` at location `path`. `path` must be // a dot-delimited string of object names relative to the root object. If no -// object is found, returns nullptr. Callers must ensure `object_graph` -// outlives the returned pointer. -const SavedObject* FindNodeAtPath(StringPiece path, - const SavedObjectGraph& object_graph); +// object is found, returns absl::nullopt. +absl::optional FindNodeAtPath(StringPiece path, + const SavedObjectGraph& object_graph); // Maps each node in `graphdef` to its corresponding Attribute Map. // Callers must ensure that `graphdef` outlives the returned map. -std::unordered_map -NodeToAttrMap(const tensorflow::GraphDef& graphdef); +gtl::FlatMap NodeToAttrMap( + const tensorflow::GraphDef& graphdef); // Maps the name of each FunctionDef in `library` to its corresponding // FunctionDef. Callers must ensure `library` outlives the returned map. -std::unordered_map +gtl::FlatMap FunctionNameToFunctionDefMap(const FunctionDefLibrary& library); +// Walks through the SavedObjectGraph in metagraph, and restores all nodes +// (except "UserDefinedObjects") with their corresponding type in +// "PartiallyRevivedObjects". 
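// A sketch of the intended two-phase flow (local names are illustrative; the
// call site is expected to be the SavedModel loader, which includes these
// types):
//
//   PartiallyRevivedObjects partial;
//   TF_RETURN_IF_ERROR(internal::PartiallyReviveSavedModelObjects(
//       metagraph, ctx, directory, &partial));
//   RevivedObjects revived;
//   TF_RETURN_IF_ERROR(
//       partial.Build(ctx, metagraph.object_graph_def(), &revived));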
+Status PartiallyReviveSavedModelObjects(const MetaGraphDef& metagraph, + ImmediateExecutionContext* context, + const std::string& directory, + PartiallyRevivedObjects* objects); + } // namespace internal } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc index 45b0ac00c9b..a5a4e900843 100644 --- a/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc +++ b/tensorflow/c/experimental/saved_model/core/saved_variable_loading_test.cc @@ -119,7 +119,7 @@ TEST_P(SavedVariableLoadingTest, AssignAndReadVariableSuccesful) { Status status; std::unique_ptr var; TF_EXPECT_OK(Variable::CreateUninitialized(context(), dtype, shape, - absl::nullopt, nullptr, &var)); + absl::nullopt, nullptr, {}, &var)); // Create a TensorHandle ImmediateTensorHandlePtr expected_handle = diff --git a/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.cc b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.cc new file mode 100644 index 00000000000..4e455f08f49 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.cc @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +namespace tensorflow { + +SignatureDefParam::SignatureDefParam(std::string name, TensorSpec spec) + : name_(std::move(name)), spec_(std::move(spec)) {} + +const std::string& SignatureDefParam::name() const { return name_; } + +const TensorSpec& SignatureDefParam::spec() const { return spec_; } + +SignatureDefFunctionMetadata::SignatureDefFunctionMetadata( + std::vector arguments, + std::vector returns) + : arguments_(std::move(arguments)), returns_(std::move(returns)) {} + +const std::vector& SignatureDefFunctionMetadata::arguments() + const { + return arguments_; +} + +const std::vector& SignatureDefFunctionMetadata::returns() + const { + return returns_; +} + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h index 5a579676d4e..e9cc0b11b00 100644 --- a/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h +++ b/tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h @@ -16,10 +16,42 @@ limitations under the License. 
#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ #define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#include +#include + +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/struct.pb.h" + namespace tensorflow { +// SignatureDefParam represents a named Tensor input or output to a +// SignatureDefFunction. +class SignatureDefParam { + public: + SignatureDefParam(std::string name, TensorSpec spec); + + const std::string& name() const; + + const TensorSpec& spec() const; + + private: + std::string name_; + TensorSpec spec_; +}; + class SignatureDefFunctionMetadata { - // TODO(bmzhao): Fill in with fields as necessary + public: + SignatureDefFunctionMetadata() = default; + SignatureDefFunctionMetadata(std::vector arguments, + std::vector returns); + + const std::vector& arguments() const; + const std::vector& returns() const; + + private: + std::vector arguments_; + std::vector returns_; }; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/tensor_spec.cc b/tensorflow/c/experimental/saved_model/core/tensor_spec.cc new file mode 100644 index 00000000000..4d68ec73b1b --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/tensor_spec.cc @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" + +#include + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" + +namespace tensorflow { + +TensorSpec::TensorSpec() + : shape_(std::initializer_list()), dtype_(DT_FLOAT) {} + +TensorSpec::TensorSpec(PartialTensorShape shape, DataType dtype) + : shape_(std::move(shape)), dtype_(dtype) {} + +TensorSpec::TensorSpec(const TensorSpecProto& proto) + : shape_(proto.shape()), dtype_(proto.dtype()) {} + +const PartialTensorShape& TensorSpec::shape() const { return shape_; } + +DataType TensorSpec::dtype() const { return dtype_; } + +} // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/core/tensor_spec.h b/tensorflow/c/experimental/saved_model/core/tensor_spec.h new file mode 100644 index 00000000000..dcdff8900bd --- /dev/null +++ b/tensorflow/c/experimental/saved_model/core/tensor_spec.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSOR_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSOR_SPEC_H_ + +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" + +namespace tensorflow { + +// Note(bmzhao): TensorSpec deliberately does not store the "name" from a +// TensorSpecProto. From edloper@, "Names should really be associated with +// parameters, not the tensors inside those parameters. This would be +// inconsistent with the corresponding Python class, but I don't think that's +// necessarily a problem. If it turns out later that we really need a name +// attribute here, we can always add it back in; but let's see how far we can +// get without it." +class TensorSpec { + public: + // Constructs a scalar, DT_FLOAT TensorSpec + TensorSpec(); + + TensorSpec(PartialTensorShape shape, DataType dtype); + + explicit TensorSpec(const TensorSpecProto& proto); + + const PartialTensorShape& shape() const; + DataType dtype() const; + + private: + PartialTensorShape shape_; + DataType dtype_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_CORE_TENSOR_SPEC_H_ diff --git a/tensorflow/c/experimental/saved_model/core/test_utils.cc b/tensorflow/c/experimental/saved_model/core/test_utils.cc index d551919ea94..988f7e382a8 100644 --- a/tensorflow/c/experimental/saved_model/core/test_utils.cc +++ b/tensorflow/c/experimental/saved_model/core/test_utils.cc @@ -45,11 +45,9 @@ EagerContextPtr CreateTestingEagerContext(DeviceMgr* device_mgr) { return EagerContextPtr(new EagerContext( SessionOptions(), tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, - tensorflow::ContextMirroringPolicy::MIRRORING_NONE, /* async= */ false, /* lazy_copy_function_remote_inputs= */ false, device_mgr, /* device_mgr_owned= */ false, /* rendezvous= */ nullptr, - /* custom_kernel_creator= */ nullptr, /* cluster_flr= */ nullptr)); } diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc index ab7052b52ed..f0990235963 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.cc @@ -17,7 +17,6 @@ limitations under the License. #include #include -#include #include #include @@ -30,6 +29,9 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/ops/restore_ops.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/constant.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/flat_tensor_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" @@ -45,6 +47,7 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/casts.h" #include "tensorflow/core/platform/errors.h" @@ -61,139 +64,15 @@ limitations under the License. namespace tensorflow { // Maps from a FunctionDef's name to FunctionDef, for a given FunctionDefLibrary -using FunctionDefMap = - std::unordered_map; - -// Maps from a Nodedef's name to its corresponding AttrValues, for a given -// Graphdef -using NodeAttrMap = - std::unordered_map; - -// Maps from Node ID to an "Revived Object" implementing -// "TensorHandleConvertible" -using RevivedObjectMap = - std::unordered_map>; +using FunctionDefMap = gtl::FlatMap; // Maps from a functiondef's name to the corresponding "TFConcreteFunction" -using ConcreteFunctionMap = - std::unordered_map>; +using FlatTensorFunctionMap = + gtl::FlatMap>; namespace { -Status ConstantFromSavedConstant( - ImmediateExecutionContext* ctx, - const tensorflow::SavedConstant& saved_constant, - const NodeAttrMap& node_attr_map, std::unique_ptr* output) { - const std::string& const_op_name = saved_constant.operation(); - const auto& node_name_and_attrs = node_attr_map.find(const_op_name); - if (node_name_and_attrs == node_attr_map.end()) { - return errors::FailedPrecondition( - "Unable to find Const operation with name'", const_op_name, - "' in SavedModel graphdef"); - } - const AttrValueMap* attrs = node_name_and_attrs->second; - const auto& attr_name_and_value = attrs->find("value"); - if (attr_name_and_value == attrs->end()) { - return errors::FailedPrecondition("Unable to find Const operation '", - const_op_name, "'s value attribute"); - } - const TensorProto& tensor_proto = attr_name_and_value->second.tensor(); - return internal::TensorProtoToConstant(ctx, tensor_proto, output); -} - -// Restores all non-function objects in the SavedModel's object graph. -// This function walks through the metagraph's saved object graph, and -// constructs revived versions of SavedVariable, SavedConstant, SavedAsset, and -// SavedResources. These are returned via the `out` parameter. -Status ReviveObjects( - const MetaGraphDef& metagraph, ImmediateExecutionContext* context, - std::unordered_map>* - revived_objects) { - // This is needed to restore "Constant" nodes by looking up their - // "Value" attribute. - NodeAttrMap node_attr_map = internal::NodeToAttrMap(metagraph.graph_def()); - - // Iterate through all the saved objects, restoring objects as we go. - // We don't recreate functions until all other objects have been created. - for (int i = 0; i < metagraph.object_graph_def().nodes_size(); ++i) { - const SavedObject& node = metagraph.object_graph_def().nodes(i); - if (node.kind_case() == SavedObject::kVariable) { - std::unique_ptr variable; - TF_RETURN_IF_ERROR( - internal::LoadSavedVariable(context, node.variable(), &variable)); - (*revived_objects)[i] = std::move(variable); - } else if (node.kind_case() == SavedObject::kConstant) { - std::unique_ptr constant; - TF_RETURN_IF_ERROR(ConstantFromSavedConstant(context, node.constant(), - node_attr_map, &constant)); - (*revived_objects)[i] = std::move(constant); - } else if (node.kind_case() == SavedObject::kAsset) { - // TODO(bmzhao): Implement Asset C++ class. 
This should be just recreating - // the full path to the asset file: - // https://github.com/tensorflow/tensorflow/blob/6a0bdbdb7c48a3491ae1277083ae3dafb4ab4d7a/tensorflow/python/saved_model/load.py#L395-L396 - // and storing it as a string tensor: - // https://github.com/tensorflow/tensorflow/blob/6a0bdbdb7c48a3491ae1277083ae3dafb4ab4d7a/tensorflow/python/training/tracking/tracking.py#L324-L325 - return errors::Unimplemented("SavedAsset loading is not implemented yet"); - } else if (node.kind_case() == SavedObject::kResource) { - // TODO(bmzhao): Figure out how resource loading works and implement it - return errors::Unimplemented( - "SavedResource loading is not implemented yet"); - } - } - return Status(); -} - -Status ReviveFunctions(const MetaGraphDef& metagraph, - const RevivedObjectMap& revived_objects, - ImmediateExecutionContext* context, - ConcreteFunctionMap* restored_functions) { - const FunctionDefMap function_def_map = - internal::FunctionNameToFunctionDefMap(metagraph.graph_def().library()); - - // Iterate through all objects, only examining functions. - for (const SavedObject& node : metagraph.object_graph_def().nodes()) { - if (node.kind_case() == SavedObject::kBareConcreteFunction) { - const std::string& function_name = - node.bare_concrete_function().concrete_function_name(); - - const SavedConcreteFunction& saved_concrete_function = - metagraph.object_graph_def().concrete_functions().at(function_name); - - const FunctionDef* function_def = function_def_map.at(function_name); - std::unique_ptr concrete_function; - TF_RETURN_IF_ERROR(internal::LoadTFConcreteFunction( - saved_concrete_function, function_def, revived_objects, context, - &concrete_function)); - (*restored_functions)[function_name] = std::move(concrete_function); - } else if (node.kind_case() == SavedObject::kFunction) { - // We only allow loading functions that have an annotated input signature, - // which means there is 1:1 correspondence between tf.function - // <=> SavedFunction <=> SavedConcreteFunction <=> FunctionDef. This is - // the same restriction that MLIR has: - // https://github.com/tensorflow/tensorflow/blob/1c064ab76064c58e54261b805027474885a1534d/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc#L2677-L2707 - const SavedFunction& saved_function = node.function(); - if (saved_function.concrete_functions_size() != 1) { - return errors::FailedPrecondition( - "Only tf.functions annotated with an input signature are supported " - "by SavedModelAPI. 
This means that there should only be a single " - "ConcreteFunction per tf.function"); - } - const std::string& function_name = saved_function.concrete_functions(0); - const SavedConcreteFunction& saved_concrete_function = - metagraph.object_graph_def().concrete_functions().at(function_name); - - const FunctionDef* function_def = function_def_map.at(function_name); - - std::unique_ptr concrete_function; - TF_RETURN_IF_ERROR(internal::LoadTFConcreteFunction( - saved_concrete_function, function_def, revived_objects, context, - &concrete_function)); - (*restored_functions)[function_name] = std::move(concrete_function); - } - } - return Status(); -} const TrackableObjectGraph::TrackableObject::SerializedTensor* FindSerializedTensorInTrackable( @@ -230,7 +109,7 @@ FindSerializedTensorInTrackable( // overridden "restore" method: // https://github.com/tensorflow/tensorflow/blob/ddc1bbad3dfd4a089eb96014f26cc16664b1b2f8/tensorflow/python/training/saving/saveable_object.py#L85 Status RestoreCheckpoint(SavedModelV2Bundle* bundle, - const RevivedObjectMap& revived_objects, + const RevivedObjects& revived_objects, const std::string& directory, ImmediateExecutionContext* context) { // TODO(bmzhao): Batch up all the restores into a single restore op per @@ -250,8 +129,7 @@ Status RestoreCheckpoint(SavedModelV2Bundle* bundle, return Status::OK(); } - Variable* variable = - down_cast(revived_objects.at(node).get()); + Variable* variable = revived_objects.variables.at(node).get(); // Restore the tensor's value from the checkpoint const TrackableObjectGraph::TrackableObject::SerializedTensor* @@ -264,6 +142,12 @@ Status RestoreCheckpoint(SavedModelV2Bundle* bundle, } const std::string& checkpoint_key = attribute->checkpoint_key(); + if (!bundle->variable_reader()->Contains(checkpoint_key)) { + LOG(WARNING) << "No checkpoint entry found for " << checkpoint_key + << ". 
Variable will be uninitialized."; + return Status(); + } + std::string variables_path_prefix = io::JoinPath(directory, kSavedModelVariablesDirectory, kSavedModelVariablesFilename); @@ -279,58 +163,86 @@ Status RestoreCheckpoint(SavedModelV2Bundle* bundle, return Status(); } +Status InitializeAllResources(const RevivedObjects& revived) { + for (const auto& node_and_resource : revived.restored_resources) { + const RestoredResource& resource = node_and_resource.second; + TF_RETURN_IF_ERROR(resource.Initialize()); + } + return Status(); +} + } // namespace Status TFSavedModelAPI::GetFunction(const std::string& function_path, ConcreteFunction** function) { - const SavedObject* object = + absl::optional node = internal::FindNodeAtPath(function_path, bundle_.saved_object_graph()); - if (object == nullptr) { + if (!node.has_value()) { return errors::NotFound("No saved object found at path ", function_path); } - if (object->kind_case() == SavedObject::kBareConcreteFunction) { - *function = - concrete_functions_ - .at(object->bare_concrete_function().concrete_function_name()) - .get(); - } else if (object->kind_case() == SavedObject::kFunction) { - *function = - concrete_functions_.at(object->function().concrete_functions(0)).get(); - } else { - return errors::InvalidArgument(function_path, - " is not a path to a Function."); + auto function_iter = revived_objects_.concrete_functions.find(*node); + if (function_iter == revived_objects_.concrete_functions.end()) { + return errors::NotFound("No function found at path ", function_path); } + *function = function_iter->second.get(); return Status(); } Status TFSavedModelAPI::GetSignatureDefFunction( const std::string& signature_def_key, SignatureDefFunction** function) { - // TODO(bmzhao): Add support for retrieving a signaturedef function. 
- return errors::Unimplemented( - "Retrieving SignatureDef functions is unimplemented currently"); + auto signatures_iter = + revived_objects_.signatures_map.find(signature_def_key); + if (signatures_iter == revived_objects_.signatures_map.end()) { + return errors::NotFound("No signature with key ", signature_def_key, + " was found"); + } + int node = signatures_iter->second; + + auto function_iter = revived_objects_.signature_def_functions.find(node); + if (function_iter == revived_objects_.signature_def_functions.end()) { + return errors::Internal( + "Unable to find SignatureDefFunction associated with key ", + signature_def_key, " despite key being valid."); + } + + *function = function_iter->second.get(); + return Status(); } std::vector TFSavedModelAPI::ListFunctions() { std::vector result; - result.reserve(concrete_functions_.size()); - for (auto& index_and_function : concrete_functions_) { + result.reserve(revived_objects_.concrete_functions.size()); + for (auto& index_and_function : revived_objects_.concrete_functions) { result.push_back(index_and_function.second.get()); } return result; } -TFSavedModelAPI::TFSavedModelAPI( - const std::string& directory, SavedModelV2Bundle bundle, - std::unordered_map> - revived_objects, - std::unordered_map> - concrete_functions) +Status TFSavedModelAPI::GetVariable(const std::string& variable_path, + Variable** variable) { + absl::optional node = + internal::FindNodeAtPath(variable_path, bundle_.saved_object_graph()); + if (!node.has_value()) { + return errors::NotFound("No saved object found at path ", variable_path); + } + + auto variables_iter = revived_objects_.variables.find(*node); + if (variables_iter == revived_objects_.variables.end()) { + return errors::NotFound("No variable found at path ", variable_path); + } + + *variable = variables_iter->second.get(); + return Status(); +} + +TFSavedModelAPI::TFSavedModelAPI(const std::string& directory, + SavedModelV2Bundle bundle, + RevivedObjects revived_objects) : directory_(directory), bundle_(std::move(bundle)), - revived_objects_(std::move(revived_objects)), - concrete_functions_(std::move(concrete_functions)) {} + revived_objects_(std::move(revived_objects)) {} Status TFSavedModelAPI::Load( const std::string& directory, @@ -351,28 +263,25 @@ Status TFSavedModelAPI::Load( // This occurs in python here: // https://github.com/tensorflow/tensorflow/blob/285b5fa15405c5e2c084080f52a1818be8648079/tensorflow/python/saved_model/function_deserialization.py#L438-L454 - RevivedObjectMap revived_objects; - TF_RETURN_IF_ERROR( - ReviveObjects(bundle.meta_graph_def(), context, &revived_objects)); + // Step 1: For each node in the graph, we should initialize an object of the + // corresponding type. For objects that depend on the initialization of other + // objects (like functions which capture resources), we will initialize them + // in step 2. + PartiallyRevivedObjects partially_revived_objects; + TF_RETURN_IF_ERROR(internal::PartiallyReviveSavedModelObjects( + bundle.meta_graph_def(), context, directory, &partially_revived_objects)); - // TODO(bmzhao): When we later add support for loading resources, we need to - // handle the case where materializing a function's captures requires invoking - // other functions. 
This occurs when retrieving the resource handle for a - // TrackableResource: - // https://github.com/tensorflow/tensorflow/blob/f19c6efb4a8ba60e2492eedc98ef5375abb39dc7/tensorflow/python/saved_model/load.py#L240 - // https://github.com/tensorflow/tensorflow/blob/f19c6efb4a8ba60e2492eedc98ef5375abb39dc7/tensorflow/python/training/tracking/tracking.py#L233 - // This requires restoring functions in a topological sort order by capture - // dependencies. - ConcreteFunctionMap function_map; - TF_RETURN_IF_ERROR(ReviveFunctions(bundle.meta_graph_def(), revived_objects, - context, &function_map)); + RevivedObjects revived_objects; + TF_RETURN_IF_ERROR(partially_revived_objects.Build( + context, bundle.saved_object_graph(), &revived_objects)); TF_RETURN_IF_ERROR( RestoreCheckpoint(&bundle, revived_objects, directory, context)); + TF_RETURN_IF_ERROR(InitializeAllResources(revived_objects)); + out->reset(new TFSavedModelAPI(directory, std::move(bundle), - std::move(revived_objects), - std::move(function_map))); + std::move(revived_objects))); return Status(); } diff --git a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h index fd07c09474b..bc39a974ad2 100644 --- a/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h @@ -25,8 +25,10 @@ limitations under the License. #include "absl/types/optional.h" #include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tensorhandle_convertible.h" #include "tensorflow/c/experimental/saved_model/core/revived_types/tf_concrete_function.h" +#include "tensorflow/c/experimental/saved_model/core/revived_types/variable.h" #include "tensorflow/c/experimental/saved_model/core/saved_model_api.h" #include "tensorflow/c/experimental/saved_model/core/signature_def_function.h" #include "tensorflow/cc/saved_model/bundle_v2.h" @@ -68,20 +70,15 @@ class TFSavedModelAPI : public SavedModelAPI { ~TFSavedModelAPI() override = default; + Status GetVariable(const std::string& variable_path, Variable** variable); + private: - TFSavedModelAPI( - const std::string& directory, SavedModelV2Bundle bundle, - std::unordered_map> - revived_objects, - std::unordered_map> - concrete_functions); + TFSavedModelAPI(const std::string& directory, SavedModelV2Bundle bundle, + RevivedObjects revived_objects); std::string directory_; SavedModelV2Bundle bundle_; - std::unordered_map> - revived_objects_; - std::unordered_map> - concrete_functions_; + RevivedObjects revived_objects_; }; } // namespace tensorflow diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index c0d121a4aee..06fbc7aef0a 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -9,6 +9,8 @@ # Note(bmzhao): The *.cc files in this directory form the direct implementation of the # C API functions exposed in tf/c/experimental/saved_model/public/. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Note(bmzhao): All *type.h files in this directory are the internal definitions of # the opaque C types. These headers should only be visible to internal tensorflow # implementors. 
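The loader rework above replaces the old single-pass ReviveObjects/ReviveFunctions flow with a two-phase scheme followed by checkpoint restore and resource initialization. The block below is a minimal sketch of that sequence, not part of the change itself: it reuses only names visible in this diff (PartiallyRevivedObjects, RevivedObjects, internal::PartiallyReviveSavedModelObjects, SavedModelV2Bundle), assumes the internal loader header that declares the internal:: functions is also included, and takes `context`, `bundle`, and `directory` from the caller.

```c++
// Sketch of the two-phase load performed by TFSavedModelAPI::Load.
// Assumed setup: the internal loader header declaring
// internal::PartiallyReviveSavedModelObjects is included as well.
#include <string>

#include "tensorflow/c/eager/immediate_execution_context.h"
#include "tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.h"
#include "tensorflow/c/experimental/saved_model/core/revived_types/revived_objects.h"
#include "tensorflow/cc/saved_model/bundle_v2.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/status.h"

namespace tensorflow {

Status TwoPhaseLoadSketch(ImmediateExecutionContext* context,
                          SavedModelV2Bundle* bundle,
                          const std::string& directory,
                          RevivedObjects* revived) {
  // Phase 1: create an object of the matching revived type for every node in
  // the SavedObjectGraph, deferring anything that depends on other objects
  // (for example, functions that capture resource handles).
  PartiallyRevivedObjects partial;
  TF_RETURN_IF_ERROR(internal::PartiallyReviveSavedModelObjects(
      bundle->meta_graph_def(), context, directory, &partial));

  // Phase 2: resolve cross-object dependencies into a complete RevivedObjects.
  TF_RETURN_IF_ERROR(
      partial.Build(context, bundle->saved_object_graph(), revived));

  // TFSavedModelAPI::Load then restores the checkpoint (variables whose
  // checkpoint key is missing are left uninitialized, with a warning) and
  // calls Initialize() on every restored resource.
  return Status();
}

}  // namespace tensorflow
```

GetFunction, GetSignatureDefFunction, and the new GetVariable accessor all follow the same pattern after this load: resolve the user-supplied path or signature key to a node id first (via FindNodeAtPath or the signatures map), then look that id up in the corresponding RevivedObjects container, which is why the per-name concrete_functions_ map is gone.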
@@ -222,6 +224,8 @@ cc_library( ], deps = [ ":signature_def_function_metadata_type", + ":signature_def_param_list", + ":signature_def_param_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", ], @@ -238,6 +242,104 @@ cc_library( ], ) +cc_library( + name = "signature_def_param", + srcs = [ + "signature_def_param.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_param.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":signature_def_param_type", + ":tensor_spec", + ":tensor_spec_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_shape_internal", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + +cc_library( + name = "signature_def_param_type", + hdrs = [ + "signature_def_param_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + +cc_library( + name = "signature_def_param_list", + srcs = [ + "signature_def_param_list.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:signature_def_param_list.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":signature_def_param", + ":signature_def_param_list_type", + ":signature_def_param_type", + "//tensorflow/c:c_api_macros", + ], +) + +cc_library( + name = "signature_def_param_list_type", + hdrs = [ + "signature_def_param_list_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/experimental/saved_model/core:signature_def_function_metadata", + ], +) + +cc_library( + name = "tensor_spec", + srcs = [ + "tensor_spec.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:tensor_spec.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":tensor_spec_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_datatype", + "//tensorflow/c:tf_shape", + "//tensorflow/c:tf_shape_internal", + "//tensorflow/c/experimental/saved_model/core:tensor_spec", + ], +) + +cc_library( + name = "tensor_spec_type", + hdrs = [ + "tensor_spec_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c:tf_shape_internal", + "//tensorflow/c/experimental/saved_model/core:tensor_spec", + ], +) + tf_cc_test( name = "saved_model_api_test", size = "small", @@ -245,16 +347,26 @@ tf_cc_test( "saved_model_api_test.cc", ], data = [ + "//tensorflow/c/experimental/saved_model/internal/testdata:saved_models", "//tensorflow/cc/saved_model:saved_model_half_plus_two", ], deps = [ + ":saved_model_api_type", + "//tensorflow/c:tf_datatype", + "//tensorflow/c:tf_shape", "//tensorflow/c:tf_status", "//tensorflow/c:tf_tensor", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_experimental", "//tensorflow/c/eager:c_api_test_util", + "//tensorflow/c/experimental/saved_model/core:tf_saved_model_api", "//tensorflow/c/experimental/saved_model/public:concrete_function", "//tensorflow/c/experimental/saved_model/public:saved_model_api", + "//tensorflow/c/experimental/saved_model/public:signature_def_function", + "//tensorflow/c/experimental/saved_model/public:signature_def_function_metadata", + "//tensorflow/c/experimental/saved_model/public:signature_def_param", + "//tensorflow/c/experimental/saved_model/public:signature_def_param_list", + 
"//tensorflow/c/experimental/saved_model/public:tensor_spec", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc index 65c6eca5623..2beed8f4119 100644 --- a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -34,15 +34,15 @@ TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { &tensorflow::unwrap(func)->GetFunctionMetadata())); } -TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func, - TFE_TensorHandle** inputs, int num_inputs, - TF_Status* status) { +TFE_Op* TF_ConcreteFunctionMakeCallOp(TF_ConcreteFunction* func, + TFE_TensorHandle** inputs, int num_inputs, + TF_Status* status) { tensorflow::ImmediateOpPtr call_op; absl::Span input_span( reinterpret_cast( tensorflow::unwrap(inputs)), static_cast(num_inputs)); - status->status = tensorflow::unwrap(func)->GetCallOp(input_span, &call_op); + status->status = tensorflow::unwrap(func)->MakeCallOp(input_span, &call_op); if (!status->status.ok()) { return nullptr; } diff --git a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc index e58b232f9c9..5a4f676ec06 100644 --- a/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc +++ b/tensorflow/c/experimental/saved_model/internal/saved_model_api_test.cc @@ -21,15 +21,28 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/experimental/saved_model/core/tf_saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/internal/saved_model_api_type.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h" +#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_shape.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_tensor.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/tstring.h" namespace { +using tensorflow::tstring; + constexpr char kTestData[] = "cc/saved_model/testdata"; const char* kServeTag[] = {"serve"}; @@ -107,7 +120,7 @@ TEST_P(CSavedModelAPITest, LoadsSavedModel) { compute_fn_inputs.push_back(input_a); compute_fn_inputs.push_back(input_b); - TFE_Op* compute_fn_op = TF_ConcreteFunctionGetCallOp( + TFE_Op* compute_fn_op = TF_ConcreteFunctionMakeCallOp( compute_fn, compute_fn_inputs.data(), compute_fn_inputs.size(), status); EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); @@ -137,6 +150,380 @@ TEST_P(CSavedModelAPITest, LoadsSavedModel) { TFE_DeleteContext(ctx); } +// This tests running the "serving_default" SignatureDefFunction from the +// VarsAndArithmeticObjectGraph savedmodel. 
Here's what the signature_defs +// protobuf in the metagraph looks like: +// signature_def: { +// key : "serving_default" +// value: { +// inputs: { +// key : "a" +// value: { +// name : "serving_default_a:0" +// dtype: DT_FLOAT +// tensor_shape: { +// } +// } +// } +// inputs: { +// key : "b" +// value: { +// name : "serving_default_b:0" +// dtype: DT_FLOAT +// tensor_shape: { +// } +// } +// } +// outputs: { +// key : "output_0" +// value: { +// name : "StatefulPartitionedCall:0" +// dtype: DT_FLOAT +// tensor_shape: { +// } +// } +// } +// method_name: "tensorflow/serving/predict" +// } +// } +TEST_P(CSavedModelAPITest, RunsSignatureDefFunction) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + bool use_tfrt = GetParam(); + if (use_tfrt) { + TFE_DeleteContextOptions(opts); + TF_DeleteStatus(status); + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. + } + + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); + + TF_SavedModel* saved_model = + TF_LoadSavedModel(model_dir.c_str(), ctx, status); + + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TF_SignatureDefFunction* serving_default = + TF_GetSavedModelSignatureDefFunction(saved_model, "serving_default", + status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_SignatureDefFunctionMetadata* metadata = + TF_SignatureDefFunctionGetMetadata(serving_default); + + const TF_SignatureDefParamList* args = + TF_SignatureDefFunctionMetadataArgs(metadata); + const TF_SignatureDefParamList* returns = + TF_SignatureDefFunctionMetadataReturns(metadata); + + EXPECT_EQ(TF_SignatureDefParamListSize(args), 2); + const TF_SignatureDefParam* param_a = TF_SignatureDefParamListGet(args, 0); + const TF_TensorSpec* tensor_spec_a = TF_SignatureDefParamTensorSpec(param_a); + const TF_Shape* shape_a = TF_TensorSpecShape(tensor_spec_a); + + // Input "a" is a scalar, float32 tensor + EXPECT_EQ("a", std::string(TF_SignatureDefParamName(param_a))); + EXPECT_EQ(TF_FLOAT, TF_TensorSpecDataType(tensor_spec_a)); + EXPECT_EQ(0, TF_ShapeDims(shape_a)); + + const TF_SignatureDefParam* param_b = TF_SignatureDefParamListGet(args, 1); + const TF_TensorSpec* tensor_spec_b = TF_SignatureDefParamTensorSpec(param_b); + const TF_Shape* shape_b = TF_TensorSpecShape(tensor_spec_b); + + // Input "b" is a scalar, float32 tensor + EXPECT_EQ("b", std::string(TF_SignatureDefParamName(param_b))); + EXPECT_EQ(TF_FLOAT, TF_TensorSpecDataType(tensor_spec_b)); + EXPECT_EQ(0, TF_ShapeDims(shape_b)); + + EXPECT_EQ(TF_SignatureDefParamListSize(returns), 1); + + const TF_SignatureDefParam* param_out = + TF_SignatureDefParamListGet(returns, 0); + const TF_TensorSpec* tensor_spec_out = + TF_SignatureDefParamTensorSpec(param_out); + const TF_Shape* shape_out = TF_TensorSpecShape(tensor_spec_out); + + // Output "output_0" is a scalar, float32 tensor + EXPECT_EQ("output_0", std::string(TF_SignatureDefParamName(param_out))); + EXPECT_EQ(TF_FLOAT, TF_TensorSpecDataType(tensor_spec_out)); + EXPECT_EQ(0, TF_ShapeDims(shape_out)); + + std::vector compute_fn_inputs; + TFE_TensorHandle* input_a = TestScalarTensorHandle(ctx, 2.0f); + TFE_TensorHandle* input_b = TestScalarTensorHandle(ctx, 1.0f); + compute_fn_inputs.push_back(input_a); + compute_fn_inputs.push_back(input_b); + + TFE_Op* 
serving_default_op = TF_SignatureDefFunctionMakeCallOp( + serving_default, compute_fn_inputs.data(), compute_fn_inputs.size(), + status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + std::vector compute_fn_outputs( + TF_SignatureDefParamListSize(returns)); + int num_retvals = TF_SignatureDefParamListSize(returns); + + TFE_Execute(serving_default_op, compute_fn_outputs.data(), &num_retvals, + status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* result = TFE_TensorHandleResolve(compute_fn_outputs[0], status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + EXPECT_EQ(TF_NumDims(result), 0); + float output_value = *static_cast(TF_TensorData(result)); + // (1 + 2) * (2 + 1) / 3 + 5 should be 8 + EXPECT_FLOAT_EQ(output_value, 8.0); + + TF_DeleteTensor(result); + TFE_DeleteTensorHandle(compute_fn_outputs[0]); + TFE_DeleteTensorHandle(input_a); + TFE_DeleteTensorHandle(input_b); + TFE_DeleteOp(serving_default_op); + TF_DeleteSavedModel(saved_model); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + +TEST_P(CSavedModelAPITest, LoadsAssetSavedModel) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + bool use_tfrt = GetParam(); + if (use_tfrt) { + TFE_DeleteContextOptions(opts); + TF_DeleteStatus(status); + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. + } + + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::string model_dir = SavedModelPath("AssetModule"); + + TF_SavedModel* saved_model = + TF_LoadSavedModel(model_dir.c_str(), ctx, status); + + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TF_ConcreteFunction* read_file_fn = + TF_GetSavedModelConcreteFunction(saved_model, "read_file", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* read_file_op = + TF_ConcreteFunctionMakeCallOp(read_file_fn, nullptr, 0, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // TODO(bmzhao): Finish API on FunctionMetadata args, so we know how many + // inputs + outputs a function has. + TFE_TensorHandle* read_file_fn_outputs[1] = {nullptr}; + int num_retvals = 1; + + TFE_Execute(read_file_op, &read_file_fn_outputs[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* result = TFE_TensorHandleResolve(read_file_fn_outputs[0], status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + EXPECT_EQ(TF_NumDims(result), 0); + tensorflow::tstring* output_value = + static_cast(TF_TensorData(result)); + std::string file_contents(*output_value); + EXPECT_NE(file_contents.find("TEST ASSET FILE CONTENTS"), std::string::npos); + + TF_DeleteTensor(result); + TFE_DeleteTensorHandle(read_file_fn_outputs[0]); + TFE_DeleteOp(read_file_op); + TF_DeleteSavedModel(saved_model); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + +TEST_P(CSavedModelAPITest, LoadsStaticHashtableSavedModel) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + bool use_tfrt = GetParam(); + if (use_tfrt) { + TFE_DeleteContextOptions(opts); + TF_DeleteStatus(status); + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. 
+ } + + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::string model_dir = SavedModelPath("StaticHashTableModule"); + + TF_SavedModel* saved_model = + TF_LoadSavedModel(model_dir.c_str(), ctx, status); + + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TF_ConcreteFunction* lookup_fn = + TF_GetSavedModelConcreteFunction(saved_model, "lookup", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // Note(bmzhao): Based on static_hashtable_asset.txt, we expect the following + // mapping: + // "foo" -> 0 + // "bar" -> 1 + // "baz" -> 2 + // "wombat" -> 3 + // all other strings -> -1 + + // Call lookup function with input "foo", expecting an output of 0 + { + std::vector lookup_fn_inputs; + TFE_TensorHandle* input_foo = TestScalarTensorHandle(ctx, tstring("foo")); + lookup_fn_inputs.push_back(input_foo); + + TFE_Op* lookup_op = TF_ConcreteFunctionMakeCallOp( + lookup_fn, lookup_fn_inputs.data(), lookup_fn_inputs.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // TODO(bmzhao): Finish API on FunctionMetadata args, so we know how many + // inputs + outputs a function has. + TFE_TensorHandle* lookup_fn_outputs[1] = {nullptr}; + int num_retvals = 1; + + TFE_Execute(lookup_op, &lookup_fn_outputs[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* result = TFE_TensorHandleResolve(lookup_fn_outputs[0], status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + EXPECT_EQ(TF_NumDims(result), 0); + tensorflow::int64* output_value = + static_cast(TF_TensorData(result)); + EXPECT_EQ(*output_value, 0); + + TF_DeleteTensor(result); + TFE_DeleteTensorHandle(input_foo); + TFE_DeleteTensorHandle(lookup_fn_outputs[0]); + TFE_DeleteOp(lookup_op); + } + + // Call lookup function with input "baz", expecting an output of 2 + { + std::vector lookup_fn_inputs; + TFE_TensorHandle* input_foo = TestScalarTensorHandle(ctx, tstring("baz")); + lookup_fn_inputs.push_back(input_foo); + + TFE_Op* lookup_op = TF_ConcreteFunctionMakeCallOp( + lookup_fn, lookup_fn_inputs.data(), lookup_fn_inputs.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // TODO(bmzhao): Finish API on FunctionMetadata args, so we know how many + // inputs + outputs a function has. 
+ TFE_TensorHandle* lookup_fn_outputs[1] = {nullptr}; + int num_retvals = 1; + + TFE_Execute(lookup_op, &lookup_fn_outputs[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* result = TFE_TensorHandleResolve(lookup_fn_outputs[0], status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + EXPECT_EQ(TF_NumDims(result), 0); + tensorflow::int64* output_value = + static_cast(TF_TensorData(result)); + EXPECT_EQ(*output_value, 2); + + TF_DeleteTensor(result); + TFE_DeleteTensorHandle(input_foo); + TFE_DeleteTensorHandle(lookup_fn_outputs[0]); + TFE_DeleteOp(lookup_op); + } + + // Call lookup function w/input "NON-EXISTENT-KEY", expecting an output of -1 + { + std::vector lookup_fn_inputs; + TFE_TensorHandle* input_foo = + TestScalarTensorHandle(ctx, tstring("NON-EXISTENT-KEY")); + lookup_fn_inputs.push_back(input_foo); + + TFE_Op* lookup_op = TF_ConcreteFunctionMakeCallOp( + lookup_fn, lookup_fn_inputs.data(), lookup_fn_inputs.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // TODO(bmzhao): Finish API on FunctionMetadata args, so we know how many + // inputs + outputs a function has. + TFE_TensorHandle* lookup_fn_outputs[1] = {nullptr}; + int num_retvals = 1; + + TFE_Execute(lookup_op, &lookup_fn_outputs[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TF_Tensor* result = TFE_TensorHandleResolve(lookup_fn_outputs[0], status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + EXPECT_EQ(TF_NumDims(result), 0); + tensorflow::int64* output_value = + static_cast(TF_TensorData(result)); + EXPECT_EQ(*output_value, -1); + + TF_DeleteTensor(result); + TFE_DeleteTensorHandle(input_foo); + TFE_DeleteTensorHandle(lookup_fn_outputs[0]); + TFE_DeleteOp(lookup_op); + } + + TF_DeleteSavedModel(saved_model); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + +TEST_P(CSavedModelAPITest, LoadSavedModelWithUninitializedVariable) { + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + bool use_tfrt = GetParam(); + if (use_tfrt) { + TFE_DeleteContextOptions(opts); + TF_DeleteStatus(status); + GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. 
+ } + + TFE_ContextOptionsSetTfrt(opts, use_tfrt); + + TFE_Context* ctx = TFE_NewContext(opts, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + std::string model_dir = tensorflow::io::JoinPath( + tensorflow::testing::TensorFlowSrcRoot(), + "c/experimental/saved_model/internal/testdata/UninitializedVariable"); + + TF_SavedModel* saved_model = + TF_LoadSavedModel(model_dir.c_str(), ctx, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + tensorflow::TFSavedModelAPI* model_api = + tensorflow::down_cast( + tensorflow::unwrap(saved_model)); + tensorflow::Variable* uninitialized_variable; + ASSERT_EQ(tensorflow::Status::OK(), + model_api->GetVariable("uninitialized_variable", + &uninitialized_variable)); + ASSERT_EQ(tensorflow::DT_FLOAT, uninitialized_variable->dtype()); + + ASSERT_EQ(tensorflow::Status::OK(), + model_api->GetVariable("sub_module.uninitialized_variable", + &uninitialized_variable)); + ASSERT_EQ(tensorflow::DT_INT64, uninitialized_variable->dtype()); + + TF_DeleteSavedModel(saved_model); + TF_DeleteStatus(status); + TFE_DeleteContext(ctx); +} + INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticSavedModelTests, CSavedModelAPITest, ::testing::Bool()); diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc index c5c3616211c..1c547a94155 100644 --- a/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata.cc @@ -16,5 +16,18 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" #include "tensorflow/c/experimental/saved_model/internal/signature_def_function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h" -// TODO(bmzhao): Add getter functions here as necessary. +extern "C" { + +extern const TF_SignatureDefParamList* TF_SignatureDefFunctionMetadataArgs( + const TF_SignatureDefFunctionMetadata* list) { + return tensorflow::wrap(&tensorflow::unwrap(list)->arguments()); +} + +extern const TF_SignatureDefParamList* TF_SignatureDefFunctionMetadataReturns( + const TF_SignatureDefFunctionMetadata* list) { + return tensorflow::wrap(&tensorflow::unwrap(list)->returns()); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_param.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_param.cc new file mode 100644 index 00000000000..ac54f8f5700 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_param.cc @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h" + +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h" +#include "tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h" + +extern "C" { + +extern const char* TF_SignatureDefParamName(const TF_SignatureDefParam* param) { + return tensorflow::unwrap(param)->name().c_str(); +} + +extern const TF_TensorSpec* TF_SignatureDefParamTensorSpec( + const TF_SignatureDefParam* param) { + return tensorflow::wrap(&tensorflow::unwrap(param)->spec()); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_param_list.cc b/tensorflow/c/experimental/saved_model/internal/signature_def_param_list.cc new file mode 100644 index 00000000000..328f21635c3 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_param_list.cc @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h" + +#include "tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h" +#include "tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h" + +extern "C" { + +extern size_t TF_SignatureDefParamListSize( + const TF_SignatureDefParamList* list) { + return tensorflow::unwrap(list)->size(); +} + +extern const TF_SignatureDefParam* TF_SignatureDefParamListGet( + const TF_SignatureDefParamList* list, int i) { + return tensorflow::wrap(&tensorflow::unwrap(list)->at(i)); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h b/tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h new file mode 100644 index 00000000000..6f535110cee --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_param_list_type.h @@ -0,0 +1,33 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefParamList TF_SignatureDefParamList; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(std::vector, + TF_SignatureDefParamList) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_LIST_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h b/tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h new file mode 100644 index 00000000000..fd634bcddb0 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/signature_def_param_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/signature_def_function_metadata.h" + +typedef struct TF_SignatureDefParam TF_SignatureDefParam; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::SignatureDefParam, TF_SignatureDefParam) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_SIGNATURE_DEF_PARAM_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/tensor_spec.cc b/tensorflow/c/experimental/saved_model/internal/tensor_spec.cc new file mode 100644 index 00000000000..f310adef449 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensor_spec.cc @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h" + +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" +#include "tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h" +#include "tensorflow/c/tf_shape_internal.h" + +extern "C" { + +TF_DataType TF_TensorSpecDataType(const TF_TensorSpec* spec) { + return static_cast(tensorflow::unwrap(spec)->dtype()); +} + +const TF_Shape* TF_TensorSpecShape(const TF_TensorSpec* spec) { + return tensorflow::wrap(&tensorflow::unwrap(spec)->shape()); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h b/tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h new file mode 100644 index 00000000000..7284c8a8fb2 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensor_spec_type.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_TENSOR_SPEC_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_TENSOR_SPEC_TYPE_H_ + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/experimental/saved_model/core/tensor_spec.h" + +typedef struct TF_TensorSpec TF_TensorSpec; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::TensorSpec, TF_TensorSpec) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_TENSOR_SPEC_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD new file mode 100644 index 00000000000..f446401ae77 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD @@ -0,0 +1,37 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow:tensorflow.bzl", "py_strict_binary") + +package( + licenses = ["notice"], # Apache 2.0 +) + +# Run this binary manually, with an argument pointing to the testdata/ +# directory, to generate the test files used by the filegroup rule below. +py_strict_binary( + name = "gen_saved_models", + srcs = ["gen_saved_models.py"], + python_version = "PY3", + deps = [ + "//tensorflow/python:dtypes", + "//tensorflow/python:platform", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:tensor_spec", + "//tensorflow/python:variables", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/module", + "//tensorflow/python/saved_model", + "//tensorflow/python/saved_model:save_options", + ], +) + +# Files generated by the binary above. 
+filegroup( + name = "saved_models", + srcs = glob([ + "UninitializedVariable/**", + ]), + visibility = [ + "//tensorflow/c/experimental/saved_model/internal:__pkg__", + ], +) diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/saved_model.pb b/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/saved_model.pb new file mode 100644 index 00000000000..81ce8fe662b Binary files /dev/null and b/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/saved_model.pb differ diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/variables/variables.data-00000-of-00001 b/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..b68ed0f5a6e Binary files /dev/null and b/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/variables/variables.index b/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/variables/variables.index new file mode 100644 index 00000000000..ed07d0514c7 Binary files /dev/null and b/tensorflow/c/experimental/saved_model/internal/testdata/UninitializedVariable/variables/variables.index differ diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/gen_saved_models.py b/tensorflow/c/experimental/saved_model/internal/testdata/gen_saved_models.py new file mode 100644 index 00000000000..f2a8bd5a9a4 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/testdata/gen_saved_models.py @@ -0,0 +1,84 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Lint as: python3 +"""Creates saved models used for testing. + +This executable should be run with an argument pointing to the testdata/ folder +in this directory. It will re-generate the saved models that are used for +testing. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import google_type_annotations +from __future__ import print_function + +import os + +from tensorflow.python.compat import v2_compat + +from tensorflow.python.eager import def_function +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_spec +from tensorflow.python.module import module +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import app +from tensorflow.python.saved_model import saved_model + + +def _gen_uninitialized_variable(base_dir): + """Generates a saved model with an uninitialized variable.""" + + class SubModule(module.Module): + """A module with an UninitializedVariable.""" + + def __init__(self): + self.uninitialized_variable = resource_variable_ops.UninitializedVariable( + name="uninitialized_variable", dtype=dtypes.int64) + + class Module(module.Module): + """A module with an UninitializedVariable.""" + + def __init__(self): + super(Module, self).__init__() + self.sub_module = SubModule() + self.initialized_variable = variables.Variable( + 1.0, name="initialized_variable") + # An UninitializedVariable with the same name as the variable in the + # SubModule, but with a different type. + self.uninitialized_variable = resource_variable_ops.UninitializedVariable( + name="uninitialized_variable", dtype=dtypes.float32) + + @def_function.function( + input_signature=[tensor_spec.TensorSpec((), dtypes.float32)]) + def compute(self, value): + return self.initialized_variable + value + + to_save = Module() + saved_model.save( + to_save, export_dir=os.path.join(base_dir, "UninitializedVariable")) + + +def main(args): + if len(args) != 2: + raise app.UsageError("Expected one argument (base_dir).") + _, base_dir = args + _gen_uninitialized_variable(base_dir) + + +if __name__ == "__main__": + v2_compat.enable_v2_behavior() + app.run(main) diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index d29585ae1ba..4198b0e7ee7 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -8,6 +8,8 @@ # programmatic checks that all "public" headers only include other "public" # headers. 
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( # This is intentionally public default_visibility = [ @@ -26,6 +28,9 @@ exports_files( "saved_model_api.h", "signature_def_function.h", "signature_def_function_metadata.h", + "signature_def_param.h", + "signature_def_param_list.h", + "tensor_spec.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -43,6 +48,9 @@ cc_library( ":saved_model_api", ":signature_def_function", ":signature_def_function_metadata", + ":signature_def_param", + ":signature_def_param_list", + ":tensor_spec", ], ) @@ -75,3 +83,18 @@ alias( name = "signature_def_function_metadata", actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_function_metadata", ) + +alias( + name = "signature_def_param", + actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_param", +) + +alias( + name = "signature_def_param_list", + actual = "//tensorflow/c/experimental/saved_model/internal:signature_def_param_list", +) + +alias( + name = "tensor_spec", + actual = "//tensorflow/c/experimental/saved_model/internal:tensor_spec", +) diff --git a/tensorflow/c/experimental/saved_model/public/README.md b/tensorflow/c/experimental/saved_model/public/README.md new file mode 100644 index 00000000000..9b3f392d7a8 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/README.md @@ -0,0 +1,28 @@ +# TensorFlow Saved Model C API + +## Small ConcreteFunction Example + +The following example loads a saved model from `"/path/to/model"` and +executes a function `f` taking no arguments and returning one single +value (error checking is omitted for simplicity): + +```c +TF_Status* status = TF_NewStatus(); +TFE_ContextOptions* ctx_options = TFE_NewContextOptions(); +TFE_Context* ctx = TFE_NewContext(ctx_options, status); + +TF_SavedModel* saved_model = TF_LoadSavedModel("/path/to/model", ctx, status); +TF_ConcreteFunction* f = TF_GetSavedModelConcreteFunction(saved_model, "f", status); +TFE_Op* op = TF_ConcreteFunctionMakeCallOp(f, NULL, 0, status); + +TFE_TensorHandle* output; +int nouts = 1; +TFE_Execute(op, &output, &nouts, status); + +TFE_DeleteTensorHandle(output); +TFE_DeleteOp(op); +TFE_DeleteSavedModel(saved_model); +TFE_DeleteContext(ctx); +TFE_DeleteContextOptions(ctx_options); +TF_DeleteStatus(status); +``` diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index cedb9de66b8..68f1ece2991 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -23,6 +23,9 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" #include "tensorflow/c/experimental/saved_model/public/signature_def_function.h" #include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h" +#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index 0fd0f70cf16..ff8a245961a 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -47,7 +47,7 @@ TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( // high-level API here. A strawman for what this interface could look like: // TF_Value* TF_ExecuteFunction(TFE_Context*, TF_ConcreteFunction*, TF_Value* // inputs, int num_inputs, TF_Status* status); -TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionGetCallOp( +TF_CAPI_EXPORT extern TFE_Op* TF_ConcreteFunctionMakeCallOp( TF_ConcreteFunction* func, TFE_TensorHandle** inputs, int num_inputs, TF_Status* status); diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h b/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h index 6f4459732c4..b7a7f67eb19 100644 --- a/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h +++ b/tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ #define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_FUNCTION_METADATA_H_ +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h" + #ifdef __cplusplus extern "C" { #endif // __cplusplus @@ -24,6 +27,18 @@ extern "C" { // SavedModel. typedef struct TF_SignatureDefFunctionMetadata TF_SignatureDefFunctionMetadata; +// Retrieves the arguments of the SignatureDefFunction. The caller is not +// responsible for freeing the returned pointer. +TF_CAPI_EXPORT extern const TF_SignatureDefParamList* +TF_SignatureDefFunctionMetadataArgs( + const TF_SignatureDefFunctionMetadata* list); + +// Retrieves the returns of the SignatureDefFunction. The caller is not +// responsible for freeing the returned pointer. +TF_CAPI_EXPORT extern const TF_SignatureDefParamList* +TF_SignatureDefFunctionMetadataReturns( + const TF_SignatureDefFunctionMetadata* list); + #ifdef __cplusplus } // end extern "C" #endif // __cplusplus diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_param.h b/tensorflow/c/experimental/saved_model/public/signature_def_param.h new file mode 100644 index 00000000000..82993d7fedf --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/signature_def_param.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_H_ + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that contains metadata of an input/output of a +// TF_SignatureDefFunction loaded from a SavedModel. +typedef struct TF_SignatureDefParam TF_SignatureDefParam; + +// Returns the name of the given parameter. The caller is not responsible for +// freeing the returned char*. +TF_CAPI_EXPORT extern const char* TF_SignatureDefParamName( + const TF_SignatureDefParam* param); + +// Returns the TensorSpec associated with the given parameter. The caller is +// not responsible for freeing the returned TF_TensorSpec*. +TF_CAPI_EXPORT extern const TF_TensorSpec* TF_SignatureDefParamTensorSpec( + const TF_SignatureDefParam* param); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_H_ diff --git a/tensorflow/c/experimental/saved_model/public/signature_def_param_list.h b/tensorflow/c/experimental/saved_model/public/signature_def_param_list.h new file mode 100644 index 00000000000..0cb3a0d6d33 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/signature_def_param_list.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_LIST_H_ + +#include <stddef.h> + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that contains metadata of an input/output of a +// ConcreteFunction loaded from a SavedModel. +typedef struct TF_SignatureDefParamList TF_SignatureDefParamList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_SignatureDefParamListSize( + const TF_SignatureDefParamList* list); + +// Returns the `i`th TF_SignatureDefParam in the list.
+TF_CAPI_EXPORT extern const TF_SignatureDefParam* TF_SignatureDefParamListGet( + const TF_SignatureDefParamList* list, int i); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_SIGNATURE_DEF_PARAM_LIST_H_ diff --git a/tensorflow/c/experimental/saved_model/public/tensor_spec.h b/tensorflow/c/experimental/saved_model/public/tensor_spec.h new file mode 100644 index 00000000000..82972ef74ef --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/tensor_spec.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSOR_SPEC_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSOR_SPEC_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/c/tf_shape.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type corresponding to TensorSpec +typedef struct TF_TensorSpec TF_TensorSpec; + +// Returns the dtype associated with the TensorSpec. +TF_CAPI_EXPORT extern TF_DataType TF_TensorSpecDataType( + const TF_TensorSpec* spec); + +// Returns the shape associated with the TensorSpec. The returned Shape is not +// owned by the caller. Caller must not call TF_DeleteShape on the returned +// shape. +TF_CAPI_EXPORT extern const TF_Shape* TF_TensorSpecShape( + const TF_TensorSpec* spec); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSOR_SPEC_H_ diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD index 7daa311d461..214313c960a 100644 --- a/tensorflow/c/experimental/stream_executor/BUILD +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -1,6 +1,7 @@ # Description: # StreamExecutor C API. 
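Taken together, the new `signature_def_param.h`, `signature_def_param_list.h`, and `tensor_spec.h` headers above let a caller walk a function's signature metadata. A minimal sketch, assuming the `TF_SignatureDefFunctionMetadata*` was obtained from a loaded function elsewhere; the helper name is hypothetical:

```c
#include <stddef.h>
#include <stdio.h>

#include "tensorflow/c/experimental/saved_model/public/signature_def_function_metadata.h"
#include "tensorflow/c/experimental/saved_model/public/signature_def_param.h"
#include "tensorflow/c/experimental/saved_model/public/signature_def_param_list.h"
#include "tensorflow/c/experimental/saved_model/public/tensor_spec.h"

/* Hypothetical helper: prints the name and dtype of every parameter in `list`. */
static void PrintSignatureDefParams(const TF_SignatureDefParamList* list) {
  for (size_t i = 0; i < TF_SignatureDefParamListSize(list); ++i) {
    const TF_SignatureDefParam* param = TF_SignatureDefParamListGet(list, (int)i);
    const TF_TensorSpec* spec = TF_SignatureDefParamTensorSpec(param);
    printf("%s: dtype=%d\n", TF_SignatureDefParamName(param),
           (int)TF_TensorSpecDataType(spec));
    /* TF_TensorSpecShape(spec) returns a TF_Shape* owned by the spec;
       do not call TF_DeleteShape on it. */
  }
}

/* Usage, given metadata for a loaded SignatureDefFunction:
     PrintSignatureDefParams(TF_SignatureDefFunctionMetadataArgs(metadata));
     PrintSignatureDefParams(TF_SignatureDefFunctionMetadataReturns(metadata)); */
```

The lists, params, and tensor specs are all owned by the metadata, matching the "caller is not responsible for freeing" contract stated in these headers.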
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", @@ -10,17 +11,29 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "stream_executor_hdrs", + hdrs = ["stream_executor.h"], + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/c:c_api_macros", + "//tensorflow/c:tf_status", + ], +) + cc_library( name = "stream_executor", srcs = ["stream_executor.cc"], hdrs = ["stream_executor.h"], - visibility = ["//visibility:public"], + visibility = ["//tensorflow:internal"], deps = [ ":stream_executor_internal", "//tensorflow/c:c_api_macros", "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_helper", "//tensorflow/core:lib", + "//tensorflow/core/platform:regexp", + "//tensorflow/core/platform:strcat", "//tensorflow/stream_executor:executor_cache", "//tensorflow/stream_executor:multi_platform_manager", "//tensorflow/stream_executor:platform", diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc index 0e55ba3d72a..ec2bada791e 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -28,10 +28,14 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/stream_executor/executor_cache.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" +#include "tensorflow/stream_executor/stream.h" #include "tensorflow/stream_executor/stream_executor_internal.h" #include "tensorflow/stream_executor/stream_executor_pimpl.h" #include "tensorflow/stream_executor/timer.h" @@ -39,6 +43,8 @@ limitations under the License. using tensorflow::StatusFromTF_Status; namespace stream_executor { +using tensorflow::StringPiece; + namespace { #define VALIDATE_STRUCT_SIZE(STRUCT_NAME, STRUCT_OBJ, SIZE_VALUE_NAME) \ @@ -58,17 +64,50 @@ namespace { } \ } while (0) +port::Status ValidateDeviceType(StringPiece type) { + // Validate device type. Device type must start with a capital letter and + // consist of capital letters and underscores. Reasoning behind this decision: + // * At the minimum we want to disallow '/' and ':' since + // these characters are used in device spec, for e.g. + // /job:foo/replica:12/device:GPU:1. + // * Underscores seem useful, for e.g. XLA_GPU uses underscores. + // * Allowing lowercase might get confusing. For example, say someone + // registers a new type called "Gpu". It might be confusing for users that + // "Gpu" is not the same device type as "GPU". 
+ // Note that lowercase "cpu" and "gpu" are currently supported only for + // legacy reasons: + // https://cs.opensource.google/tensorflow/tensorflow/+/master:tensorflow/python/framework/device_spec.py;l=46;drc=d3a378f9665d8eee827c74cb9ecbee81e4c288dd + static const LazyRE2 kTfDeviceTypeRegEx = {"[A-Z][A-Z_]*"}; + bool matches = RE2::FullMatch(type, *kTfDeviceTypeRegEx); + if (!matches) { + return port::FailedPreconditionError( + tensorflow::strings::StrCat("Device name/type '", type, "' must match ", + kTfDeviceTypeRegEx->pattern(), ".")); + } + return port::Status::OK(); +} + port::Status ValidateSPPlatform(const SP_Platform& platform) { VALIDATE_STRUCT_SIZE(SP_Platform, platform, SP_PLATFORM_STRUCT_SIZE); VALIDATE_MEMBER(SP_Platform, platform, name); VALIDATE_MEMBER(SP_Platform, platform, type); - VALIDATE_MEMBER(SP_Platform, platform, visible_device_count); - VALIDATE_MEMBER(SP_Platform, platform, create_device); - VALIDATE_MEMBER(SP_Platform, platform, destroy_device); - VALIDATE_MEMBER(SP_Platform, platform, create_stream_executor); - VALIDATE_MEMBER(SP_Platform, platform, destroy_stream_executor); - VALIDATE_MEMBER(SP_Platform, platform, create_timer_fns); - VALIDATE_MEMBER(SP_Platform, platform, destroy_timer_fns); + TF_RETURN_IF_ERROR(ValidateDeviceType(platform.name)); + TF_RETURN_IF_ERROR(ValidateDeviceType(platform.type)); + // `visible_device_count` could be 0 at initialization time. + return port::Status::OK(); +} + +port::Status ValidateSPPlatformFns(const SP_PlatformFns& platform_fns) { + VALIDATE_STRUCT_SIZE(SP_PlatformFns, platform_fns, + SP_PLATFORM_FNS_STRUCT_SIZE); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_device); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_device); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_stream_executor); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_stream_executor); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_timer_fns); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_timer_fns); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, create_device_fns); + VALIDATE_MEMBER(SP_PlatformFns, platform_fns, destroy_device_fns); return port::Status::OK(); } @@ -97,11 +136,24 @@ port::Status ValidateSPDevice(const SP_Device& device) { return port::Status::OK(); } -port::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se) { +port::Status ValidateSPDeviceFns(const SP_DeviceFns& device_fns) { + VALIDATE_STRUCT_SIZE(SP_DeviceFns, device_fns, SP_DEVICE_FNS_STRUCT_SIZE); + // All other fields could theoretically be zero/null. 
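To make the `ValidateDeviceType` rule above concrete, a few example values (illustrative only, not taken from this change):

```c
/* Accepted by ValidateDeviceType's pattern [A-Z][A-Z_]*:
     "MY_DEVICE", "XLA_GPU", "GPU"
   Rejected, since lowercase letters, digits, ':' and '/' do not match:
     "MyDevice", "GPU:0", "/device:GPU:1" */
static const char kExamplePlatformName[] = "MY_DEVICE"; /* passes validation */
```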
+ return port::Status::OK(); +} + +port::Status ValidateSPStreamExecutor(const SP_StreamExecutor& se, + const SP_Platform& platform) { VALIDATE_STRUCT_SIZE(SP_StreamExecutor, se, SP_STREAM_EXECUTOR_STRUCT_SIZE); VALIDATE_MEMBER(SP_StreamExecutor, se, allocate); VALIDATE_MEMBER(SP_StreamExecutor, se, deallocate); VALIDATE_MEMBER(SP_StreamExecutor, se, get_allocator_stats); + VALIDATE_MEMBER(SP_StreamExecutor, se, host_memory_allocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, host_memory_deallocate); + if (platform.supports_unified_memory) { + VALIDATE_MEMBER(SP_StreamExecutor, se, unified_memory_allocate); + VALIDATE_MEMBER(SP_StreamExecutor, se, unified_memory_deallocate); + } VALIDATE_MEMBER(SP_StreamExecutor, se, device_memory_usage); VALIDATE_MEMBER(SP_StreamExecutor, se, create_stream); VALIDATE_MEMBER(SP_StreamExecutor, se, destroy_stream); @@ -131,9 +183,9 @@ port::Status ValidateSEPlatformRegistrationParams( VALIDATE_STRUCT_SIZE(SE_PlatformRegistrationParams, params, SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE); VALIDATE_MEMBER(SE_PlatformRegistrationParams, params, destroy_platform); + VALIDATE_MEMBER(SE_PlatformRegistrationParams, params, destroy_platform_fns); return port::Status::OK(); } - #undef VALIDATE_MEMBER struct TFStatusDeleter { @@ -297,19 +349,23 @@ void HostCallbackTrampoline(void* ctx, TF_Status* status) { class CStreamExecutor : public internal::StreamExecutorInterface { public: - explicit CStreamExecutor(SP_Device device, - void (*destroy_device)(SP_Device* const device), + explicit CStreamExecutor(SP_Device device, SP_DeviceFns* device_fns, SP_StreamExecutor* stream_executor, + SP_Platform* platform, SP_PlatformFns* platform_fns, SP_TimerFns* timer_fns, const std::string& name, int visible_device_count) : device_(std::move(device)), - destroy_device_(destroy_device), + device_fns_(device_fns), stream_executor_(stream_executor), + platform_(platform), + platform_fns_(platform_fns), timer_fns_(timer_fns), platform_name_(name), visible_device_count_(visible_device_count) {} - ~CStreamExecutor() override { destroy_device_(&device_); } + ~CStreamExecutor() override { + platform_fns_->destroy_device(platform_, &device_); + } port::Status Init(int device_ordinal, DeviceOptions device_options) override { return port::Status::OK(); @@ -348,6 +404,16 @@ class CStreamExecutor : public internal::StreamExecutorInterface { bool HostMemoryRegister(void* mem, uint64 size) override { return false; } bool HostMemoryUnregister(void* mem) override { return false; } + void* UnifiedMemoryAllocate(uint64 size) override { + CHECK(stream_executor_->unified_memory_allocate); + return stream_executor_->unified_memory_allocate(&device_, size); + } + + void UnifiedMemoryDeallocate(void* mem) override { + CHECK(stream_executor_->unified_memory_deallocate); + stream_executor_->unified_memory_deallocate(&device_, mem); + } + absl::optional GetAllocatorStats() override { SP_AllocatorStats c_stats{SP_ALLOCATORSTATS_STRUCT_SIZE}; TF_Bool has_stats = @@ -597,11 +663,19 @@ class CStreamExecutor : public internal::StreamExecutorInterface { port::Status BlockHostUntilDone(Stream* stream) override { OwnedTFStatus c_status(TF_NewStatus()); + SP_Stream stream_handle = + static_cast(stream->implementation())->Handle(); + + // If `block_host_until_done` is set, use it. 
+ if (stream_executor_->block_host_until_done != nullptr) { + stream_executor_->block_host_until_done(&device_, stream_handle, + c_status.get()); + return StatusFromTF_Status(c_status.get()); + } + // Create and record an event and then wait for it. SP_Event event_handle; stream_executor_->create_event(&device_, &event_handle, c_status.get()); TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get())); - SP_Stream stream_handle = - static_cast(stream->implementation())->Handle(); stream_executor_->record_event(&device_, stream_handle, event_handle, c_status.get()); port::Status s = StatusFromTF_Status(c_status.get()); @@ -644,9 +718,35 @@ class CStreamExecutor : public internal::StreamExecutorInterface { // Ownership is transferred to the caller. port::StatusOr> CreateDeviceDescription() const override { - // TODO(annarev): Figure out if we need to support more description fields. + OwnedTFStatus c_status(TF_NewStatus()); + internal::DeviceDescriptionBuilder builder; - builder.set_name(platform_name_); + if (device_.hardware_name != nullptr) { + builder.set_name(device_.hardware_name); + } + if (device_.device_vendor != nullptr) { + builder.set_device_vendor(device_.device_vendor); + } + if (device_.pci_bus_id != nullptr) { + builder.set_pci_bus_id(device_.pci_bus_id); + } + + if (device_fns_->get_numa_node != nullptr) { + int32_t numa_node = device_fns_->get_numa_node(&device_); + if (numa_node >= 0) { + builder.set_numa_node(numa_node); + } + } + + if (device_fns_->get_memory_bandwidth != nullptr) { + int64_t memory_bandwidth = device_fns_->get_memory_bandwidth(&device_); + if (memory_bandwidth >= 0) { + builder.set_memory_bandwidth(memory_bandwidth); + } + } + // TODO(annarev): Add gflops field in DeviceDescription and set it here. + // TODO(annarev): Perhaps add `supports_unified_memory` in + // DeviceDescription. 
return builder.Build(); } @@ -674,8 +774,10 @@ class CStreamExecutor : public internal::StreamExecutorInterface { private: SP_Device device_; - void (*destroy_device_)(SP_Device* const device); + SP_DeviceFns* device_fns_; SP_StreamExecutor* stream_executor_; + SP_Platform* platform_; + SP_PlatformFns* platform_fns_; SP_TimerFns* timer_fns_; std::string platform_name_; int visible_device_count_; @@ -684,18 +786,26 @@ class CStreamExecutor : public internal::StreamExecutorInterface { CPlatform::CPlatform(SP_Platform platform, void (*destroy_platform)(SP_Platform*), - SP_StreamExecutor stream_executor, SP_TimerFns timer_fns) + SP_PlatformFns platform_fns, + void (*destroy_platform_fns)(SP_PlatformFns*), + SP_DeviceFns device_fns, SP_StreamExecutor stream_executor, + SP_TimerFns timer_fns) : platform_(std::move(platform)), destroy_platform_(destroy_platform), + platform_fns_(std::move(platform_fns)), + destroy_platform_fns_(destroy_platform_fns), + device_fns_(std::move(device_fns)), stream_executor_(std::move(stream_executor)), timer_fns_(std::move(timer_fns)), name_(platform.name) {} CPlatform::~CPlatform() { executor_cache_.DestroyAllExecutors(); - platform_.destroy_stream_executor(&stream_executor_); - platform_.destroy_timer_fns(&timer_fns_); + platform_fns_.destroy_device_fns(&platform_, &device_fns_); + platform_fns_.destroy_stream_executor(&platform_, &stream_executor_); + platform_fns_.destroy_timer_fns(&platform_, &timer_fns_); destroy_platform_(&platform_); + destroy_platform_fns_(&platform_fns_); } port::StatusOr> @@ -735,48 +845,59 @@ port::StatusOr> CPlatform::GetUncachedExecutor( OwnedTFStatus c_status(TF_NewStatus()); // Create Device - platform_.create_device(&device_params, c_status.get()); + platform_fns_.create_device(&platform_, &device_params, c_status.get()); TF_RETURN_IF_ERROR(StatusFromTF_Status(c_status.get())); TF_RETURN_IF_ERROR(ValidateSPDevice(device)); auto executor = absl::make_unique( - std::move(device), platform_.destroy_device, &stream_executor_, - &timer_fns_, name_, platform_.visible_device_count); + std::move(device), &device_fns_, &stream_executor_, &platform_, + &platform_fns_, &timer_fns_, name_, platform_.visible_device_count); auto result = absl::make_unique(this, std::move(executor), config.ordinal); return result; } -port::Status RegisterDevicePlugin(const std::string& dso_path) { - // Step 1: Load plugin +port::Status InitStreamExecutorPlugin(void* dso_handle) { tensorflow::Env* env = tensorflow::Env::Default(); - void* dso_handle; - TF_RETURN_IF_ERROR(env->LoadDynamicLibrary(dso_path.c_str(), &dso_handle)); - // Step 2: Load symbol for `TF_InitPlugin` + // Step 1: Load symbol for `TF_InitPlugin` void* dso_symbol; TF_RETURN_IF_ERROR( env->GetSymbolFromLibrary(dso_handle, "SE_InitPlugin", &dso_symbol)); - // Step 3: Call `TF_InitPlugin` - auto init_fn = reinterpret_cast(dso_symbol); - return RegisterDevicePlugin(init_fn); + // Step 2: Call `TF_InitPlugin` + auto init_fn = reinterpret_cast(dso_symbol); + return InitStreamExecutorPlugin(init_fn); } -port::Status RegisterDevicePlugin(SEPluginInitFn init_fn) { +port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn) { SE_PlatformRegistrationParams params{ SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE}; SP_Platform platform{SP_PLATFORM_STRUCT_SIZE}; + SP_PlatformFns platform_fns{SP_PLATFORM_FNS_STRUCT_SIZE}; params.major_version = SE_MAJOR; params.minor_version = SE_MINOR; - params.revision_version = SE_REVISION; + params.patch_version = SE_PATCH; params.platform = &platform; + params.platform_fns 
= &platform_fns; OwnedTFStatus c_status(TF_NewStatus()); init_fn(¶ms, c_status.get()); TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get())); TF_RETURN_IF_ERROR(ValidateSEPlatformRegistrationParams(params)); TF_RETURN_IF_ERROR(ValidateSPPlatform(platform)); + TF_RETURN_IF_ERROR(ValidateSPPlatformFns(platform_fns)); + + // Fill SP_DeviceFns creation params + SE_CreateDeviceFnsParams device_fns_params{ + SE_CREATE_DEVICE_FNS_PARAMS_STRUCT_SIZE}; + SP_DeviceFns device_fns{SP_DEVICE_FNS_STRUCT_SIZE}; + device_fns_params.device_fns = &device_fns; + + // Create StreamExecutor + platform_fns.create_device_fns(&platform, &device_fns_params, c_status.get()); + TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get())); + TF_RETURN_IF_ERROR(ValidateSPDeviceFns(device_fns)); // Fill stream executor creation params SE_CreateStreamExecutorParams se_params{ @@ -785,21 +906,26 @@ port::Status RegisterDevicePlugin(SEPluginInitFn init_fn) { se_params.stream_executor = &se; // Create StreamExecutor - platform.create_stream_executor(&se_params, c_status.get()); + platform_fns.create_stream_executor(&platform, &se_params, c_status.get()); TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get())); - TF_RETURN_IF_ERROR(ValidateSPStreamExecutor(se)); + TF_RETURN_IF_ERROR(ValidateSPStreamExecutor(se, platform)); SP_TimerFns timer_fns{SP_TIMER_FNS_STRUCT_SIZE}; - platform.create_timer_fns(&timer_fns, c_status.get()); + platform_fns.create_timer_fns(&platform, &timer_fns, c_status.get()); + TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get())); + TF_RETURN_IF_ERROR(ValidateSPTimerFns(timer_fns)); + + platform_fns.create_timer_fns(&platform, &timer_fns, c_status.get()); TF_RETURN_IF_ERROR(tensorflow::StatusFromTF_Status(c_status.get())); TF_RETURN_IF_ERROR(ValidateSPTimerFns(timer_fns)); // Register new platform std::string platform_name = std::string(platform.name); std::unique_ptr cplatform( - new stream_executor::CPlatform(std::move(platform), - params.destroy_platform, std::move(se), - std::move(timer_fns))); + new stream_executor::CPlatform( + std::move(platform), params.destroy_platform, std::move(platform_fns), + params.destroy_platform_fns, std::move(device_fns), std::move(se), + std::move(timer_fns))); SE_CHECK_OK(stream_executor::MultiPlatformManager::RegisterPlatform( std::move(cplatform))); diff --git a/tensorflow/c/experimental/stream_executor/stream_executor.h b/tensorflow/c/experimental/stream_executor/stream_executor.h index b3459a29ccc..bec77ef520b 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.h +++ b/tensorflow/c/experimental/stream_executor/stream_executor.h @@ -52,10 +52,11 @@ limitations under the License. // params.device = &device; // // /* Plugin code below */ -// constexpr char DEVICE_NAME[] = "MyDevice"; +// constexpr char DEVICE_NAME[] = "MY_DEVICE"; // constexpr char DEVICE_TYPE[] = "GPU"; // -// void create_device(SE_CreateDeviceParams* params, TF_Status* status) { +// void create_device(const SP_Platform* platform, +// SE_CreateDeviceParams* params, TF_Status* status) { // // Custom actions based on TensorFlow's view of SP_Device. // OnTFDeviceView(params->device->struct_size); // params->device = { SP_DEVICE_STRUCT_SIZE }; @@ -64,7 +65,7 @@ limitations under the License. // ... // } // -// void destroy_device(SP_Device* device) { +// void destroy_device(const SP_Platform* platform, SP_Device* device) { // delete_my_device_handle(device->device_handle); // } // @@ -76,14 +77,14 @@ limitations under the License. 
// params->platform->name = DEVICE_NAME; // params->platform->type = DEVICE_TYPE; // params->platform->visible_device_count = 2; -// params->platform->create_device = create_device; -// params->platform->destroy_device = destroy_device; +// params->platform_fns->create_device = create_device; +// params->platform_fns->destroy_device = destroy_device; // ... // } #define SE_MAJOR 0 #define SE_MINOR 0 -#define SE_REVISION 1 +#define SE_PATCH 1 #ifdef __cplusplus extern "C" { @@ -147,7 +148,7 @@ typedef struct SP_DeviceMemoryBase { } SP_DeviceMemoryBase; #define SP_DEVICE_MEMORY_BASE_STRUCT_SIZE \ - TF_OFFSET_OF_END(SP_DeviceMemoryBase, size) + TF_OFFSET_OF_END(SP_DeviceMemoryBase, payload) typedef struct SP_Device { size_t struct_size; @@ -157,9 +158,30 @@ typedef struct SP_Device { // Device vendor can store handle to their device representation // here. void* device_handle; + + // [Optional] + // Device hardware name. Used for printing. + // Must be null-terminated. + const char* hardware_name; + + // [Optional] + // Device vendor name. Used for printing. + // Must be null-terminated. + const char* device_vendor; + + // [Optional] + // Returns the PCI bus identifier for this device, of the form + // [domain]:[bus]:[device].[function] + // where domain number is usually 0000. + // Example: 0000:00:02.1 + // For more information see: + // https://en.wikipedia.org/wiki/PCI_configuration_space + // https://www.oreilly.com/library/view/linux-device-drivers/0596005903/ch12.html + // Used for printing. Must be null-terminated. + const char* pci_bus_id; } SP_Device; -#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, device_handle) +#define SP_DEVICE_STRUCT_SIZE TF_OFFSET_OF_END(SP_Device, pci_bus_id) typedef struct SE_CreateDeviceParams { size_t struct_size; @@ -173,6 +195,42 @@ typedef struct SE_CreateDeviceParams { #define SE_CREATE_DEVICE_PARAMS_STRUCT_SIZE \ TF_OFFSET_OF_END(SE_CreateDeviceParams, device) +typedef struct SP_DeviceFns { + size_t struct_size; + void* ext; // reserved for future use + + // [Optional] + // Returns the NUMA node associated with this device, for use in + // determining socket locality. If the NUMA node could not be determined, -1 + // is returned. + // Negative values are treated as "unset". + int32_t (*get_numa_node)(const SP_Device* device); + + // [Optional] + // Device's memory bandwidth in bytes/sec. (This is for reads/writes to/from + // the device's own memory, not for transfers between the host and device.) + // Negative values are treated as "unset". + int64_t (*get_memory_bandwidth)(const SP_Device* device); + + // [Optional] + // Estimate of average number of floating point operations per second for + // this device * 10e-9. + // Negative values are treated as "unset". + double (*get_gflops)(const SP_Device* device); +} SP_DeviceFns; + +#define SP_DEVICE_FNS_STRUCT_SIZE TF_OFFSET_OF_END(SP_DeviceFns, get_gflops) + +typedef struct SE_CreateDeviceFnsParams { + size_t struct_size; + void* ext; // reserved for future use + + SP_DeviceFns* device_fns; // output, to be filled by plugin +} SE_CreateDeviceFnsParams; + +#define SE_CREATE_DEVICE_FNS_PARAMS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SE_CreateDeviceFnsParams, device_fns) + typedef struct SP_StreamExecutor { size_t struct_size; void* ext; // reserved for future use @@ -198,6 +256,17 @@ typedef struct SP_StreamExecutor { // Deallocates a region of host memory allocated by `host_memory_allocate`. 
void (*host_memory_deallocate)(const SP_Device* device, void* mem); + // Allocates unified memory space of the given size, if supported. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void* (*unified_memory_allocate)(const SP_Device* device, uint64_t bytes); + + // Deallocates unified memory space previously allocated with + // `unified_memory_allocate`. Unified + // memory support should be added by setting `supports_unified_memory` field + // in `SP_Platform`. + void (*unified_memory_deallocate)(const SP_Device* device, void* location); + // Fills SP_AllocatorStats with allocator statistics, if it is available. // If it is not available, return false. TF_Bool (*get_allocator_stats)(const SP_Device* device, @@ -309,13 +378,23 @@ typedef struct SP_StreamExecutor { void (*block_host_for_event)(const SP_Device* device, SP_Event event, TF_Status* status); + // [Optional] + // Causes the host code to synchronously wait for operations entrained onto + // stream to complete. Effectively a join on the asynchronous device + // operations enqueued on the stream before this program point. + // If not set, then corresponding functionality will be implemented + // by registering an event on the `stream` and waiting for it using + // `block_host_for_event`. + void (*block_host_until_done)(const SP_Device* device, SP_Stream stream, + TF_Status* status); + // Synchronizes all activity occurring in the StreamExecutor's context (most // likely a whole device). void (*synchronize_all_activity)(const SP_Device* device, TF_Status* status); // Enqueues on a stream a user-specified function to be run on the host. // `callback_arg` should be passed as the first argument to `callback_fn`. - TF_Bool (*host_callback)(SP_Device* device, SP_Stream stream, + TF_Bool (*host_callback)(const SP_Device* device, SP_Stream stream, SE_StatusCallbackFn callback_fn, void* callback_arg); } SP_StreamExecutor; @@ -337,36 +416,70 @@ typedef struct SP_Platform { void* ext; // free-form data set by plugin - // Platform name. Must be null-terminated. + // Platform name (also referred to as subtype), for example MY_DEVICE. + // The name must start with a capital letter and consist of + // capital letters and underscores. + // Must be null-terminated. const char* name; // Device type name, for example GPU. Must be null-terminated. + // The name must start with a capital letter and consist of + // capital letters and underscores. const char* type; // Number of visible devices size_t visible_device_count; + // Whether this platform supports unified memory. + // Unified memory is a single memory address space accessible from any device. + TF_Bool supports_unified_memory; +} SP_Platform; + +#define SP_PLATFORM_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_Platform, supports_unified_memory) + +typedef struct SP_PlatformFns { + size_t struct_size; + + void* ext; // reserved for future use + // Callbacks for creating/destroying SP_Device. - void (*create_device)(SE_CreateDeviceParams* params, TF_Status* status); + void (*create_device)(const SP_Platform* platform, + SE_CreateDeviceParams* params, TF_Status* status); // Clean up fields inside SP_Device that were allocated // by the plugin. `device` itself should not be deleted here. - void (*destroy_device)(SP_Device* device); + void (*destroy_device)(const SP_Platform* platform, SP_Device* device); + + // Callbacks for creating/destroying SP_DeviceFns. 
+ void (*create_device_fns)(const SP_Platform* platform, + SE_CreateDeviceFnsParams* params, + TF_Status* status); + + // Clean up fields inside SP_DeviceFns that were allocated + // by the plugin. `device_fns` itself should not be deleted here. + void (*destroy_device_fns)(const SP_Platform* platform, + SP_DeviceFns* device_fns); // Callbacks for creating/destroying SP_StreamExecutor. - void (*create_stream_executor)(SE_CreateStreamExecutorParams* params, + void (*create_stream_executor)(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, TF_Status* status); // Clean up fields inside SP_StreamExecutor that were allocated // by the plugin. `stream_executor` itself should not be deleted here. - void (*destroy_stream_executor)(SP_StreamExecutor* stream_executor); + void (*destroy_stream_executor)(const SP_Platform* platform, + SP_StreamExecutor* stream_executor); // Callbacks for creating/destroying SP_TimerFns. - void (*create_timer_fns)(SP_TimerFns* timer, TF_Status* status); + void (*create_timer_fns)(const SP_Platform* platform, SP_TimerFns* timer, + TF_Status* status); - void (*destroy_timer_fns)(SP_TimerFns* timer_fns); -} SP_Platform; + void (*destroy_timer_fns)(const SP_Platform* platform, + SP_TimerFns* timer_fns); +} SP_PlatformFns; -#define SP_PLATFORM_STRUCT_SIZE TF_OFFSET_OF_END(SP_Platform, destroy_timer_fns) +#define SP_PLATFORM_FNS_STRUCT_SIZE \ + TF_OFFSET_OF_END(SP_PlatformFns, destroy_timer_fns) typedef struct SE_PlatformRegistrationParams { size_t struct_size; @@ -375,16 +488,19 @@ typedef struct SE_PlatformRegistrationParams { // StreamExecutor C API version. int32_t major_version; int32_t minor_version; - int32_t revision_version; + int32_t patch_version; - SP_Platform* platform; // output, set by plugin + SP_Platform* platform; // output, set by plugin + SP_PlatformFns* platform_fns; // output, set by plugin // Clean up fields inside SP_Platform that were allocated // by the plugin. `platform` itself should not be deleted here. void (*destroy_platform)(SP_Platform* platform); // out, set by plugin + void (*destroy_platform_fns)( + SP_PlatformFns* platform_fns); // out, set by plugin } SE_PlatformRegistrationParams; #define SE_PLATFORM_REGISTRATION_PARAMS_STRUCT_SIZE \ - TF_OFFSET_OF_END(SE_PlatformRegistrationParams, destroy_platform) + TF_OFFSET_OF_END(SE_PlatformRegistrationParams, destroy_platform_fns) void SE_InitPlugin(SE_PlatformRegistrationParams* params, TF_Status* status); diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h index 2285fe85867..52ae4ba77e0 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_internal.h +++ b/tensorflow/c/experimental/stream_executor/stream_executor_internal.h @@ -27,20 +27,24 @@ namespace stream_executor { // Plugin initialization function that a device plugin // must define. -typedef void (*SEPluginInitFn)(SE_PlatformRegistrationParams* const, +typedef void (*SEInitPluginFn)(SE_PlatformRegistrationParams* const, TF_Status* const); -// Loads dso and registers StreamExecutor-based pluggable device. -port::Status RegisterDevicePlugin(const std::string& dso_path); +// Registers StreamExecutor platform. +port::Status InitStreamExecutorPlugin(void* dso_handle); -// Allow registering a plugin using a function (used for testing). -port::Status RegisterDevicePlugin(SEPluginInitFn init_fn); +// Allow registering a StreamExecutor plugin using a function (used for +// testing). 
+port::Status InitStreamExecutorPlugin(SEInitPluginFn init_fn); class CPlatform : public Platform { public: explicit CPlatform(SP_Platform platform, void (*destroy_platform)(SP_Platform*), - SP_StreamExecutor stream_executor, SP_TimerFns timer_fns); + SP_PlatformFns platform_fns, + void (*destroy_platform_fns)(SP_PlatformFns*), + SP_DeviceFns device_fns, SP_StreamExecutor stream_executor, + SP_TimerFns timer_fns); ~CPlatform() override; Id id() const override { return const_cast(&plugin_id_value_); } @@ -69,6 +73,9 @@ class CPlatform : public Platform { private: SP_Platform platform_; void (*destroy_platform_)(SP_Platform*); + SP_PlatformFns platform_fns_; + void (*destroy_platform_fns_)(SP_PlatformFns*); + SP_DeviceFns device_fns_; SP_StreamExecutor stream_executor_; SP_TimerFns timer_fns_; const std::string name_; diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc index 86fe00fe5ad..56c4ea09052 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -41,15 +41,19 @@ struct SP_Timer_st { namespace stream_executor { namespace { -constexpr int DEVICE_COUNT = 2; -constexpr char DEVICE_NAME[] = "MyDevice"; -constexpr char DEVICE_TYPE[] = "GPU"; +constexpr int kDeviceCount = 2; +constexpr char kDeviceName[] = "MY_DEVICE"; +constexpr char kDeviceType[] = "GPU"; /*** Create SP_StreamExecutor (with empty functions) ***/ void allocate(const SP_Device* const device, uint64_t size, int64_t memory_space, SP_DeviceMemoryBase* const mem) {} void deallocate(const SP_Device* const device, SP_DeviceMemoryBase* const mem) { } +void* host_memory_allocate(const SP_Device* const device, uint64_t size) { + return nullptr; +} +void host_memory_deallocate(const SP_Device* const device, void* mem) {} TF_Bool get_allocator_stats(const SP_Device* const device, SP_AllocatorStats* const stats) { return true; @@ -104,16 +108,18 @@ void block_host_for_event(const SP_Device* const device, SP_Event event, TF_Status* const status) {} void synchronize_all_activity(const SP_Device* const device, TF_Status* const status) {} -TF_Bool host_callback(SP_Device* const device, SP_Stream stream, +TF_Bool host_callback(const SP_Device* const device, SP_Stream stream, SE_StatusCallbackFn const callback_fn, void* const callback_arg) { return true; } void PopulateDefaultStreamExecutor(SP_StreamExecutor* se) { - se->struct_size = SP_STREAMEXECUTOR_STRUCT_SIZE; + *se = {SP_STREAMEXECUTOR_STRUCT_SIZE}; se->allocate = allocate; se->deallocate = deallocate; + se->host_memory_allocate = host_memory_allocate; + se->host_memory_deallocate = host_memory_deallocate; se->get_allocator_stats = get_allocator_stats; se->device_memory_usage = device_memory_usage; se->create_stream = create_stream; @@ -138,6 +144,10 @@ void PopulateDefaultStreamExecutor(SP_StreamExecutor* se) { se->host_callback = host_callback; } +void PopulateDefaultDeviceFns(SP_DeviceFns* device_fns) { + *device_fns = {SP_DEVICE_FNS_STRUCT_SIZE}; +} + /*** Create SP_TimerFns ***/ uint64_t nanoseconds(SP_Timer timer) { return timer->timer_id; } @@ -146,91 +156,158 @@ void PopulateDefaultTimerFns(SP_TimerFns* timer_fns) { } /*** Create SP_Platform ***/ -void create_timer_fns(SP_TimerFns* timer_fns, TF_Status* status) { +void create_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns, + TF_Status* status) { TF_SetStatus(status, TF_OK, ""); PopulateDefaultTimerFns(timer_fns); } -void 
destroy_timer_fns(SP_TimerFns* timer_fns) {} +void destroy_timer_fns(const SP_Platform* platform, SP_TimerFns* timer_fns) {} -void create_stream_executor(SE_CreateStreamExecutorParams* params, +void create_stream_executor(const SP_Platform* platform, + SE_CreateStreamExecutorParams* params, TF_Status* status) { TF_SetStatus(status, TF_OK, ""); PopulateDefaultStreamExecutor(params->stream_executor); } -void destroy_stream_executor(SP_StreamExecutor* se) {} +void destroy_stream_executor(const SP_Platform* platform, + SP_StreamExecutor* se) {} -void create_device(SE_CreateDeviceParams* params, TF_Status* status) { +void create_device(const SP_Platform* platform, SE_CreateDeviceParams* params, + TF_Status* status) { TF_SetStatus(status, TF_OK, ""); - params->device->struct_size = SP_DEVICE_STRUCT_SIZE; + params->device->struct_size = {SP_DEVICE_STRUCT_SIZE}; } -void destroy_device(SP_Device* device) {} +void destroy_device(const SP_Platform* platform, SP_Device* device) {} -void PopulateDefaultPlatform(SP_Platform* platform) { - platform->struct_size = SP_PLATFORM_STRUCT_SIZE; - platform->name = DEVICE_NAME; - platform->type = DEVICE_TYPE; - platform->visible_device_count = DEVICE_COUNT; - platform->create_device = create_device; - platform->destroy_device = destroy_device; - platform->create_stream_executor = create_stream_executor; - platform->destroy_stream_executor = destroy_stream_executor; - platform->create_timer_fns = create_timer_fns; - platform->destroy_timer_fns = destroy_timer_fns; +void create_device_fns(const SP_Platform* platform, + SE_CreateDeviceFnsParams* params, TF_Status* status) { + TF_SetStatus(status, TF_OK, ""); + params->device_fns->struct_size = {SP_DEVICE_FNS_STRUCT_SIZE}; +} +void destroy_device_fns(const SP_Platform* platform, SP_DeviceFns* device_fns) { +} + +void PopulateDefaultPlatform(SP_Platform* platform, + SP_PlatformFns* platform_fns) { + *platform = {SP_PLATFORM_STRUCT_SIZE}; + platform->name = kDeviceName; + platform->type = kDeviceType; + platform->visible_device_count = kDeviceCount; + platform_fns->create_device = create_device; + platform_fns->destroy_device = destroy_device; + platform_fns->create_device_fns = create_device_fns; + platform_fns->destroy_device_fns = destroy_device_fns; + platform_fns->create_stream_executor = create_stream_executor; + platform_fns->destroy_stream_executor = destroy_stream_executor; + platform_fns->create_timer_fns = create_timer_fns; + platform_fns->destroy_timer_fns = destroy_timer_fns; } void destroy_platform(SP_Platform* const platform) {} +void destroy_platform_fns(SP_PlatformFns* const platform_fns) {} /*** Registration tests ***/ TEST(StreamExecutor, SuccessfulRegistration) { auto plugin_init = [](SE_PlatformRegistrationParams* const params, TF_Status* const status) -> void { TF_SetStatus(status, TF_OK, ""); - PopulateDefaultPlatform(params->platform); + PopulateDefaultPlatform(params->platform, params->platform_fns); params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; }; - port::Status status = RegisterDevicePlugin(plugin_init); + port::Status status = InitStreamExecutorPlugin(plugin_init); TF_ASSERT_OK(status); port::StatusOr maybe_platform = - MultiPlatformManager::PlatformWithName("MyDevice"); + MultiPlatformManager::PlatformWithName("MY_DEVICE"); TF_ASSERT_OK(maybe_platform.status()); Platform* platform = maybe_platform.ConsumeValueOrDie(); - ASSERT_EQ(platform->Name(), DEVICE_NAME); - ASSERT_EQ(platform->VisibleDeviceCount(), DEVICE_COUNT); + 
ASSERT_EQ(platform->Name(), kDeviceName); + ASSERT_EQ(platform->VisibleDeviceCount(), kDeviceCount); port::StatusOr maybe_executor = platform->ExecutorForDevice(0); TF_ASSERT_OK(maybe_executor.status()); - StreamExecutor* executor = maybe_executor.ConsumeValueOrDie(); - ASSERT_EQ(executor->GetDeviceDescription().name(), "MyDevice"); } TEST(StreamExecutor, NameNotSet) { auto plugin_init = [](SE_PlatformRegistrationParams* const params, TF_Status* const status) -> void { TF_SetStatus(status, TF_OK, ""); - PopulateDefaultPlatform(params->platform); + PopulateDefaultPlatform(params->platform, params->platform_fns); params->platform->name = nullptr; params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; }; - port::Status status = RegisterDevicePlugin(plugin_init); + port::Status status = InitStreamExecutorPlugin(plugin_init); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ(status.error_message(), "'name' field in SP_Platform must be set."); } +TEST(StreamExecutor, InvalidNameWithSemicolon) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->name = "INVALID:NAME"; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + EXPECT_THAT( + status.error_message(), + testing::ContainsRegex("Device name/type 'INVALID:NAME' must match")); +} + +TEST(StreamExecutor, InvalidNameWithSlash) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->name = "INVALID/"; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + EXPECT_THAT(status.error_message(), + testing::ContainsRegex("Device name/type 'INVALID/' must match")); +} + TEST(StreamExecutor, CreateDeviceNotSet) { auto plugin_init = [](SE_PlatformRegistrationParams* const params, TF_Status* const status) -> void { TF_SetStatus(status, TF_OK, ""); - PopulateDefaultPlatform(params->platform); - params->platform->create_device = nullptr; + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform_fns->create_device = nullptr; params->destroy_platform = destroy_platform; + params->destroy_platform_fns = destroy_platform_fns; }; - port::Status status = RegisterDevicePlugin(plugin_init); + port::Status status = InitStreamExecutorPlugin(plugin_init); ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); ASSERT_EQ(status.error_message(), - "'create_device' field in SP_Platform must be set."); + "'create_device' field in SP_PlatformFns must be set."); +} + +TEST(StreamExecutor, UnifiedMemoryAllocateNotSet) { + auto plugin_init = [](SE_PlatformRegistrationParams* const params, + TF_Status* const status) -> void { + TF_SetStatus(status, TF_OK, ""); + PopulateDefaultPlatform(params->platform, params->platform_fns); + params->platform->supports_unified_memory = true; + params->destroy_platform = destroy_platform; + params->destroy_platform_fns = 
destroy_platform_fns; + }; + + port::Status status = InitStreamExecutorPlugin(plugin_init); + ASSERT_EQ(status.code(), tensorflow::error::FAILED_PRECONDITION); + ASSERT_EQ( + status.error_message(), + "'unified_memory_allocate' field in SP_StreamExecutor must be set."); } /*** StreamExecutor behavior tests ***/ @@ -238,7 +315,8 @@ class StreamExecutorTest : public ::testing::Test { protected: StreamExecutorTest() {} void SetUp() override { - PopulateDefaultPlatform(&platform_); + PopulateDefaultPlatform(&platform_, &platform_fns_); + PopulateDefaultDeviceFns(&device_fns_); PopulateDefaultStreamExecutor(&se_); PopulateDefaultTimerFns(&timer_fns_); } @@ -246,8 +324,9 @@ class StreamExecutorTest : public ::testing::Test { StreamExecutor* GetExecutor(int ordinal) { if (!cplatform_) { - cplatform_ = absl::make_unique(platform_, destroy_platform, - se_, timer_fns_); + cplatform_ = absl::make_unique( + platform_, destroy_platform, platform_fns_, destroy_platform_fns, + device_fns_, se_, timer_fns_); } port::StatusOr maybe_executor = cplatform_->ExecutorForDevice(ordinal); @@ -255,6 +334,8 @@ class StreamExecutorTest : public ::testing::Test { return maybe_executor.ConsumeValueOrDie(); } SP_Platform platform_; + SP_PlatformFns platform_fns_; + SP_DeviceFns device_fns_; SP_StreamExecutor se_; SP_TimerFns timer_fns_; std::unique_ptr cplatform_; @@ -264,13 +345,13 @@ TEST_F(StreamExecutorTest, Allocate) { se_.allocate = [](const SP_Device* const device, uint64_t size, int64_t memory_space, SP_DeviceMemoryBase* const mem) { mem->struct_size = SP_DEVICE_MEMORY_BASE_STRUCT_SIZE; - mem->opaque = std::malloc(size); + mem->opaque = malloc(size); mem->size = size; }; se_.deallocate = [](const SP_Device* const device, SP_DeviceMemoryBase* const mem) { EXPECT_EQ(mem->size, 2 * sizeof(int)); - std::free(mem->opaque); + free(mem->opaque); mem->opaque = nullptr; mem->size = 0; }; @@ -287,10 +368,10 @@ TEST_F(StreamExecutorTest, HostMemoryAllocate) { static bool deallocate_called = false; se_.host_memory_allocate = [](const SP_Device* const device, uint64_t size) { allocate_called = true; - return std::malloc(size); + return malloc(size); }; se_.host_memory_deallocate = [](const SP_Device* const device, void* mem) { - std::free(mem); + free(mem); deallocate_called = true; }; StreamExecutor* executor = GetExecutor(0); @@ -303,6 +384,28 @@ TEST_F(StreamExecutorTest, HostMemoryAllocate) { ASSERT_TRUE(deallocate_called); } +TEST_F(StreamExecutorTest, UnifiedMemoryAllocate) { + static bool allocate_called = false; + static bool deallocate_called = false; + se_.unified_memory_allocate = [](const SP_Device* const device, + uint64_t size) { + allocate_called = true; + return malloc(size); + }; + se_.unified_memory_deallocate = [](const SP_Device* const device, void* mem) { + free(mem); + deallocate_called = true; + }; + StreamExecutor* executor = GetExecutor(0); + ASSERT_FALSE(allocate_called); + void* mem = executor->UnifiedMemoryAllocate(8); + ASSERT_NE(mem, nullptr); + ASSERT_TRUE(allocate_called); + ASSERT_FALSE(deallocate_called); + executor->UnifiedMemoryDeallocate(mem); + ASSERT_TRUE(deallocate_called); +} + TEST_F(StreamExecutorTest, GetAllocatorStats) { se_.get_allocator_stats = [](const SP_Device* const device, SP_AllocatorStats* const stat) -> TF_Bool { @@ -745,6 +848,31 @@ TEST_F(StreamExecutorTest, BlockHostForEvent) { ASSERT_TRUE(block_host_for_event_called); } +TEST_F(StreamExecutorTest, BlockHostUntilDone) { + static bool block_host_until_done_called = false; + se_.create_stream = [](const SP_Device* 
const device, SP_Stream* stream, + TF_Status* const status) { + *stream = new SP_Stream_st(58); + }; + se_.destroy_stream = [](const SP_Device* const device, SP_Stream stream) { + delete stream; + }; + se_.block_host_until_done = [](const SP_Device* const device, + SP_Stream stream, + TF_Status* const status) -> void { + ASSERT_EQ(stream->stream_id, 58); + TF_SetStatus(status, TF_OK, ""); + block_host_until_done_called = true; + }; + + StreamExecutor* executor = GetExecutor(0); + Stream stream(executor); + stream.Init(); + ASSERT_FALSE(block_host_until_done_called); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + ASSERT_TRUE(block_host_until_done_called); +} + TEST_F(StreamExecutorTest, SynchronizeAllActivity) { static bool synchronize_all_called = false; se_.synchronize_all_activity = [](const SP_Device* const device, @@ -760,7 +888,7 @@ TEST_F(StreamExecutorTest, SynchronizeAllActivity) { } TEST_F(StreamExecutorTest, HostCallbackOk) { - se_.host_callback = [](SP_Device* const device, SP_Stream stream, + se_.host_callback = [](const SP_Device* const device, SP_Stream stream, SE_StatusCallbackFn const callback_fn, void* const callback_arg) -> TF_Bool { TF_Status* status = TF_NewStatus(); @@ -780,7 +908,7 @@ TEST_F(StreamExecutorTest, HostCallbackOk) { } TEST_F(StreamExecutorTest, HostCallbackError) { - se_.host_callback = [](SP_Device* const device, SP_Stream stream, + se_.host_callback = [](const SP_Device* const device, SP_Stream stream, SE_StatusCallbackFn const callback_fn, void* const callback_arg) -> TF_Bool { TF_Status* status = TF_NewStatus(); @@ -798,5 +926,59 @@ TEST_F(StreamExecutorTest, HostCallbackError) { stream.ThenDoHostCallbackWithStatus(callback); ASSERT_FALSE(stream.ok()); } + +TEST_F(StreamExecutorTest, DeviceDescription) { + static const char* hardware_name = "TestName"; + static const char* vendor = "TestVendor"; + static const char* pci_bus_id = "TestPCIBusId"; + platform_fns_.create_device = [](const SP_Platform* platform, + SE_CreateDeviceParams* params, + TF_Status* status) { + params->device->hardware_name = hardware_name; + params->device->device_vendor = vendor; + params->device->pci_bus_id = pci_bus_id; + }; + + device_fns_.get_numa_node = [](const SP_Device* device) { return 123; }; + device_fns_.get_memory_bandwidth = [](const SP_Device* device) -> int64_t { + return 54; + }; + device_fns_.get_gflops = [](const SP_Device* device) -> double { return 32; }; + + StreamExecutor* executor = GetExecutor(0); + const DeviceDescription& description = executor->GetDeviceDescription(); + ASSERT_EQ(description.name(), "TestName"); + ASSERT_EQ(description.device_vendor(), "TestVendor"); + ASSERT_EQ(description.pci_bus_id(), "TestPCIBusId"); + ASSERT_EQ(description.numa_node(), 123); + ASSERT_EQ(description.memory_bandwidth(), 54); +} + +TEST_F(StreamExecutorTest, DeviceDescriptionNumaNodeNotSet) { + static const char* hardware_name = "TestName"; + static const char* vendor = "TestVendor"; + static const char* pci_bus_id = "TestPCIBusId"; + platform_fns_.create_device = [](const SP_Platform* platform, + SE_CreateDeviceParams* params, + TF_Status* status) { + params->device->hardware_name = hardware_name; + params->device->device_vendor = vendor; + params->device->pci_bus_id = pci_bus_id; + }; + + device_fns_.get_memory_bandwidth = [](const SP_Device* device) -> int64_t { + return 54; + }; + device_fns_.get_gflops = [](const SP_Device* device) -> double { return 32; }; + + StreamExecutor* executor = GetExecutor(0); + const DeviceDescription& description = 
executor->GetDeviceDescription(); + ASSERT_EQ(description.name(), "TestName"); + ASSERT_EQ(description.device_vendor(), "TestVendor"); + ASSERT_EQ(description.pci_bus_id(), "TestPCIBusId"); + ASSERT_EQ(description.numa_node(), -1); + ASSERT_EQ(description.memory_bandwidth(), 54); +} + } // namespace } // namespace stream_executor diff --git a/tensorflow/c/kernels/BUILD b/tensorflow/c/kernels/BUILD index 93b82b2396f..d89eda3eb4e 100644 --- a/tensorflow/c/kernels/BUILD +++ b/tensorflow/c/kernels/BUILD @@ -1,9 +1,13 @@ -load( - "//tensorflow:tensorflow.bzl", - "tf_cc_test", - "tf_gen_op_libs", - "tf_kernel_library", -) +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_kernel_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( default_visibility = ["//visibility:public"], @@ -132,6 +136,23 @@ tf_cc_test( ], ) +tf_cc_test( + name = "summary_op_benchmark_test", + size = "small", + srcs = ["summary_op_benchmark_test.cc"], + deps = [ + ":summary_op", + "//tensorflow/c:kernels", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "tensor_shape_utils", srcs = ["tensor_shape_utils.cc"], diff --git a/tensorflow/c/kernels/histogram_summary_op.cc b/tensorflow/c/kernels/histogram_summary_op.cc index 5de52703f5d..143a2675a05 100644 --- a/tensorflow/c/kernels/histogram_summary_op.cc +++ b/tensorflow/c/kernels/histogram_summary_op.cc @@ -93,11 +93,13 @@ void HistogramSummaryOp_Compute(void* kernel, TF_OpKernelContext* ctx) { std::ostringstream err; err << "Nan in summary histogram for: " << k->op_node_name; TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, err.str().c_str()); + TF_OpKernelContext_Failure(ctx, status.get()); return; } else if (Eigen::numext::isinf(double_val)) { std::ostringstream err; err << "Infinity in Histogram for: " << k->op_node_name; TF_SetStatus(status.get(), TF_INVALID_ARGUMENT, err.str().c_str()); + TF_OpKernelContext_Failure(ctx, status.get()); return; } histo.Add(double_val); diff --git a/tensorflow/c/kernels/summary_op_benchmark_test.cc b/tensorflow/c/kernels/summary_op_benchmark_test.cc new file mode 100644 index 00000000000..887a86066d3 --- /dev/null +++ b/tensorflow/c/kernels/summary_op_benchmark_test.cc @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
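A note on the histogram_summary_op.cc hunk above: setting a locally owned TF_Status is not by itself enough to fail the op, so the patch pairs TF_SetStatus with TF_OpKernelContext_Failure before returning. The sketch below restates that pattern in isolation; the ReportInvalidArgument helper and its op_node_name parameter are illustrative only, not code from the patch.

#include <sstream>
#include <string>

#include "tensorflow/c/kernels.h"
#include "tensorflow/c/tf_status.h"

// Hypothetical helper showing the error path used by the C-API summary
// kernels: build a TF_Status, record the failure on the kernel context,
// then return.
static void ReportInvalidArgument(TF_OpKernelContext* ctx,
                                  const std::string& op_node_name,
                                  const char* what) {
  TF_Status* status = TF_NewStatus();
  std::ostringstream err;
  err << what << " in summary histogram for: " << op_node_name;
  TF_SetStatus(status, TF_INVALID_ARGUMENT, err.str().c_str());
  // This is the call the patch adds: without it the error only lives in the
  // local TF_Status and the op would appear to succeed.
  TF_OpKernelContext_Failure(ctx, status);
  TF_DeleteStatus(status);
}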
+==============================================================================*/ + +#include + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +Graph* BM_ScalarSummaryOp(TensorShape shape, std::string tag, float value) { + Graph* g = new Graph(OpRegistry::Global()); + Tensor tags(DT_STRING, shape); + Tensor values(DT_FLOAT, shape); + for (int i = 0; i < tags.NumElements(); ++i) { + tags.flat<tstring>()(i) = tag; + values.flat<float>()(i) = value; + } + Node* ret; + TF_CHECK_OK(NodeBuilder(g->NewName("dummy"), "ScalarSummary") + .Input(test::graph::Constant(g, tags)) + .Input(test::graph::Constant(g, values)) + .Attr("T", DT_FLOAT) + .Finalize(g, &ret)); + return g; +} + +// Macro used to parse initializer list for tensorshape +#define DIMARGS(...) \ + { __VA_ARGS__ } +// Random parameters for testing +constexpr char longTagParam[] = "LONGTAG____________________________"; +constexpr float largeValueParam = 2352352.2623433; + +#define BM_ScalarSummaryDev(device, dims, name, tag, value) \ + void BM_ScalarSummary##name##device(int iters) { \ + testing::StopTiming(); \ + TensorShape tensorshape(DIMARGS dims); \ + auto g = BM_ScalarSummaryOp(tensorshape, #tag, value); \ + testing::StartTiming(); \ + test::Benchmark("cpu", g).Run(iters); \ + } \ + BENCHMARK(BM_ScalarSummary##name##device); + +BM_ScalarSummaryDev(Cpu, (5, 10, 100), Base, Tag, 5.2); +// Benchmark for large shapes +BM_ScalarSummaryDev(Cpu, (500, 100, 100), LargeShape, Tag, 5.2); +// Benchmark for large tag tstring +BM_ScalarSummaryDev(Cpu, (5, 10, 100), LongTag, longTagParam, 5.2); +// Benchmark for large values +BM_ScalarSummaryDev(Cpu, (500, 100, 100), LargeValue, Tag, largeValueParam); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc index 3bdaa866ee6..7ec1f4cc951 100644 --- a/tensorflow/c/python_api.cc +++ b/tensorflow/c/python_api.cc @@ -57,43 +57,7 @@ void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device) { void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst, TF_Status* status) { - mutex_lock l(graph->mu); - tensorflow::shape_inference::InferenceContext* ic = - graph->refiner.GetContext(&new_src.oper->node); - - if (ic->num_outputs() <= new_src.index) { - status->status = tensorflow::errors::OutOfRange( - "Cannot update edge. Output index [", new_src.index, - "] is greater than the number of total outputs [", ic->num_outputs(), - "]."); - return; - } - tensorflow::shape_inference::ShapeHandle shape = ic->output(new_src.index); - - tensorflow::shape_inference::InferenceContext* ic_dst = - graph->refiner.GetContext(&dst.oper->node); - if (ic_dst->num_inputs() <= dst.index) { - status->status = tensorflow::errors::OutOfRange( - "Cannot update edge.
Input index [", dst.index, - "] is greater than the number of total inputs [", ic_dst->num_inputs(), - "]."); - return; - } - if (!ic_dst->MergeInput(dst.index, shape)) { - status->status = tensorflow::errors::InvalidArgument( - "Cannot update edge, incompatible shapes: ", ic_dst->DebugString(shape), - " and ", ic_dst->DebugString(ic_dst->input(dst.index)), "."); - return; - } - status->status = graph->graph.UpdateEdge(&new_src.oper->node, new_src.index, - &dst.oper->node, dst.index); - - if (TF_GetCode(status) == TF_OK) { - // This modification only updates the destination node for - // the purposes of running this graph in a session. Thus, we don't - // record the source node as being modified. - RecordMutation(graph, *dst.oper, "updating input tensor"); - } + TF_UpdateEdge(graph, new_src, dst, status); } void RemoveAllControlInputs(TF_Graph* graph, TF_Operation* op) { @@ -136,6 +100,7 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output) { auto* out_shape_and_type = handle_data.add_shape_and_type(); ic->ShapeHandleToProto(p.shape, out_shape_and_type->mutable_shape()); out_shape_and_type->set_dtype(p.dtype); + out_shape_and_type->set_specialized_type(p.specialized_type); } } string result; @@ -163,7 +128,8 @@ void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto, status->status = ic->MakeShapeFromShapeProto(shape_and_type_proto.shape(), &shape); if (TF_GetCode(status) != TF_OK) return; - shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype()); + shapes_and_types.emplace_back(shape, shape_and_type_proto.dtype(), + shape_and_type_proto.specialized_type()); } ic->set_output_handle_shapes_and_types(output.index, shapes_and_types); } diff --git a/tensorflow/c/tf_shape.cc b/tensorflow/c/tf_shape.cc new file mode 100644 index 00000000000..a715544a13f --- /dev/null +++ b/tensorflow/c/tf_shape.cc @@ -0,0 +1,39 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/tf_shape.h" + +#include + +#include "tensorflow/c/tf_shape_internal.h" +#include "tensorflow/core/framework/tensor_shape.h" + +extern "C" { + +TF_Shape* TF_NewShape() { + return tensorflow::wrap(new tensorflow::PartialTensorShape()); +} + +int TF_ShapeDims(const TF_Shape* shape) { + return tensorflow::unwrap(shape)->dims(); +} + +int64_t TF_ShapeDimSize(const TF_Shape* shape, int d) { + return tensorflow::unwrap(shape)->dim_size(d); +} + +void TF_DeleteShape(TF_Shape* shape) { delete tensorflow::unwrap(shape); } + +} // end extern "C" diff --git a/tensorflow/c/tf_shape.h b/tensorflow/c/tf_shape.h new file mode 100644 index 00000000000..f218d05e274 --- /dev/null +++ b/tensorflow/c/tf_shape.h @@ -0,0 +1,50 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
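The new TF_Shape C API implemented above in tf_shape.cc (and declared in tf_shape.h below) is small enough to show end to end. This is a hedged usage sketch rather than code from the patch; it relies only on the four functions defined above.

#include <cstdio>

#include "tensorflow/c/tf_shape.h"

int main() {
  // TF_NewShape returns an unknown-rank shape, so TF_ShapeDims reports -1.
  TF_Shape* shape = TF_NewShape();
  int rank = TF_ShapeDims(shape);
  if (rank < 0) {
    std::printf("shape has unknown rank\n");
  } else {
    for (int d = 0; d < rank; ++d) {
      // A dimension size of -1 marks an unknown dimension.
      std::printf("dim %d = %lld\n", d,
                  static_cast<long long>(TF_ShapeDimSize(shape, d)));
    }
  }
  TF_DeleteShape(shape);  // The caller owns the shape and must delete it.
  return 0;
}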
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/c/c_api_macros.h" + +#ifndef TENSORFLOW_C_TF_SHAPE_H_ +#define TENSORFLOW_C_TF_SHAPE_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +// An opaque type corresponding to a shape in tensorflow. In the future, +// we may expose the ABI of TF_Shape for performance reasons. +typedef struct TF_Shape TF_Shape; + +// Return a new, unknown rank shape object. The caller is responsible for +// calling TF_DeleteShape to deallocate and destroy the returned shape. +TF_CAPI_EXPORT extern TF_Shape* TF_NewShape(); + +// Returns the rank of `shape`. If `shape` has unknown rank, returns -1. +TF_CAPI_EXPORT extern int TF_ShapeDims(const TF_Shape* shape); + +// Returns the `d`th dimension of `shape`. If `shape` has unknown rank, +// invoking this function is undefined behavior. Returns -1 if dimension is +// unknown. +TF_CAPI_EXPORT extern int64_t TF_ShapeDimSize(const TF_Shape* shape, int d); + +// Deletes `shape`. +TF_CAPI_EXPORT extern void TF_DeleteShape(TF_Shape* shape); + +#ifdef __cplusplus +} /* end extern "C" */ +#endif + +#endif // TENSORFLOW_C_TF_SHAPE_H_ diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc b/tensorflow/c/tf_shape_internal.h similarity index 62% rename from tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc rename to tensorflow/c/tf_shape_internal.h index 9d1c354690a..fe97726460f 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/dialect_registration.cc +++ b/tensorflow/c/tf_shape_internal.h @@ -13,11 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#ifndef TENSORFLOW_C_TF_SHAPE_INTERNAL_H_ +#define TENSORFLOW_C_TF_SHAPE_INTERNAL_H_ -// Static initialization for *HLO dialects registration. -static mlir::DialectRegistration mhlo_ops; -static mlir::DialectRegistration chlo_ops; -static mlir::DialectRegistration lmhlo_ops; +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/core/framework/tensor_shape.h" + +typedef struct TF_Shape TF_Shape; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS(tensorflow::PartialTensorShape, TF_Shape); + +} + +#endif // TENSORFLOW_C_TF_SHAPE_INTERNAL_H_ diff --git a/tensorflow/c/tf_status_helper.h b/tensorflow/c/tf_status_helper.h index ff8085f1229..a895e608159 100644 --- a/tensorflow/c/tf_status_helper.h +++ b/tensorflow/c/tf_status_helper.h @@ -28,6 +28,14 @@ void Set_TF_Status_from_Status(TF_Status* tf_status, // Returns a "status" from "tf_status". 
tensorflow::Status StatusFromTF_Status(const TF_Status* tf_status); +namespace internal { +struct TF_StatusDeleter { + void operator()(TF_Status* tf_status) const { TF_DeleteStatus(tf_status); } +}; +} // namespace internal + +using TF_StatusPtr = std::unique_ptr; + } // namespace tensorflow #endif // TENSORFLOW_C_TF_STATUS_HELPER_H_ diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index 8602bfafff8..8f7e447d322 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -2,16 +2,22 @@ # TensorFlow is a computational framework, primarily for use in machine # learning applications. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "cc_library_with_android_deps", "tf_cc_binary", "tf_cc_test", "tf_copts", - "tf_gen_op_wrappers_cc", "transitive_hdrs", ) +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrappers_cc") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -245,7 +251,6 @@ cc_library_with_android_deps( deps = [ "//tensorflow/core:core_cpu", "//tensorflow/core:lib", - "//tensorflow/core:lib_experimental", "//tensorflow/core:protos_all_cc", ], ) @@ -260,7 +265,6 @@ tf_cc_test( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core:lib_experimental", "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", diff --git a/tensorflow/cc/client/client_session.h b/tensorflow/cc/client/client_session.h index 3765eaec9bf..a661319b074 100644 --- a/tensorflow/cc/client/client_session.h +++ b/tensorflow/cc/client/client_session.h @@ -64,7 +64,7 @@ class ClientSession { ClientSession(const Scope& scope, const string& target); /// Same as above, but use the empty string ("") as the target specification. - ClientSession(const Scope& scope); + explicit ClientSession(const Scope& scope); /// Create a new session, configuring it with `session_options`. ClientSession(const Scope& scope, const SessionOptions& session_options); diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD index 045d4e6cd97..0aaf2238e6a 100644 --- a/tensorflow/cc/experimental/base/public/BUILD +++ b/tensorflow/cc/experimental/base/public/BUILD @@ -8,6 +8,8 @@ # 2. Are std:: types # 3. Wrap an opaque C type +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( # This is intentionally public default_visibility = [ diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD index f449d618f72..f7f6e77c98f 100644 --- a/tensorflow/cc/experimental/base/tests/BUILD +++ b/tensorflow/cc/experimental/base/tests/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + # Tests for the C++ header-only base types. load("//tensorflow:tensorflow.bzl", "tf_cc_test") diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index e9173227aad..480243a29e6 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -15,13 +15,12 @@ limitations under the License. 
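The tf_status_helper.h addition above introduces TF_StatusPtr, a std::unique_ptr alias whose deleter calls TF_DeleteStatus. A minimal sketch of how a call site can use it instead of manual cleanup; the StatusIsOk helper is illustrative, not part of the patch.

#include "tensorflow/c/tf_status.h"
#include "tensorflow/c/tf_status_helper.h"

// Illustrative only: the alias frees the TF_Status automatically when it
// goes out of scope, including on early returns.
bool StatusIsOk() {
  tensorflow::TF_StatusPtr status(TF_NewStatus());
  // ... pass status.get() to any C API call that reports errors ...
  return TF_GetCode(status.get()) == TF_OK;
}  // No explicit TF_DeleteStatus: TF_StatusDeleter runs here.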
#include +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/ops/array_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/cc/framework/grad_op_registry.h" -#include "tensorflow/cc/framework/gradients.h" - namespace tensorflow { namespace ops { namespace { @@ -90,15 +89,25 @@ Status QuantizeAndDequantizeGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("QuantizeAndDequantize", QuantizeAndDequantizeGrad); -Status QuantizeAndDequantizeV2Grad(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - grad_outputs->push_back(Identity(scope, grad_inputs[0])); - grad_outputs->push_back(NoGradient()); - grad_outputs->push_back(NoGradient()); +Status QuantizeAndDequantizeV4GradHelper(const Scope& scope, + const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + Input input = Shape(scope, op.input(0)); + Input input_min = op.input(1); + Input input_max = op.input(2); + int64 axis; + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), "axis", &axis)); + auto qdq_v4_grad = QuantizeAndDequantizeV4Grad( + scope, grad_inputs[0], input, input_min, input_max, + QuantizeAndDequantizeV4Grad::Axis(axis)); + grad_outputs->push_back(qdq_v4_grad.input_backprop); + grad_outputs->push_back(qdq_v4_grad.input_min_backprop); + grad_outputs->push_back(qdq_v4_grad.input_max_backprop); return scope.status(); } -REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", QuantizeAndDequantizeV2Grad); +REGISTER_GRADIENT_OP("QuantizeAndDequantizeV4", + QuantizeAndDequantizeV4GradHelper); Status QuantizeAndDequantizeV3Grad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, diff --git a/tensorflow/cc/gradients/grad_testutil.h b/tensorflow/cc/gradients/grad_testutil.h index 70c81f1a73a..43d533ad760 100644 --- a/tensorflow/cc/gradients/grad_testutil.h +++ b/tensorflow/cc/gradients/grad_testutil.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_ #define TENSORFLOW_CC_GRADIENTS_GRAD_TESTUTIL_H_ +#include + #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" diff --git a/tensorflow/cc/ops/const_op.h b/tensorflow/cc/ops/const_op.h index 424a683665f..9c888701b45 100644 --- a/tensorflow/cc/ops/const_op.h +++ b/tensorflow/cc/ops/const_op.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_CC_OPS_CONST_OP_H_ #define TENSORFLOW_CC_OPS_CONST_OP_H_ +#include + #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/core/graph/node_builder.h" diff --git a/tensorflow/cc/ops/while_loop.h b/tensorflow/cc/ops/while_loop.h index 727237b5c7a..6dbf1d23dba 100644 --- a/tensorflow/cc/ops/while_loop.h +++ b/tensorflow/cc/ops/while_loop.h @@ -16,6 +16,9 @@ limitations under the License. 
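The array_grad.cc hunk above replaces the QuantizeAndDequantizeV2 gradient with a V4 helper that reads the "axis" attribute and delegates to the dedicated gradient op. For reference, the registration pattern it follows looks roughly like the sketch below; "MyPassthroughOp" and MyPassthroughOpGrad are made-up names for illustration, not ops touched by the patch.

#include "tensorflow/cc/framework/grad_op_registry.h"
#include "tensorflow/cc/framework/gradients.h"
#include "tensorflow/cc/ops/standard_ops.h"

namespace tensorflow {
namespace ops {
namespace {

// Sketch of a gradient function: forward the incoming gradient for the first
// input and declare no gradient for a second, non-differentiable input.
Status MyPassthroughOpGrad(const Scope& scope, const Operation& op,
                           const std::vector<Output>& grad_inputs,
                           std::vector<Output>* grad_outputs) {
  grad_outputs->push_back(Identity(scope, grad_inputs[0]));
  grad_outputs->push_back(NoGradient());
  return scope.status();
}
REGISTER_GRADIENT_OP("MyPassthroughOp", MyPassthroughOpGrad);

}  // namespace
}  // namespace ops
}  // namespace tensorflow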
#ifndef TENSORFLOW_CC_OPS_WHILE_LOOP_H_ #define TENSORFLOW_CC_OPS_WHILE_LOOP_H_ +#include +#include + #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope.h" diff --git a/tensorflow/cc/profiler/BUILD b/tensorflow/cc/profiler/BUILD index 057ce7cb993..43240506f8c 100644 --- a/tensorflow/cc/profiler/BUILD +++ b/tensorflow/cc/profiler/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") package( diff --git a/tensorflow/cc/profiler/profiler.h b/tensorflow/cc/profiler/profiler.h index 64edbb5766c..dc60fd5fb37 100644 --- a/tensorflow/cc/profiler/profiler.h +++ b/tensorflow/cc/profiler/profiler.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CC_PROFILER_PROFILER_H_ #define TENSORFLOW_CC_PROFILER_PROFILER_H_ +#include +#include + #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/profiler/internal/tfprof_stats.h" @@ -56,7 +59,7 @@ namespace tfprof { class Profiler { public: /// `graph` is the model's GraphDef. - Profiler(const GraphDef& graph); + explicit Profiler(const GraphDef& graph); /// Adds tracing information `run_meta` to profiler. A `run_meta` is /// generated by a TensorFlow session run call. `step` is the key diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index a3ea0c75bc7..056c99eed8e 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -1,6 +1,8 @@ # Description: # TensorFlow SavedModel. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_android", @@ -19,10 +21,7 @@ package( licenses = ["notice"], # Apache 2.0 ) -exports_files([ - "LICENSE", - "loader.h", -]) +exports_files(["loader.h"]) cc_library( name = "constants", @@ -43,13 +42,15 @@ cc_library( name = "reader", srcs = ["reader.cc"], hdrs = ["reader.h"], - deps = [":constants"] + if_not_mobile([ + deps = [ + ":constants", + "//tensorflow/core:protos_all_cc", + ] + if_not_mobile([ # TODO(b/111634734): :lib and :protos_all contain dependencies that # cannot be built on mobile platforms. Instead, include the appropriate # tf_lib depending on the build platform. 
"@com_google_absl//absl/memory:memory", "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", ]), ) @@ -57,7 +58,7 @@ tf_cc_test( name = "reader_test", srcs = ["reader_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -149,7 +150,7 @@ tf_cc_test( name = "bundle_v2_test", srcs = ["bundle_v2_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -166,7 +167,7 @@ tf_cc_test( name = "saved_model_bundle_test", srcs = ["saved_model_bundle_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -188,7 +189,7 @@ tf_cc_test( name = "saved_model_bundle_lite_test", srcs = ["saved_model_bundle_lite_test.cc"], data = [ - ":saved_model_half_plus_two", + ":saved_model_test_files", ], linkstatic = 1, deps = [ @@ -209,11 +210,17 @@ tf_cc_test( py_binary( name = "testdata/generate_saved_models", srcs = ["testdata/generate_saved_models.py"], + data = [ + ":saved_model_asset_data", + ":saved_model_static_hashtable_asset_data", + ], python_version = "PY3", srcs_version = "PY3", deps = [ + "//tensorflow/python:client_testlib", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", + "//tensorflow/python:lookup_ops", "//tensorflow/python:tensor_spec", "//tensorflow/python:variables", "//tensorflow/python/compat:v2_compat", @@ -221,24 +228,47 @@ py_binary( "//tensorflow/python/module", "//tensorflow/python/saved_model", "//tensorflow/python/saved_model:save_options", + "//tensorflow/python/training/tracking", "@absl_py//absl:app", ], ) # TODO(b/32673259): add a test to continuously validate these files. filegroup( - name = "saved_model_half_plus_two", + name = "saved_model_test_files", srcs = glob([ + "testdata/AssetModule/**", "testdata/half_plus_two_pbtxt/**", "testdata/half_plus_two_main_op/**", "testdata/half_plus_two/**", "testdata/half_plus_two_v2/**", "testdata/x_plus_y_v2_debuginfo/**", "testdata/CyclicModule/**", + "testdata/StaticHashTableModule/**", "testdata/VarsAndArithmeticObjectGraph/**", + "testdata/fuzz_generated/**", ]), ) +alias( + name = "saved_model_half_plus_two", + actual = ":saved_model_test_files", +) + +filegroup( + name = "saved_model_asset_data", + srcs = [ + "testdata/test_asset.txt", + ], +) + +filegroup( + name = "saved_model_static_hashtable_asset_data", + srcs = [ + "testdata/static_hashtable_asset.txt", + ], +) + exports_files( glob([ "testdata/half_plus_two_pbtxt/**", @@ -248,5 +278,6 @@ exports_files( "testdata/x_plus_y_v2_debuginfo/**", "testdata/CyclicModule/**", "testdata/VarsAndArithmeticObjectGraph/**", + "testdata/fuzz_generated/**", ]), ) diff --git a/tensorflow/cc/saved_model/experimental/public/BUILD b/tensorflow/cc/saved_model/experimental/public/BUILD index 9640848ebf5..a0f8204c937 100644 --- a/tensorflow/cc/saved_model/experimental/public/BUILD +++ b/tensorflow/cc/saved_model/experimental/public/BUILD @@ -1,6 +1,8 @@ # Experimental C++ SavedModel Header Only APIs. See RFC # https://github.com/tensorflow/community/pull/207 +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( # This is intentionally public default_visibility = [ diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index ecefe7d0406..70d080a682f 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/cc/saved_model/reader.h" #include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/lib/io/path.h" @@ -73,26 +74,41 @@ uint64 GetLatencyMicroseconds(const uint64 start_microseconds) { // Ensure that constant tensors loaded from the saved model have valid shape. // Also ensure that constant nodes have a value assigned to them. // TODO(b/154763635): this is temporary and will be replaced with a better audit +static Status ValidateNode(const NodeDef& node) { + const auto node_iterator = node.attr().find("value"); + if (node_iterator != node.attr().end()) { + AttrValue node_value = node_iterator->second; + if (node_value.has_tensor()) { + const PartialTensorShape node_shape(node_value.tensor().tensor_shape()); + if (node_shape.num_elements() < 0) { + return errors::FailedPrecondition( + "Saved model contains node \"", node.name(), "\" (op \"", node.op(), + "\") which initializes from a tensor with ", + node_shape.num_elements(), " elements"); + } + } + } else if (node.op() == "Const") { + return errors::FailedPrecondition( + "Saved model contains node \"", node.name(), + "\" which is a constant tensor but no value has been provided"); + } + return Status::OK(); +} + static Status ValidateSavedTensors(const GraphDef& graph_def) { for (const auto& node : graph_def.node()) { - const auto node_iterator = node.attr().find("value"); - if (node_iterator != node.attr().end()) { - AttrValue node_value = node_iterator->second; - if (node_value.has_tensor()) { - const PartialTensorShape node_shape(node_value.tensor().tensor_shape()); - if (node_shape.num_elements() < 0) { - return errors::FailedPrecondition( - "Saved model contains node \"", node.name(), "\" (op \"", - node.op(), "\") which initializes from a tensor with ", - node_shape.num_elements(), " elements"); - } + TF_RETURN_IF_ERROR(ValidateNode(node)); + } + + if (graph_def.has_library()) { + const FunctionDefLibrary& library = graph_def.library(); + for (const auto& function : library.function()) { + for (const auto& node : function.node_def()) { + TF_RETURN_IF_ERROR(ValidateNode(node)); } - } else if (node.op() == "Const") { - return errors::FailedPrecondition( - "Saved model contains node \"", node.name(), - "\" which is a constant tensor but no value has been provided"); } } + return Status::OK(); } diff --git a/tensorflow/cc/saved_model/saved_model_bundle_test.cc b/tensorflow/cc/saved_model/saved_model_bundle_test.cc index 31f676920aa..127176002b9 100644 --- a/tensorflow/cc/saved_model/saved_model_bundle_test.cc +++ b/tensorflow/cc/saved_model/saved_model_bundle_test.cc @@ -45,6 +45,8 @@ constexpr char kTestFuzzGeneratedNegativeShape[] = "cc/saved_model/testdata/fuzz_generated/negative_shape"; constexpr char kTestFuzzGeneratedConstWithNoValue[] = "cc/saved_model/testdata/fuzz_generated/const_with_no_value"; +constexpr char kTestFuzzGeneratedBadNodeAttr[] = + "cc/saved_model/testdata/fuzz_generated/bad_node_attr"; class LoaderTest : public ::testing::Test { protected: @@ -308,6 +310,9 @@ TEST_F(LoaderTest, NegativeShapeDimension) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); + EXPECT_NE( + st.error_message().find("initializes from a tensor with -1 elements"), + std::string::npos); } TEST_F(LoaderTest, ConstNoValue) { @@ 
-320,6 +325,24 @@ TEST_F(LoaderTest, ConstNoValue) { Status st = LoadSavedModel(session_options, run_options, export_dir, {kSavedModelTagServe}, &bundle); EXPECT_FALSE(st.ok()); + EXPECT_NE( + st.error_message().find("constant tensor but no value has been provided"), + std::string::npos); +} + +TEST_F(LoaderTest, BadNodeAttr) { + SavedModelBundle bundle; + RunOptions run_options; + SessionOptions session_options; + + const string export_dir = + io::JoinPath(testing::TensorFlowSrcRoot(), kTestFuzzGeneratedBadNodeAttr); + Status st = LoadSavedModel(session_options, run_options, export_dir, + {kSavedModelTagServe}, &bundle); + EXPECT_FALSE(st.ok()); + EXPECT_NE( + st.error_message().find("constant tensor but no value has been provided"), + std::string::npos); } } // namespace diff --git a/tensorflow/cc/saved_model/testdata/AssetModule/assets/test_asset.txt b/tensorflow/cc/saved_model/testdata/AssetModule/assets/test_asset.txt new file mode 100644 index 00000000000..40d69b1aac4 --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/AssetModule/assets/test_asset.txt @@ -0,0 +1 @@ +TEST ASSET FILE CONTENTS diff --git a/tensorflow/cc/saved_model/testdata/AssetModule/saved_model.pb b/tensorflow/cc/saved_model/testdata/AssetModule/saved_model.pb new file mode 100644 index 00000000000..4bf99e03c22 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/AssetModule/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/AssetModule/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/AssetModule/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..4105bb4c15e Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/AssetModule/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/AssetModule/variables/variables.index b/tensorflow/cc/saved_model/testdata/AssetModule/variables/variables.index new file mode 100644 index 00000000000..3d903ca79a2 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/AssetModule/variables/variables.index differ diff --git a/tensorflow/cc/saved_model/testdata/StaticHashTableModule/assets/static_hashtable_asset.txt b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/assets/static_hashtable_asset.txt new file mode 100644 index 00000000000..e79f591665f --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/assets/static_hashtable_asset.txt @@ -0,0 +1,4 @@ +foo +bar +baz +wombat diff --git a/tensorflow/cc/saved_model/testdata/StaticHashTableModule/saved_model.pb b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/saved_model.pb new file mode 100644 index 00000000000..04e8ba62bdb Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/StaticHashTableModule/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..f6d62d9a51c Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/StaticHashTableModule/variables/variables.index b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/variables/variables.index new file mode 100644 index 00000000000..df6c85e5783 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/StaticHashTableModule/variables/variables.index 
differ diff --git a/tensorflow/examples/android/__init__.py b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/assets/empty similarity index 100% rename from tensorflow/examples/android/__init__.py rename to tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/assets/empty diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/saved_model.pb b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/saved_model.pb new file mode 100644 index 00000000000..0b33dbe7352 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..3fd3ba2223d Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/variables/variables.index b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/variables/variables.index new file mode 100644 index 00000000000..7357e8d57ed Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/bad_node_attr/variables/variables.index differ diff --git a/tensorflow/examples/android/jni/__init__.py b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/assets/empty similarity index 100% rename from tensorflow/examples/android/jni/__init__.py rename to tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/assets/empty diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/saved_model.pb similarity index 100% rename from tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value rename to tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/saved_model.pb diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..3fd3ba2223d Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.index b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.index new file mode 100644 index 00000000000..7357e8d57ed Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/const_with_no_value/variables/variables.index differ diff --git a/tensorflow/examples/tutorials/__init__.py b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/assets/empty similarity index 100% rename from tensorflow/examples/tutorials/__init__.py rename to tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/assets/empty diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/saved_model.pb similarity index 100% rename from tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape rename to 
tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/saved_model.pb diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..3fd3ba2223d Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.index b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.index new file mode 100644 index 00000000000..7357e8d57ed Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/fuzz_generated/negative_shape/variables/variables.index differ diff --git a/tensorflow/cc/saved_model/testdata/generate_saved_models.py b/tensorflow/cc/saved_model/testdata/generate_saved_models.py index 5f39ae0651d..2b64cf52096 100644 --- a/tensorflow/cc/saved_model/testdata/generate_saved_models.py +++ b/tensorflow/cc/saved_model/testdata/generate_saved_models.py @@ -29,9 +29,13 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.module import module +from tensorflow.python.ops import io_ops +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables +from tensorflow.python.platform import test from tensorflow.python.saved_model import save_options from tensorflow.python.saved_model import saved_model +from tensorflow.python.training.tracking import tracking class VarsAndArithmeticObjectGraph(module.Module): @@ -68,9 +72,42 @@ class CyclicModule(module.Module): self.child = ReferencesParent(self) +class AssetModule(module.Module): + + def __init__(self): + self.asset = tracking.Asset( + test.test_src_dir_path("cc/saved_model/testdata/test_asset.txt")) + + @def_function.function(input_signature=[]) + def read_file(self): + return io_ops.read_file(self.asset) + + +class StaticHashTableModule(module.Module): + """A module with an Asset, StaticHashTable, and a lookup function.""" + + def __init__(self): + self.asset = tracking.Asset( + test.test_src_dir_path( + "cc/saved_model/testdata/static_hashtable_asset.txt")) + self.table = lookup_ops.StaticHashTable( + lookup_ops.TextFileInitializer(self.asset, dtypes.string, + lookup_ops.TextFileIndex.WHOLE_LINE, + dtypes.int64, + lookup_ops.TextFileIndex.LINE_NUMBER), + -1) + + @def_function.function( + input_signature=[tensor_spec.TensorSpec(shape=None, dtype=dtypes.string)]) + def lookup(self, word): + return self.table.lookup(word) + + MODULE_CTORS = { "VarsAndArithmeticObjectGraph": VarsAndArithmeticObjectGraph, "CyclicModule": CyclicModule, + "AssetModule": AssetModule, + "StaticHashTableModule": StaticHashTableModule, } diff --git a/tensorflow/cc/saved_model/testdata/static_hashtable_asset.txt b/tensorflow/cc/saved_model/testdata/static_hashtable_asset.txt new file mode 100644 index 00000000000..e79f591665f --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/static_hashtable_asset.txt @@ -0,0 +1,4 @@ +foo +bar +baz +wombat diff --git a/tensorflow/cc/saved_model/testdata/test_asset.txt b/tensorflow/cc/saved_model/testdata/test_asset.txt new file mode 100644 index 00000000000..40d69b1aac4 --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/test_asset.txt @@ -0,0 +1 @@ +TEST ASSET FILE CONTENTS diff 
--git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD index a192c4bdb18..e8e128f9a16 100644 --- a/tensorflow/cc/tools/BUILD +++ b/tensorflow/cc/tools/BUILD @@ -1,6 +1,7 @@ # Description: # TensorFlow cc tools. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", @@ -11,8 +12,6 @@ package( licenses = ["notice"], # Apache 2.0 ) -exports_files(["LICENSE"]) - cc_library( name = "freeze_saved_model", srcs = ["freeze_saved_model.cc"], diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index ff255dd9cc1..4a41caf1d40 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") load("//tensorflow/core/platform:build_config.bzl", "if_llvm_aarch64_available", "if_llvm_system_z_available") @@ -74,7 +75,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", "@llvm-project//llvm:X86CodeGen", # fixdeps: keep - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", ] + if_llvm_system_z_available([ "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep ]) + if_llvm_aarch64_available([ diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index 5f6b3dc7101..06745de647b 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -1,3 +1,8 @@ +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "genrule") load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") @@ -331,9 +336,9 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo_profile_printer", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:regexp", "//third_party/eigen3", "@com_google_absl//absl/strings", ], @@ -554,9 +559,9 @@ tf_cc_test( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo_profile_printer", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/platform:regexp", "//third_party/eigen3", "@com_google_absl//absl/strings", ], diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 29f37bf7498..742cb308b3c 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -127,7 +127,7 @@ def tf_library( "$(location " + tfcompile_tool + ")" + " --config=$(location " + config + ")" + " --dump_fetch_nodes > $@"), - tools = [tfcompile_tool], + exec_tools = [tfcompile_tool], # Run tfcompile on the build host, rather than forge, since it's # typically way faster on the local machine. 
local = 1, @@ -162,7 +162,7 @@ def tf_library( "//tensorflow/python/tools:freeze_graph)" + freeze_args ), - tools = ["//tensorflow/python/tools:freeze_graph"], + exec_tools = ["//tensorflow/python/tools:freeze_graph"], tags = tags, ) tfcompile_graph = freeze_file @@ -242,7 +242,7 @@ def tf_library( " --out_function_object=$(@D)/" + function_object_file + " " + flags + " " + profiling_flag + " " + mlir_flag + " " + traceme_flag ), - tools = [tfcompile_tool], + exec_tools = [tfcompile_tool], visibility = visibility, testonly = testonly, # Run tfcompile on the build host since it's typically faster on the @@ -281,7 +281,7 @@ def tf_library( " --out_session_module=$(@D)/" + session_module_pb + " " + flags ), - tools = [tfcompile_tool], + exec_tools = [tfcompile_tool], visibility = visibility, testonly = testonly, local = 1, diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 35c6a8b0357..deb3396d89c 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,5 +1,19 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "tf_cc_test") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_copts") load("//tensorflow/stream_executor:build_defs.bzl", "if_cuda_or_rocm") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "cc_header_only_library") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps") load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library") load("//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags") @@ -15,6 +29,7 @@ package_group( "//tensorflow/compiler/tf2xla:internal", ], packages = [ + "//tensorflow/c/...", "//tensorflow/compiler/tests/...", "//tensorflow/python/...", ], @@ -65,8 +80,10 @@ cc_library( "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:cpu_plugin", - ], + ] + if_libtpu( + if_false = ["//tensorflow/compiler/xla/service:cpu_plugin"], + if_true = [], + ), alwayslink = 1, ) @@ -93,15 +110,19 @@ cc_library( ":jit_compilation_passes", ":xla_device", ":xla_kernel_creator", # buildcleaner: keep + "@com_google_absl//absl/memory", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", - "@com_google_absl//absl/memory", - ], + ] + if_libtpu( + if_false = [ + "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep + ], + if_true = [], + ), alwayslink = 1, ) @@ -114,17 +135,21 @@ cc_library( ":jit_compilation_passes", ":xla_device", ":xla_kernel_creator", # buildcleaner: keep + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - "//tensorflow/compiler/xla/service:gpu_plugin", # 
buildcleaner: keep "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core/common_runtime/gpu:gpu_init", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - ], + ] + if_libtpu( + if_false = [ + "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep + ], + if_true = [], + ), alwayslink = 1, ) @@ -182,7 +207,7 @@ XLA_DEVICE_DEPS = [ "//tensorflow/core:resource_variable_ops_op_lib", "//tensorflow/core:sendrecv_ops_op_lib", "//tensorflow/core:state_ops_op_lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:fifo_queue", "//tensorflow/core/kernels:function_ops", @@ -261,6 +286,7 @@ cc_library( "//tensorflow/compiler/xla:parse_flags_from_env", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/base", "@com_google_absl//absl/strings", ], @@ -269,7 +295,7 @@ cc_library( # Header-only version of "flags" library, for linking from the shared object # without ODR violations. cc_library( - name = "flags_headers_only", + name = "flags_headers", hdrs = ["flags.h"], visibility = [":friends"], deps = [ @@ -280,6 +306,11 @@ cc_library( ], ) +cc_header_only_library( + name = "flags_headers_only", + deps = [":flags_headers"], +) + cc_library( name = "common", srcs = [ @@ -328,10 +359,17 @@ cc_library( name = "xla_compilation_cache", srcs = ["xla_compilation_cache.cc"], hdrs = ["xla_compilation_cache.h"], + copts = tf_copts(), deps = [ + ":flags", ":xla_activity_listener", ":xla_activity_proto_cc", - "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", + "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_context", @@ -346,13 +384,13 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:logging", - "@com_google_absl//absl/base", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:inlined_vector", - "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", - ], + ] + if_libtpu( + if_false = [ + "//tensorflow/compiler/mlir:array_container_utils", + "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", + ], + if_true = [], + ), ) tf_cc_test( @@ -361,8 +399,11 @@ tf_cc_test( "xla_compilation_cache_test.cc", ], deps = [ + ":flags", ":xla_compilation_cache", + ":xla_cpu_jit", "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/xla/client:client_library", "//tensorflow/core:test", "//tensorflow/core:test_main", ], @@ -382,6 +423,72 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "get_compiler_ir", + srcs = ["get_compiler_ir.cc"], + hdrs = ["get_compiler_ir.h"], + visibility = [ + ":internal", + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + "//tensorflow/core/common_runtime/eager:__pkg__", + ], + deps = [ + ":common", + ":compilability_check_util", + ":flags", + ":xla_device_no_jit_rewrite_registration", + ":xla_launch_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + 
"//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:hlo_graph_dumper", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/common_runtime:core_cpu_internal", + "//tensorflow/core/common_runtime/eager:tensor_handle", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + ], + alwayslink = 1, +) + +# Header-only version of "flags" library, for linking from the shared object +# without ODR violations. +cc_library( + name = "get_compiler_ir_hdrs", + textual_hdrs = ["get_compiler_ir.h"], + visibility = [ + ":internal", + "//learning/brain/contrib/tpu_modeling/exp/tpu_inference_converter:__pkg__", + "//tensorflow/core/common_runtime/eager:__pkg__", + ], + deps = [ + "//tensorflow/compiler/xla:statusor", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + ], +) + +cc_header_only_library( + name = "get_compiler_ir_hdrs_only", + deps = [":get_compiler_ir_hdrs"], +) + +# This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. +cc_header_only_library( + name = "xla_jit_headers_lib", + visibility = ["//visibility:public"], + deps = [ + ":xla_cpu_device", + ":xla_cpu_jit", + ":xla_gpu_device", + ":xla_gpu_jit", + ], +) + cc_library( name = "xla_kernel_creator", srcs = [ @@ -604,7 +711,6 @@ cc_library( ":flags", ":resource_operation_safety_analysis", ":shape_inference_helpers", - ":union_find", ":xla_activity_listener", ":xla_cluster_util", "//tensorflow/cc:cc_ops", @@ -623,8 +729,8 @@ cc_library( "//tensorflow/compiler/tf2xla/cc:xla_ops", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", - "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -701,11 +807,6 @@ tf_cc_test( ], ) -cc_library( - name = "union_find", - hdrs = ["union_find.h"], -) - tf_cc_test( name = "deadness_analysis_test", size = "small", @@ -800,6 +901,7 @@ tf_cc_test( "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:test", + "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -886,7 +988,6 @@ cc_library( ":device_util", ":flags", ":resource_operation_safety_analysis", - ":union_find", ":xla_activity_listener", ":xla_activity_proto_cc", ":xla_cluster_util", @@ -895,6 +996,7 @@ cc_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -918,6 +1020,7 @@ tf_cc_test( ":xla_cpu_jit", "//tensorflow/cc:cc_ops", "//tensorflow/cc:function_ops", + "//tensorflow/cc:functional_ops", "//tensorflow/cc:ops", "//tensorflow/cc:scope", "//tensorflow/compiler/tf2xla:test_util", @@ -944,11 +1047,12 @@ tf_cc_test( ":xla_cpu_jit", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", + "//tensorflow/core:all_kernels", "//tensorflow/core:core_cpu", - "//tensorflow/core:direct_session_internal", "//tensorflow/core:framework", 
"//tensorflow/core:ops", "//tensorflow/core:test", + "//tensorflow/core/common_runtime:direct_session_internal", "//tensorflow/core/kernels:cwise_op", "//tensorflow/core/kernels:matmul_op", "//tensorflow/core/kernels:partitioned_function_ops", @@ -997,15 +1101,3 @@ cc_library( ], alwayslink = 1, ) - -# This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. -cc_header_only_library( - name = "xla_jit_headers_lib", - visibility = ["//visibility:public"], - deps = [ - ":xla_cpu_device", - ":xla_cpu_jit", - ":xla_gpu_device", - ":xla_gpu_jit", - ], -) diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc index 8463c788496..160ea83585d 100644 --- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc +++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc @@ -130,17 +130,6 @@ FunctionDefLibrary CreateFunctionDefLibWithConstFunction(const string& name) { return fdef_lib; } -FunctionDefLibrary CreateFunctionDefLibWithInt32Input(const string& name) { - FunctionDefLibrary fdef_lib; - FunctionDef func = FunctionDefHelper::Create( - /*function_name=*/name, /*in_def=*/{"in: int32"}, - /*out_def=*/{"out: int32"}, - /*attr_def=*/{}, /*node_def=*/{{{"out"}, "Identity", {"in"}}}, - /*ret_def=*/{{"out", "out:output:0"}}); - *fdef_lib.add_function() = std::move(func); - return fdef_lib; -} - TEST_F(BuildXlaOpsTest, ControlDepsPreserved) { const char* kXlaDeviceName = "/job:worker/replica:0/task:0/device:XLA_CPU:0"; Scope root = Scope::NewRootScope().WithDevice(kXlaDeviceName).ExitOnError(); @@ -269,6 +258,17 @@ TEST_F(BuildXlaOpsTest, NoExtraMergeForEdgeToSink) { } #ifdef GOOGLE_CUDA +FunctionDefLibrary CreateFunctionDefLibWithInt32Input(const string& name) { + FunctionDefLibrary fdef_lib; + FunctionDef func = FunctionDefHelper::Create( + /*function_name=*/name, /*in_def=*/{"in: int32"}, + /*out_def=*/{"out: int32"}, + /*attr_def=*/{}, /*node_def=*/{{{"out"}, "Identity", {"in"}}}, + /*ret_def=*/{{"out", "out:output:0"}}); + *fdef_lib.add_function() = std::move(func); + return fdef_lib; +} + // This tests a rewrite that only makes sense and is active in a CUDA-enabled // build. Specifically we check that we insert an IdentityN op to avoid extra // device-to-host copies. diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 6d4bc51f1b2..62e121420c3 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -36,7 +36,6 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" @@ -44,6 +43,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -84,6 +84,60 @@ Status MakeCallNodeFromAttribute(const Node& node, const std::string& attr_name, return Status::OK(); } +xla::StatusOr> MakeCallNodesFromAttribute( + const Node& node, absl::string_view attr_name, + absl::string_view call_name) { + std::vector attr_lists; + TF_RETURN_IF_ERROR(GetNodeAttr(node.attrs(), attr_name, &attr_lists)); + + std::vector out; + for (int i = 0; i < attr_lists.size(); i++) { + out.emplace_back(); + NodeDef& inserted = out.back(); + inserted.set_name(absl::StrCat(call_name, "_", i)); + inserted.set_op(attr_lists[i].name()); + *inserted.mutable_attr() = attr_lists[i].attr(); + } + return out; +} + +// Utility which searches for values in a sorted list by scanning over it once. +// No matter how many times ScanForValue is called, the list is scanned at most +// once. However, if a call to ScanForValue skips over a value, that value is +// not revisited in future calls to ScanForValue, so callers must take +// care to order their calls. +// +// Useful for merging multiple sorted lists in O(n) time. +class SinglePassSearch { + public: + // Creates a SinglePassSearch object that can be used to search in `values`. + // Does not take ownership of `values`. `values` must outlive this. + // `values` must be sorted. + explicit SinglePassSearch(absl::Span values) + : current_index_(0), values_(values) {} + + // Scans forward in the vector looking for "value", updating the internal + // position in to the vector. + // Returns true iff the vector contains the given value at or after current + // position. + // Not thread-safe. + bool ScanForValue(int value) { + while (current_index_ < values_.size() && + values_[current_index_] <= value) { + if (values_[current_index_] == value) { + current_index_++; + return true; + } + current_index_++; + } + return false; + } + + private: + int current_index_; + const absl::Span values_; +}; + } // anonymous namespace RecursiveCompilabilityChecker::UncompilableNodesMap @@ -190,6 +244,30 @@ bool RecursiveCompilabilityChecker::IsCompilableIf( return is_compilable; } +bool RecursiveCompilabilityChecker::IsCompilableCase( + const Node& case_node, FunctionLibraryRuntime* lib_runtime, + std::vector* stack_trace, + NameAttrList* encapsulating_function, + RecursiveCompilabilityChecker::UncompilableNodesMap* uncompilable_nodes) + const { + xla::StatusOr> calls = + MakeCallNodesFromAttribute(case_node, "branches", "branch"); + if (!calls.ok()) { + VLOG(2) << "Rejecting node " << case_node.name() << ": " + << "missing attribute 'branches'"; + return false; + } + + bool is_compilable = true; + + for (const NodeDef& call : *calls) { + is_compilable &= + IsCompilableCall(call, lib_runtime, stack_trace, encapsulating_function, + uncompilable_nodes); + } + return is_compilable; +} + // Tests whether 'while_node' is a completely compilable loop. // Every operator in the condition and body functions must be compilable for a // while loop to be compilable. 
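The SinglePassSearch helper added above is easy to misuse: queries must be issued in non-decreasing order, and a value skipped over is never revisited. A short hedged sketch of the intended calling pattern follows; it assumes the class defined above is in scope, and the ClassifyArguments function and index values are arbitrary illustrations.

#include <vector>

// Assumes the SinglePassSearch class defined above is visible here.
void ClassifyArguments() {
  // Both index lists must be sorted, mirroring how constant_arg_indices and
  // resource_arg_indices are used when classifying function arguments.
  std::vector<int> constant_arg_indices = {0, 3};
  std::vector<int> resource_arg_indices = {2, 5};
  SinglePassSearch constants_search(constant_arg_indices);
  SinglePassSearch resources_search(resource_arg_indices);
  for (int i = 0; i < 6; ++i) {
    // i is strictly increasing, so each list is scanned at most once across
    // the whole loop.
    const bool is_constant = constants_search.ScanForValue(i);
    const bool is_resource = resources_search.ScanForValue(i);
    (void)is_constant;
    (void)is_resource;
  }
}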
@@ -380,6 +458,13 @@ bool RecursiveCompilabilityChecker::IsCompilableNode( return false; } + if (op_filter_.require_always_compilable && node.IsCaseNode() && + !IsCompilableCase(node, lib_runtime, stack_trace, encapsulating_function, + uncompilable_nodes)) { + LogNotCompilable(node, "unsupported case"); + return false; + } + if (!op_filter_.allow_stateful_rng_ops && IsStatefulRandomOp(node.type_string())) { absl::string_view uncompilable_reason = "stateful random op"; @@ -518,23 +603,23 @@ RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( } } +// Returns `true` iff node has a given `attr` set to `true`. Returns `false` +// both for the missing attr, and the attr set to `false`. +static bool HasBoolAttr(const NodeDef& node, const char* attr) { + const auto& it = node.attr().find(attr); + return it != node.attr().end() && it->second.b(); +} + bool CanCreateXlaKernel(const NodeDef& node_def) { - // If kXlaMustCompileAttr is set on the node_def, use its value. - const auto& it = node_def.attr().find(kXlaMustCompileAttr); - return it != node_def.attr().end() && it->second.b(); + return HasBoolAttr(node_def, kXlaMustCompileAttr); } Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, - const NodeDef& node_def, + const NameAttrList& function, const FunctionBody** fbody, std::vector* constant_arg_indices, std::vector* resource_arg_indices) { FunctionLibraryRuntime::Handle handle; - // If node_def is not instantiable, e.g., the function does not exist, - // simply bail out. - NameAttrList function; - TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); - TF_RETURN_IF_ERROR( flr->Instantiate(function.name(), AttrSlice(&function.attr()), &handle)); *fbody = flr->GetFunctionBody(handle); @@ -564,4 +649,96 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, return Status::OK(); } +tensorflow::MemoryTypeVector GetInputMemoryTypes( + const tensorflow::FunctionBody* fbody, + absl::Span constant_arg_indices, + absl::Span resource_arg_indices) { + // Set input and output memory types. + tensorflow::MemoryTypeVector input_memory_types(fbody->arg_types.size(), + tensorflow::DEVICE_MEMORY); + // These indices are used only for optimization purposes. They allow us + // to loop over constant_arg_indices and resource_arg_indices only once + // while iterating over all the function arguments checking if it is a + // resource or a constant. + // The reason we optimized this code is because functions can have a lot of + // captured arguments. For example, the backward pass of ResNet50 takes in all + // 214 variables and a similar number of activations. + SinglePassSearch constants_search(constant_arg_indices); + SinglePassSearch resources_search(resource_arg_indices); + for (size_t i = 0; i < fbody->arg_types.size(); ++i) { + if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { + // Compile-time constants and resource handles are expected to be in + // host memory. 
+ input_memory_types[i] = tensorflow::HOST_MEMORY; + } + } + return input_memory_types; +} + +tensorflow::MemoryTypeVector GetOutputMemoryTypes( + const tensorflow::FunctionBody* fbody) { + tensorflow::MemoryTypeVector output_memory_types(fbody->ret_types.size(), + tensorflow::DEVICE_MEMORY); + for (size_t i = 0; i < fbody->ret_types.size(); ++i) { + if (fbody->ret_types[i] == tensorflow::DT_RESOURCE) { + output_memory_types[i] = tensorflow::HOST_MEMORY; + } + } + return output_memory_types; +} + +static auto const ops_triggering_xla_compilation = + new absl::flat_hash_set{"XlaBroadcastHelper", + "XlaConv", + "XlaDequantize", + "XlaDot", + "XlaDynamicSlice", + "XlaDynamicUpdateSlice", + "XlaEinsum", + "XlaGather", + "XlaIf", + "XlaKeyValueSort", + "XlaPad", + "XlaRecv", + "XlaReduce", + "XlaReduceWindow", + "XlaReplicaId", + "XlaScatter", + "XlaSelectAndScatter", + "XlaSelfAdjointEig", + "XlaSend", + "XlaSharding", + "XlaSort", + "XlaSpmdFullToShardShape", + "XlaSpmdShardToFullShape", + "XlaSvd", + "XlaWhile"}; + +static bool NodeCanTriggerXlaCompilation(const NodeDef& node) { + return node.attr().find(kXlaClusterIdAttr) != node.attr().end() || + HasBoolAttr(node, kXlaMustCompileAttr) || + HasBoolAttr(node, kXlaCompileAttr) || + HasBoolAttr(node, kXlaScopeAttr) || + HasBoolAttr(node, kXlaInternalScopeAttr) || + ops_triggering_xla_compilation->count(node.op()); +} + +bool CanTriggerXlaCompilation(const GraphDef& graph) { + for (const FunctionDef& function : graph.library().function()) { + for (const NodeDef& node : function.node_def()) { + if (NodeCanTriggerXlaCompilation(node)) { + return true; + } + } + } + + for (const NodeDef& node : graph.node()) { + if (NodeCanTriggerXlaCompilation(node)) { + return true; + } + } + + return false; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h index 3b20784cc29..65da072483b 100644 --- a/tensorflow/compiler/jit/compilability_check_util.h +++ b/tensorflow/compiler/jit/compilability_check_util.h @@ -26,11 +26,11 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -124,11 +124,16 @@ class RecursiveCompilabilityChecker { // Whether ops known to have numerical accuracy issues should be considered // compilable.. bool allow_inaccurate_ops = false; + + // Require the function to be always compilable, regardless whether some + // control flow branches might be dead for a given input. 
+ bool require_always_compilable = false; }; - RecursiveCompilabilityChecker(const OperationFilter* op_filter, - const DeviceType* jit_device_type) - : op_filter_(*op_filter), jit_device_type_(*jit_device_type) {} + RecursiveCompilabilityChecker(OperationFilter op_filter, + DeviceType jit_device_type) + : op_filter_(std::move(op_filter)), + jit_device_type_(std::move(jit_device_type)) {} using UncompilableNodesMap = std::map* stack_trace, + NameAttrList* encapsulating_function, + UncompilableNodesMap* uncompilable_nodes) const; + // Returns compilability of node def retrieved from `node`'s attribute with // name `attr_name`. bool ExtractNodeDefAndCheckCompilability( @@ -259,21 +272,20 @@ class RecursiveCompilabilityChecker { // Make sure we don't recurse infinitely on recursive functions. const size_t kMaxRecursionDepth = 10; - const OperationFilter& op_filter_; - const DeviceType& jit_device_type_; + const OperationFilter op_filter_; + const DeviceType jit_device_type_; }; RecursiveCompilabilityChecker::OperationFilter CreateOperationFilter( const XlaOpRegistry::DeviceRegistration& registration); -// Given a FunctionLibraryRuntime and a NodeDef calling a function in the -// runtime, returns this function's body in `fbody` as well as the indices -// of its constant and resource arguments. +// Given a FunctionLibraryRuntime and a `function`, returns this function's body +// in `fbody` as well as the indices of its constant and resource arguments. // `fbody` is owned by `flr`. // `constant_arg_indices` and `resource_arg_indices` should be empty vector. // They are sorted in ascending order on this function's return. Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, - const NodeDef& node_def, + const NameAttrList& function, const FunctionBody** fbody, std::vector* constant_arg_indices, std::vector* resource_arg_indices); @@ -282,6 +294,44 @@ Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, // set. bool CanCreateXlaKernel(const NodeDef& node_def); +// Returns memory types for the input. +// `constant_arg_indices` and `resource_arg_indices` are sorted arrays of +// indices corresponding to constant and resource arguments respectively. +// +// One might wonder, about the case where a compile-time constant argument +// (which must be in host memory) is also used as an input into an op, +// e.g. `Add`, that expects its inputs in device memory. Here is how it +// works now. +// First, what do we mean by "op expects an input in XYZ memory"? +// There are two types of "ops" here: the tf2xla kernel and the HLO +// computation it builds. The tf2xla kernel needs to retrieve the actual +// numeric value of the compile-time constant tensors, so it really expects +// them to be on in host memory. However, for other inputs, it refers to them +// using xla::ComputationDataHandle, which is just a symbolic handle that +// xla::ComputationBuilder assigns. How does this handle gets assigned for +// constant arguments? Even constant arguments get an _Arg node in the graph +// instantiated for Function compilation. The tf2xla kernel for constant _Arg +// nodes takes the constant value, converts it to XlaLiteral, and feeds it +// to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This +// constant XlaLiteral is included in the HLO graph, and subsequently, in +// the actual executable, which is copied to the device before being +// executed. Thus, when this executable runs, the constant is available in +// device memory. 
+tensorflow::MemoryTypeVector GetInputMemoryTypes( + const tensorflow::FunctionBody* fbody, + absl::Span constant_arg_indices, + absl::Span resource_arg_indices); + +// Returns output memory types. +// +// XlaLaunch kernel keeps all outputs (including constants, which it copies), +// in device memory except for resources. +tensorflow::MemoryTypeVector GetOutputMemoryTypes( + const tensorflow::FunctionBody* fbody); + +// Check whether graph can trigger XLA compilation. +bool CanTriggerXlaCompilation(const GraphDef& graph); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_COMPILABILITY_CHECK_UTIL_H_ diff --git a/tensorflow/compiler/jit/compilability_check_util_test.cc b/tensorflow/compiler/jit/compilability_check_util_test.cc index 3ea38e69ad9..9058b129589 100644 --- a/tensorflow/compiler/jit/compilability_check_util_test.cc +++ b/tensorflow/compiler/jit/compilability_check_util_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/cc/framework/scope.h" #include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/cc/ops/functional_ops.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -33,7 +34,16 @@ limitations under the License. namespace tensorflow { namespace { +AttrValue FuncListAttr(const absl::Span names) { + AttrValue attr; + for (const char* name : names) { + attr.mutable_list()->add_func()->set_name(name); + } + return attr; +} + constexpr char kFunctionalIfNodeName[] = "If"; +constexpr char kFunctionalCaseNodeName[] = "Case"; constexpr char kFunctionalWhileNodeName[] = "While"; constexpr char kCompilableFunctionName[] = "CompilableFn"; constexpr char kCompilableFunctionNodeName[] = "n_c"; @@ -75,8 +85,12 @@ class CompilabilityCheckUtilTest : public ::testing::Test { op_filter_.allow_inaccurate_ops = false; op_filter_.allow_slow_ops = false; - checker_ = absl::make_unique(&op_filter_, - &device_type_); + checker_ = CreateCompilabilityChecker(); + } + + std::unique_ptr CreateCompilabilityChecker() { + return absl::make_unique(op_filter_, + device_type_); } FunctionLibraryRuntime* GetFunctionLibraryRuntime() { @@ -354,5 +368,161 @@ TEST_F(CompilabilityCheckUtilTest, CheckFunctionalIfNode) { "unsupported op")); } +TEST_F(CompilabilityCheckUtilTest, CheckFunctionalCaseNode) { + FunctionDefLibrary flib; + *flib.add_function() = FunctionDefHelper::Define( + /*Function*/ kUncompilableFunctionName, + /*Inputs*/ {"n_a:float"}, + /*Outputs*/ {"n_c_uncompilable:float"}, + /*Attributes*/ {}, + // Node info + {{{kUncompilableFunctionNodeName}, "MissingKernel", {"n_a"}}}); + *flib.add_function() = FunctionDefHelper::Define( + /*Function*/ kUncompilableFunctionTwoName, + /*Inputs*/ {"n_a:float"}, + /*Outputs*/ {"n_d_uncompilable:float"}, + /*Attribute*/ {}, + // Node info + {{{kUncompilableFunctionNodeTwoName}, "MissingKernel", {"n_a"}}}); + + Scope root = Scope::NewRootScope().ExitOnError(); + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(flib)); + auto branch_index = ops::Placeholder(root.WithOpName("pred"), DT_INT32); + auto placeholder = ops::Placeholder(root.WithOpName("A"), DT_INT32); + std::vector inputes( + {NodeBuilder::NodeOut(placeholder.node())}); + Node* case_node; + TF_ASSERT_OK( + NodeBuilder(kFunctionalCaseNodeName, "Case", &root.graph()->flib_def()) + .Input(branch_index.node()) + .Input(inputes) + .Attr("branches", FuncListAttr({kUncompilableFunctionName, + kUncompilableFunctionTwoName})) + 
.Attr("Tout", {DT_INT32}) + .Finalize(root.graph(), &case_node)); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + TF_ASSERT_OK(root.ToGraph(graph.get())); + + flib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), flib)); + + auto case_node_it = std::find_if( + graph->nodes().begin(), graph->nodes().end(), + [&](const Node* n) { return n->name() == kFunctionalCaseNodeName; }); + EXPECT_NE(case_node_it, graph->nodes().end()); + auto* flib_runtime = GetFunctionLibraryRuntime(); + + op_filter_.require_always_compilable = false; + checker_ = CreateCompilabilityChecker(); + EXPECT_TRUE(checker_->IsCompilableNode(**case_node_it, flib_runtime)); + op_filter_.require_always_compilable = true; + checker_ = CreateCompilabilityChecker(); + EXPECT_FALSE(checker_->IsCompilableNode(**case_node_it, flib_runtime)); +} + +TEST_F(CompilabilityCheckUtilTest, TestCanNotTriggerXlaCompilation) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Scope root = Scope::NewRootScope().ExitOnError(); + FunctionDefLibrary library; + + FunctionDef identity_func = FunctionDefHelper::Create( + "IdentityFunc", + /*in_def=*/{"x:float"}, + /*out_def=*/{"res:float"}, + /*attr_def=*/{}, + /*node_def=*/{{{"t0"}, "Identity", {"x"}, {{"T", DT_FLOAT}}}}, + /*ret_def*/ {{"res", "t0:output"}}); + + *library.add_function() = identity_func; + + Output in = ops::Placeholder(root, DT_FLOAT); + NameAttrList b_name_attr; + b_name_attr.set_name("IdentityFunc"); + ops::PartitionedCall call(root.WithOpName("call"), {in}, {DT_FLOAT}, + b_name_attr); + + GraphDef graph_def; + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(library)); + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + + EXPECT_FALSE(CanTriggerXlaCompilation(graph_def)); +} + +TEST_F(CompilabilityCheckUtilTest, TestXlaOpsCanTriggerXlaCompilation) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Scope root = Scope::NewRootScope().ExitOnError(); + FunctionDefLibrary library; + + FunctionDef sort_func = FunctionDefHelper::Create( + "SortFunc", + /*in_def=*/{"x:float"}, + /*out_def=*/{"res:float"}, + /*attr_def=*/{}, + /*node_def=*/{{{"t0"}, "XlaSort", {"x"}, {{"T", DT_FLOAT}}}}, + /*ret_def*/ {{"res", "t0:output"}}); + + *library.add_function() = sort_func; + + Output in = ops::Placeholder(root, DT_FLOAT); + NameAttrList b_name_attr; + b_name_attr.set_name("SortFunc"); + ops::PartitionedCall call(root.WithOpName("call"), {in}, {DT_FLOAT}, + b_name_attr); + + GraphDef graph_def; + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(library)); + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + + EXPECT_TRUE(CanTriggerXlaCompilation(graph_def)); +} + +TEST_F(CompilabilityCheckUtilTest, TestCanTriggerXlaCompilation) { + GraphDefBuilder b(GraphDefBuilder::kFailImmediately); + Scope root = Scope::NewRootScope().ExitOnError(); + FunctionDefLibrary library; + + AttrValue true_attribute; + true_attribute.set_b(true); + + FunctionDef identity_func = FunctionDefHelper::Create( + "IdentityFunc", + /*in_def=*/{"x:float"}, + /*out_def=*/{"res:float"}, + /*attr_def=*/{}, + /*node_def=*/{{{"t0"}, "Identity", {"x"}, {{"T", DT_FLOAT}}}}, + /*ret_def*/ {{"res", "t0:output"}}); + + (*identity_func.mutable_attr())[kXlaMustCompileAttr] = true_attribute; + + FunctionDef call_identity = FunctionDefHelper::Create( + "CallIdentity", + /*in_def=*/{"x:float"}, + /*out_def=*/{"z:float"}, /*attr_def=*/{}, + /*node_def=*/ + {{{"func_call"}, + "PartitionedCall", + {"x"}, + {{"Tin", DataTypeSlice({DT_FLOAT})}, + {"Tout", DataTypeSlice({DT_FLOAT})}, + {"f", + 
FunctionDefHelper::FunctionRef("IdentityRef", {{"T", DT_FLOAT}})}, + {kXlaMustCompileAttr, true}}}}, + /*ret_def=*/{{"z", "func_call:output:0"}}); + + *library.add_function() = identity_func; + *library.add_function() = call_identity; + + Output in = ops::Placeholder(root, DT_FLOAT); + NameAttrList b_name_attr; + b_name_attr.set_name("CallIdentity"); + ops::PartitionedCall call(root.WithOpName("call"), {in}, {DT_FLOAT}, + b_name_attr); + + GraphDef graph_def; + TF_ASSERT_OK(root.graph()->AddFunctionLibrary(library)); + TF_ASSERT_OK(root.ToGraphDef(&graph_def)); + + EXPECT_TRUE(CanTriggerXlaCompilation(graph_def)); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/defs.cc b/tensorflow/compiler/jit/defs.cc index 4bea71e8fc1..84e1e36bcf6 100644 --- a/tensorflow/compiler/jit/defs.cc +++ b/tensorflow/compiler/jit/defs.cc @@ -28,4 +28,6 @@ const char* const kXlaScopeAttr = "_XlaScope"; // only when auto_jit is ON. const char* const kXlaInternalScopeAttr = "_XlaInternalScope"; +const char* const kXlaClusterIdAttr = "_xla_compile_id"; + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/defs.h b/tensorflow/compiler/jit/defs.h index 9eb4c2ca2e8..fa983db8df8 100644 --- a/tensorflow/compiler/jit/defs.h +++ b/tensorflow/compiler/jit/defs.h @@ -35,6 +35,9 @@ extern const char* const kXlaCompileAttr; // "_XlaCompile" extern const char* const kXlaScopeAttr; // "_XlaScope" extern const char* const kXlaInternalScopeAttr; // "_XlaInternalScope" +// The id of the compiled cluster. +extern const char* const kXlaClusterIdAttr; // "_xla_compile_id" + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_DEFS_H_ diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc index ed25baa62ff..4a5c79c02d9 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/node_def.pb.h" @@ -34,9 +35,6 @@ limitations under the License. namespace tensorflow { -const char* const EncapsulateXlaComputationsPass::kXlaClusterAttr = - "_xla_compile_id"; - namespace { const char* const kXlaClusterOutput = "XlaClusterOutput"; @@ -45,10 +43,7 @@ bool IsCpuGpuCompile(const Graph* graph) { for (Node* n : graph->nodes()) { string name; // Only consider nodes being compiled. - if (!GetNodeAttr(n->attrs(), - EncapsulateXlaComputationsPass::kXlaClusterAttr, &name) - .ok()) - continue; + if (!GetNodeAttr(n->attrs(), kXlaClusterIdAttr, &name).ok()) continue; // Early return for any node with a device that is not a CPU or GPU. DeviceNameUtils::ParsedName parsed; if (DeviceNameUtils::ParseFullName(n->requested_device(), &parsed)) { @@ -180,8 +175,7 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, retvals[i]->AddAttr("index", i); } - AddNodeAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, call_def->name(), - call_def); + AddNodeAttr(kXlaClusterIdAttr, call_def->name(), call_def); AddNodeAttr("_variable_start_index", variable_start_index, call_def); // Uniquify the function name. 
@@ -216,8 +210,8 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, // O(n) pass over the edges. for (const Edge* e : (*graph)->edges()) { if (!e->IsControlEdge() && - e->src()->attrs().Find(kXlaClusterAttr) != nullptr && - e->dst()->attrs().Find(kXlaClusterAttr) == nullptr && + e->src()->attrs().Find(kXlaClusterIdAttr) != nullptr && + e->dst()->attrs().Find(kXlaClusterIdAttr) == nullptr && e->dst()->type_string() != kXlaClusterOutput) { return errors::InvalidArgument( "Undeclared output of XLA computation. Some common causes of this " @@ -232,9 +226,9 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, auto output = absl::make_unique((*graph)->op_registry()); TF_RETURN_WITH_CONTEXT_IF_ERROR( - EncapsulateSubgraphsInFunctions(kXlaClusterAttr, **graph, RewriteSubgraph, - /*reuse_existing_functions=*/true, - &output, flib_def), + EncapsulateSubgraphsInFunctions( + kXlaClusterIdAttr, **graph, RewriteSubgraph, + /*reuse_existing_functions=*/true, &output, flib_def), "EncapsulateXlaComputationsPass failed"); graph->swap(output); return Status::OK(); @@ -246,7 +240,7 @@ Status RewriteSubgraph(const std::vector& arg_source_tensors, // while iterating. std::vector launch_nodes; for (Node* n : graph->nodes()) { - const string& name = GetNodeAttrString(n->attrs(), kXlaClusterAttr); + const string& name = GetNodeAttrString(n->attrs(), kXlaClusterIdAttr); if (!name.empty()) { launch_nodes.push_back(n); } diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h index 3057e4c7469..9931b23fa41 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.h @@ -34,8 +34,6 @@ namespace tensorflow { // XlaLaunch operators. class EncapsulateXlaComputationsPass : public GraphOptimizationPass { public: - static const char* const kXlaClusterAttr; // _xla_compile_id - Status Run(const GraphOptimizationPassOptions& options) override; // The following methods are public only for unit tests. diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc index cc177036591..61c9a3ff9c0 100644 --- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/cc/ops/function_ops.h" #include "tensorflow/cc/ops/resource_variable_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h" #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h" #include "tensorflow/compiler/tf2xla/test_util.h" @@ -46,19 +47,18 @@ static std::unique_ptr MakeOuterGraph( auto w = ops::Placeholder(scope.WithOpName("W"), DT_RESOURCE); NodeDef def; - TF_CHECK_OK( - NodeDefBuilder("launch0", function, &flib_def) - .Input(a.node()->name(), 0, DT_INT32) - .Input(b.node()->name(), 0, DT_FLOAT) - .Input(c.node()->name(), 0, DT_INT32) - .Input(d.node()->name(), 0, DT_FLOAT) - .Input(u.node()->name(), 0, DT_RESOURCE) - .Input(v.node()->name(), 0, DT_RESOURCE) - .Input(w.node()->name(), 0, DT_RESOURCE) - .Device("/gpu:0") - .Attr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0") - .Attr("_variable_start_index", 4) - .Finalize(&def)); + TF_CHECK_OK(NodeDefBuilder("launch0", function, &flib_def) + .Input(a.node()->name(), 0, DT_INT32) + .Input(b.node()->name(), 0, DT_FLOAT) + .Input(c.node()->name(), 0, DT_INT32) + .Input(d.node()->name(), 0, DT_FLOAT) + .Input(u.node()->name(), 0, DT_RESOURCE) + .Input(v.node()->name(), 0, DT_RESOURCE) + .Input(w.node()->name(), 0, DT_RESOURCE) + .Device("/gpu:0") + .Attr(kXlaClusterIdAttr, "launch0") + .Attr("_variable_start_index", 4) + .Finalize(&def)); Status status; Node* launch = scope.graph()->AddNode(def, &status); @@ -107,7 +107,7 @@ static std::unique_ptr MakeBodyGraph() { auto arg6 = ops::_Arg(scope.WithOpName("w_0_arg"), DT_RESOURCE, 6); auto add_attrs = [](Node* node) { - node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0"); + node->AddAttr(kXlaClusterIdAttr, "launch0"); node->set_requested_device("/gpu:0"); }; @@ -155,8 +155,7 @@ TEST(EncapsulateXlaComputations, DeterministicEncapsulate) { : ops::Add(scope.WithOpName("E"), a1, a0); auto add_attrs = [](Node* node) { - node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, - "launch0"); + node->AddAttr(kXlaClusterIdAttr, "launch0"); }; add_attrs(e.node()); @@ -216,7 +215,7 @@ TEST(EncapsulateXlaComputations, Encapsulate) { auto w = ops::Placeholder(scope.WithOpName("W"), DT_RESOURCE); auto add_attrs = [](Node* node) { - node->AddAttr(EncapsulateXlaComputationsPass::kXlaClusterAttr, "launch0"); + node->AddAttr(kXlaClusterIdAttr, "launch0"); node->set_requested_device("/gpu:0"); }; diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index a4a750bae0d..683acd0bae9 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -167,8 +167,8 @@ void AllocateAndParseFlags() { jitter_flags = new IntroduceFloatingPointJitterPassFlags; jitter_flags->jitter_amount = 1e-5; - mlir_flags = new MlirCommonFlags; - mlir_flags->tf_mlir_enable_mlir_bridge = false; + bool enable_mlir_bridge = false; + bool enable_mlir_bridge_flag_updated = false; auto setter_for_jitter_tensor_names = [](string sequence) { jitter_flags->tensor_names = absl::StrSplit(sequence, ','); @@ -217,12 +217,24 @@ void AllocateAndParseFlags() { "The amount of jitter to introduce. 
This amount is added to each " "element in the tensors named in `tensor_names."), - Flag("tf_mlir_enable_mlir_bridge", - &mlir_flags->tf_mlir_enable_mlir_bridge, - "Enables experimental MLIR-Based TensorFlow Compiler Bridge.")}); + Flag("tf_mlir_enable_mlir_bridge", &enable_mlir_bridge, + "Enables experimental MLIR-Based TensorFlow Compiler Bridge.", + &enable_mlir_bridge_flag_updated)}); AppendMarkForCompilationPassFlagsInternal(flag_list); xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list); + + mlir_flags = new MlirCommonFlags; + if (!enable_mlir_bridge_flag_updated) { + mlir_flags->tf_mlir_enable_mlir_bridge = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED; + } else if (enable_mlir_bridge) { + mlir_flags->tf_mlir_enable_mlir_bridge = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED; + } else { + mlir_flags->tf_mlir_enable_mlir_bridge = + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_DISABLED; + } } } // namespace @@ -268,4 +280,10 @@ void AppendMarkForCompilationPassFlags(std::vector* flag_list) { AppendMarkForCompilationPassFlagsInternal(flag_list); } +static std::atomic xla_compilation_disabled(false); + +void DisableXlaCompilation() { xla_compilation_disabled = true; } + +bool FailOnXlaCompilation() { return xla_compilation_disabled; } + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 6c54fc8825e..a0860da7b04 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/command_line_flags.h" namespace tensorflow { @@ -135,7 +136,7 @@ struct IntroduceFloatingPointJitterPassFlags { // Flags for common MLIR configurations. struct MlirCommonFlags { - bool tf_mlir_enable_mlir_bridge; + ConfigProto::Experimental::MlirBridgeRollout tf_mlir_enable_mlir_bridge; }; // Return a pointer to the DumpGraphFlags struct; @@ -162,6 +163,13 @@ MlirCommonFlags* GetMlirCommonFlags(); void AppendMarkForCompilationPassFlags( std::vector* flag_list); +// Disables XLA compilation, forces it to return an error message instead. Can +// be used by a server to ensure that JIT compilation is opt-in. +void DisableXlaCompilation(); + +// Returns `false` unless `DisableXlaCompilation` was called. +bool FailOnXlaCompilation(); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_FLAGS_H_ diff --git a/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc b/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc index 3ba32f07506..3692d1f3aba 100644 --- a/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc +++ b/tensorflow/compiler/jit/force_xla_constants_on_host_pass.cc @@ -38,10 +38,12 @@ Status ForceXlaConstantsOnHostPass::Run( std::vector constant_arg_indices; std::vector resource_arg_indices; + NameAttrList function; + TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node->def(), &function)); + // Force all constants to be on the host memory. 
TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( - flr, node->def(), &fbody, &constant_arg_indices, - &resource_arg_indices)); + flr, function, &fbody, &constant_arg_indices, &resource_arg_indices)); VLOG(3) << "Found constant arg indices: " << absl::StrJoin(constant_arg_indices, ", "); diff --git a/tensorflow/compiler/jit/get_compiler_ir.cc b/tensorflow/compiler/jit/get_compiler_ir.cc new file mode 100644 index 00000000000..08b3bea1084 --- /dev/null +++ b/tensorflow/compiler/jit/get_compiler_ir.cc @@ -0,0 +1,158 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/get_compiler_ir.h" + +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/jit/compilability_check_util.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/jit/xla_platform_info.h" +#include "tensorflow/compiler/tf2xla/const_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace tensorflow { + +static xla::StatusOr GetLocalExecutable( + const XlaCompiler::Options& options, + const XlaCompiler::CompileOptions& compile_options, + const NameAttrList& function, XlaCompilationCache* cache, + absl::Span args, const XlaCompiler& compiler) { + const XlaCompiler::CompilationResult* compilation_result = nullptr; + xla::LocalExecutable* executable = nullptr; + TF_RETURN_IF_ERROR(cache->Compile(options, function, args, compile_options, + XlaCompilationCache::CompileMode::kStrict, + &compilation_result, &executable)); + return executable; +} + +xla::StatusOr GetCompilerIr( + IrExportStage stage, ProcessFunctionLibraryRuntime* pflr, + absl::string_view func_name, Device* dev, EagerContext* context, + absl::Span inputs_handles) { + NameAttrList function; + function.set_name(std::string{func_name}); + + FunctionLibraryRuntime* flr = pflr->GetFLR(dev->name()); + ResourceMgr* rmgr = dev->resource_manager(); + + const FunctionBody* fbody = nullptr; + std::vector constant_arg_indices; + std::vector resource_arg_indices; + TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( + flr, function, &fbody, &constant_arg_indices, &resource_arg_indices)); + + MemoryTypeVector input_memory_types = + GetInputMemoryTypes(fbody, constant_arg_indices, resource_arg_indices); + MemoryTypeVector output_memory_types = GetOutputMemoryTypes(fbody); + + std::deque inputs_storage; + std::vector inputs; + inputs.reserve(inputs_handles.size()); + for (int i = 0; i < inputs_handles.size(); i++) { + const TensorHandle* th = 
inputs_handles[i]; + const Tensor* t; + // Handle owns the tensor. + TF_RETURN_IF_ERROR(th->Tensor(&t)); + if (absl::c_binary_search(constant_arg_indices, i)) { + // Need to make sure it's on the host. + inputs_storage.emplace_back(t->dtype(), t->shape()); + TF_RETURN_IF_ERROR( + th->CopyToDevice(*context, /*d=*/nullptr, &inputs_storage.back())); + inputs.push_back(&inputs_storage.back()); + } else { + inputs.push_back(t); + } + } + + std::vector variable_infos; + TF_RETURN_IF_ERROR(GetVariableInfosFromInputs( + rmgr, dev, inputs, resource_arg_indices, &variable_infos)); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); + + XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(dev); + + XlaCompilationCache* cache; + TF_RETURN_IF_ERROR(rmgr->LookupOrCreate( + rmgr->default_container(), "xla_cache", &cache, + [&](XlaCompilationCache** cache_write_into) { + return BuildXlaCompilationCache(dev, platform_info, cache_write_into); + })); + core::ScopedUnref cache_ref(cache); + + absl::optional tf_allocator_adapter; + + XlaCompiler::Options options = + GenerateCompilerOptions(*cache, *flr, dev, + /*stream=*/nullptr, platform_info, + /*has_ref_vars=*/false, &tf_allocator_adapter); + + XlaCompiler::CompileOptions compile_options; + compile_options.always_return_tuple = false; + compile_options.alias_resource_update = true; + + XlaCompiler compiler(options); + + xla::StatusOr> args = + XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_arg_indices, inputs, variable_infos); + TF_RETURN_IF_ERROR(args.status()); + + switch (stage) { + case IrExportStage::HLO: { + XlaCompiler::CompilationResult result; + TF_RETURN_IF_ERROR( + compiler.CompileFunction(compile_options, function, *args, &result)); + + TF_ASSIGN_OR_RETURN(xla::ProgramShape program_shape, + result.computation->GetProgramShape()); + xla::HloModuleConfig config(program_shape); + TF_ASSIGN_OR_RETURN( + std::unique_ptr new_module, + xla::HloModule::CreateFromProto(result.computation->proto(), config)); + + return new_module->ToString(); + } + case IrExportStage::OPTIMIZED_HLO: { + xla::StatusOr executable = GetLocalExecutable( + options, compile_options, function, cache, *args, compiler); + TF_RETURN_IF_ERROR(executable.status()); + return (*executable)->executable()->module().ToString(); + } + case IrExportStage::OPTIMIZED_HLO_DOT: { + xla::StatusOr executable = GetLocalExecutable( + options, compile_options, function, cache, *args, compiler); + TF_RETURN_IF_ERROR(executable.status()); + xla::StatusOr graph = xla::RenderGraph( + *(*executable)->executable()->module().entry_computation(), + "Visualization", + /*debug_options=*/{}, xla::RenderedGraphFormat::kDot, + /*hlo_execution_profile=*/nullptr, + /*hlo_render_options=*/{}); + TF_RETURN_IF_ERROR(graph.status()); + return *graph; + } + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/get_compiler_ir.h b/tensorflow/compiler/jit/get_compiler_ir.h new file mode 100644 index 00000000000..0a0a1a44271 --- /dev/null +++ b/tensorflow/compiler/jit/get_compiler_ir.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ +#define TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ + +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace tensorflow { + +class ProcessFunctionLibraryRuntime; +class Device; +class Tensor; +class TensorHandle; +class EagerContext; + +enum class IrExportStage { HLO, OPTIMIZED_HLO, OPTIMIZED_HLO_DOT }; + +// Returns HLO text for a given function `func_name` using library runtime +// `runtime` on a device `dev` with given `inputs`. +xla::StatusOr GetCompilerIr( + IrExportStage stage, ProcessFunctionLibraryRuntime* pflr, + absl::string_view func_name, Device* dev, EagerContext* context, + absl::Span inputs); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_GET_COMPILER_IR_H_ diff --git a/tensorflow/compiler/jit/graphcycles/BUILD b/tensorflow/compiler/jit/graphcycles/BUILD index 61d0c0de35f..23d994c27c5 100644 --- a/tensorflow/compiler/jit/graphcycles/BUILD +++ b/tensorflow/compiler/jit/graphcycles/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index eb9ad8a2e85..1f400137f5b 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = [ "//tensorflow/compiler/tf2xla:internal", @@ -32,7 +34,7 @@ XLA_OPS_DEPS = [ "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:state_ops_op_lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:tf_allocator_adapter", ] diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index de462928c46..0f0f43cbad6 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -158,12 +158,13 @@ XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, constants_(constants), resources_(resources), function_(function), - platform_info_(XlaPlatformInfoFromContext(ctx)), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())), has_ref_vars_(has_ref_vars) {} static Status CompileToLocalExecutable( OpKernelContext* ctx, const NameAttrList& function, bool has_ref_vars, const XlaPlatformInfo& platform_info, + absl::Span inputs, absl::Span variable_infos, absl::Span constants, bool lazy, bool may_alias_resource_update, xla::LocalClient** client, @@ -180,7 +181,7 @@ static Status CompileToLocalExecutable( TF_RETURN_IF_ERROR(rm->LookupOrCreate( rm->default_container(), "xla_cache", &cache, [&](XlaCompilationCache** cache) { - return BuildXlaCompilationCache(ctx, platform_info, cache); + return BuildXlaCompilationCache(ctx->device(), platform_info, cache); })); // Hold the reference to the JIT 
during evaluation. (We could probably // free it sooner because the ResourceMgr will retain a reference, but @@ -191,12 +192,9 @@ static Status CompileToLocalExecutable( absl::optional tf_allocator_adapter; XlaCompiler::Options options = GenerateCompilerOptions( - *cache, ctx, platform_info, has_ref_vars, &tf_allocator_adapter); - - std::map constant_args; - for (int i : constants) { - constant_args.insert({i, ctx->input(i)}); - } + *cache, *ctx->function_library(), ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info, has_ref_vars, &tf_allocator_adapter); XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; @@ -207,10 +205,11 @@ static Status CompileToLocalExecutable( !platform_info.is_on_xla_device() && may_alias_resource_update; - std::vector args; - TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_args, variable_infos, ctx, &args)); - return cache->Compile(options, function, args, compile_options, + xla::StatusOr> args = + XlaComputationLaunchContext::BuildXlaCompilerArguments(constants, inputs, + variable_infos); + TF_RETURN_IF_ERROR(args.status()); + return cache->Compile(options, function, *args, compile_options, lazy ? XlaCompilationCache::CompileMode::kLazy : XlaCompilationCache::CompileMode::kStrict, compilation_result, executable); @@ -220,6 +219,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { VLOG(1) << "XlaLocalLaunchOpBase::Compute " << Canonicalize(function_.name(), AttrSlice(&function_.attr())); + std::vector inputs = InputsFromContext(ctx); xla::LocalClient* client; const XlaCompiler::CompilationResult* compilation_result; xla::LocalExecutable* executable; @@ -227,10 +227,11 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { std::vector variable_infos; { OP_REQUIRES_OK( - ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); + ctx, GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, resources_, &variable_infos)); OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); Status s = CompileToLocalExecutable( - ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, + ctx, function_, /*has_ref_vars=*/has_ref_vars_, platform_info_, inputs, variable_infos, constants_, /*lazy=*/false, /*may_alias_resource_update=*/true, &client, &compilation_result, &executable); @@ -248,8 +249,10 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { VLOG(1) << "Executing XLA Computation..."; absl::optional tf_allocator_adapter; - se::DeviceMemoryAllocator* allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + se::DeviceMemoryAllocator* allocator = GetAllocator( + &tf_allocator_adapter, ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_); int device_ordinal = stream ? stream->parent()->device_ordinal() : client->default_device_ordinal(); XlaComputationLaunchContext launch_context( @@ -271,18 +274,6 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); - xla::ThenExecuteFunction then_execute; - if (ctx->op_device_context()) { - then_execute = [&](se::Stream* stream, std::function fn) { - Status status = ctx->op_device_context()->ThenExecute( - down_cast(ctx->device()), stream, std::move(fn)); - if (!status.ok()) { - // This should never happen. 
- LOG(ERROR) << "ThenExecute failed " << status; - } - }; - run_options.set_then_execute_function(&then_execute); - } Env* env = Env::Default(); auto start_time = env->NowMicros(); @@ -373,7 +364,7 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx) constants_(ConstantsVector(ctx)), resources_(ResourcesVector(ctx)), function_(FunctionAttr(ctx)), - platform_info_(XlaPlatformInfoFromContext(ctx)), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())), must_compile_(MustCompileAttr(ctx)), has_ref_vars_(HasRefVars(ctx)) {} @@ -385,6 +376,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { xla::LocalExecutable* executable; ResourceVarsSnapshot variables; + std::vector inputs = InputsFromContext(ctx); bool cannot_compile_cluster; { mutex_lock guard(cannot_compile_cluster_mu_); @@ -397,13 +389,14 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { } else { std::vector variable_infos; OP_REQUIRES_OK( - ctx, GetVariableInfosFromCtxInputs(ctx, resources_, &variable_infos)); + ctx, GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, resources_, &variable_infos)); OP_REQUIRES_OK(ctx, LockVariables(absl::MakeSpan(variable_infos))); // Do not alias resource updates as locking variables in XlaCompile and // unlocking them in XlaRun may lead to deadlocks. Status status = CompileToLocalExecutable( - ctx, function_, has_ref_vars_, platform_info_, variable_infos, + ctx, function_, has_ref_vars_, platform_info_, inputs, variable_infos, constants_, /*lazy=*/!must_compile_, /*may_alias_resource_update=*/false, &client, &kernel, &executable); @@ -461,7 +454,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { } XlaRunOp::XlaRunOp(OpKernelConstruction* ctx) - : OpKernel(ctx), platform_info_(XlaPlatformInfoFromContext(ctx)) {} + : OpKernel(ctx), platform_info_(XlaPlatformInfoFromDevice(ctx->device())) {} void XlaRunOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaRunOp " << def().name(); @@ -472,8 +465,10 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { XlaExecutableClosureStore::Global()->Consume(key); absl::optional tf_allocator_adapter; - se::DeviceMemoryAllocator* allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + se::DeviceMemoryAllocator* allocator = GetAllocator( + &tf_allocator_adapter, ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_); se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; int device_ordinal = stream ? stream->parent()->device_ordinal() @@ -515,18 +510,6 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); - xla::ThenExecuteFunction then_execute; - if (ctx->op_device_context()) { - then_execute = [&](se::Stream* stream, std::function fn) { - Status status = ctx->op_device_context()->ThenExecute( - down_cast(ctx->device()), stream, std::move(fn)); - if (!status.ok()) { - // This should never happen. 
- LOG(ERROR) << "ThenExecute failed " << status; - } - }; - run_options.set_then_execute_function(&then_execute); - } Env* env = Env::Default(); auto start_time = env->NowMicros(); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 19eb61b6f72..ada7766fcbb 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -32,12 +32,12 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/resource_operation_safety_analysis.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -1196,13 +1196,14 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() { continue; } - DeviceType jit_device_type(registration->compilation_device_name); - - RecursiveCompilabilityChecker::OperationFilter op_filter = + RecursiveCompilabilityChecker::OperationFilter filter = CreateOperationFilter(*registration); + filter.require_always_compilable = true; - if (!RecursiveCompilabilityChecker{&op_filter, &jit_device_type} - .IsCompilableNode(*node, lib_runtime)) { + RecursiveCompilabilityChecker checker( + filter, DeviceType{registration->compilation_device_name}); + + if (!checker.IsCompilableNode(*node, lib_runtime)) { continue; } @@ -1711,40 +1712,6 @@ std::atomic* GetPointerToFuel(int64 initial_value) { } } // anonymous namespace -bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, - RecursiveCompilabilityChecker::UncompilableNodesMap* - uncompilable_node_info) { - Device* device = flr->device(); - const XlaOpRegistry::DeviceRegistration* registration; - CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(), - ®istration)); - DeviceType jit_device_type(registration->compilation_device_name); - - // We can always *compile* resource operations, stateful RNGs and dummy ops, - // even if we are sometimes unable to auto-cluster them. - RecursiveCompilabilityChecker::OperationFilter op_filter; - op_filter.allow_resource_ops_in_called_functions = true; - op_filter.allow_stack_ops = true; - op_filter.allow_tensor_array_ops = true; - op_filter.allow_stateful_rng_ops = true; - op_filter.allow_control_trigger = true; - op_filter.allow_eliding_assert_and_checknumerics_ops = true; - op_filter.allow_ops_producing_or_consuming_variant = true; - op_filter.allow_slow_ops = true; - op_filter.allow_inaccurate_ops = true; - - RecursiveCompilabilityChecker checker{&op_filter, &jit_device_type}; - if (!uncompilable_node_info) { - // We do not need uncompilable node info. Just return the result. 
- return checker.IsCompilableCall(ndef, flr); - } - - RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_node_result = - checker.FindUncompilableNodes(ndef, flr); - uncompilable_node_info->swap(uncompilable_node_result); - return uncompilable_node_info->empty(); -} - Status MarkForCompilationPass::Run( const GraphOptimizationPassOptions& options) { MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags(); @@ -1837,7 +1804,9 @@ absl::flat_hash_map>* GetAllowlistTable() { "ConcatOffset", "Const", "MirrorPad", "Pack", "Pad", "PadV2", "Reverse", "ReverseV2", "ReverseSequence", "Slice", "Split", "SplitV", "StridedSlice", "StridedSliceGrad", "ResourceStridedSliceAssign", - "Tile", "Transpose", "InvertPermutation", "Unpack", "DeviceIndex"}}}; + "Tile", "Transpose", "InvertPermutation", "Unpack", "DeviceIndex", + "TensorStridedSliceUpdate", + }}}; // clang-format on return result; } @@ -1952,6 +1921,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "ParallelDynamicStitch", "ParameterizedTruncatedNormal", "PartitionedCall", + "Polygamma", "PopulationCount", "Qr", "QuantizeAndDequantizeV2", @@ -1996,6 +1966,8 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "ResourceScatterNdUpdate", "ResourceScatterSub", "ResourceScatterUpdate", + "RngReadAndSkip", + "RngSkip", "Roll", "ScatterNd", "SelfAdjointEigV2", @@ -2018,11 +1990,17 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "StatelessCase", "StatelessIf", "StatelessMultinomial", + "StatelessRandomGetKeyCounterAlg", "StatelessRandomNormal", + "StatelessRandomNormalV2", "StatelessRandomUniform", + "StatelessRandomUniformV2", "StatelessRandomUniformInt", + "StatelessRandomUniformIntV2", "StatelessRandomUniformFullInt", + "StatelessRandomUniformFullIntV2", "StatelessTruncatedNormal", + "StatelessTruncatedNormalV2", "StatelessWhile", "Svd", "SymbolicGradient", @@ -2049,6 +2027,8 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "TensorListSplit", "TensorListStack", "TensorScatterAdd", + "TensorScatterMax", + "TensorScatterMin", "TensorScatterSub", "TensorScatterUpdate", "TridiagonalSolve", @@ -2080,12 +2060,15 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "XlaSelectAndScatter", "XlaSelfAdjointEig", "XlaSend", + "XlaSetBound", "XlaSharding", "XlaSort", "XlaSpmdFullToShardShape", "XlaSpmdShardToFullShape", "XlaSvd", + "XlaVariadicReduce", "XlaWhile", + "Zeta", "_Arg", "_ArrayToList", "_ListToArray", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.h b/tensorflow/compiler/jit/mark_for_compilation_pass.h index 0e9a64e7f28..810ebf38b5c 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.h +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.h @@ -50,14 +50,6 @@ class MarkForCompilationPass : public GraphOptimizationPass { friend class MarkForCompilationPassTestHelper; }; -// Returns true iff 'ndef' is a call to a function that is compilable. A -// function is compilable iff every operator in the function body is -// compilable. If 'ndef' is not compilable and 'uncompilable_node_info' is not -// null, we will populate 'uncompilable_node_info' with uncompilable node info. 
-bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, - RecursiveCompilabilityChecker::UncompilableNodesMap* - uncompilable_node_info = nullptr); - absl::flat_hash_map>* GetAllowlistTable(); namespace testing { diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD index afc96a8e68c..6ca8fd0e34a 100644 --- a/tensorflow/compiler/jit/ops/BUILD +++ b/tensorflow/compiler/jit/ops/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_py") package( diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD index 412dfefb9b7..88ce43902fd 100644 --- a/tensorflow/compiler/jit/tests/BUILD +++ b/tensorflow/compiler/jit/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") licenses(["notice"]) # Apache 2.0 diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 971a5383f6b..461a6692c84 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -20,9 +20,9 @@ limitations under the License. #include "absl/base/call_once.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/xla_activity.pb.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -47,6 +47,11 @@ limitations under the License. #include "tensorflow/core/public/version.h" #include "tensorflow/core/util/dump_graph.h" +#if !defined(LIBTPU_ON_GCE) +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/utils/array_container_utils.h" +#endif + namespace tensorflow { constexpr int64 XlaCompilationCache::kDefaultCompilationThreshold; @@ -278,23 +283,39 @@ Status XlaCompilationCache::CompileSingleOp( const NodeDef& node_def = ctx->op_kernel().def(); TF_ASSIGN_OR_RETURN(auto graph, CreateGraph(node_def, args, result_dtypes)); + // TODO(b/155596779): Support TensorList args. bool has_tensor_list_arg = absl::c_any_of(args, [](const XlaCompiler::Argument arg) { return arg.kind == XlaCompiler::Argument::kTensorList; }); const ConfigProto* config = ctx->function_library()->config_proto(); - bool use_mlir = config && config->experimental().enable_mlir_bridge(); - // TODO(b/155596779): Support TensorList args. - if (!use_mlir || !has_tensor_list_arg) { + // TODO(b/171039585): Support tf.VarIsInitializedOp using MLIR. 
+ bool use_mlir = config && config->experimental().enable_mlir_bridge() && + !has_tensor_list_arg && + node_def.op() != "VarIsInitializedOp"; +#ifdef LIBTPU_ON_GCE + if (use_mlir) { + LOG(WARNING) << "MLIR is not supported in this environment."; + } + return compiler->CompileGraph(compile_options, node_def.name(), + std::move(graph), args, result); +#else + if (!use_mlir) { return compiler->CompileGraph(compile_options, node_def.name(), std::move(graph), args, result); } + VLOG(1) << "Using MLIR bridge"; GraphDebugInfo debug_info; + std::vector control_rets; + if (result_dtypes.empty()) { + control_rets.push_back(node_def.name()); + } return CompileGraphToXlaHlo( - *graph, {args.data(), args.size()}, options.device_type.type_string(), - compile_options.use_tuple_arg, *options.flib_def, debug_info, - options.shape_representation_fn, result); + *graph, mlir::SpanToArrayRef(args), control_rets, + options.device_type.type_string(), compile_options.use_tuple_arg, + *options.flib_def, debug_info, options.shape_representation_fn, result); +#endif }; return CompileImpl(options, name, args, compile_op, /*compile_threshold=*/absl::nullopt, @@ -323,6 +344,10 @@ Status XlaCompilationCache::CompileImpl( absl::optional compile_threshold, const XlaCompiler::CompilationResult** out_compilation_result, xla::LocalExecutable** out_executable) { + if (FailOnXlaCompilation()) { + return errors::Internal("XLA compilation disabled"); + } + DCHECK_NE(out_executable, nullptr); VLOG(2) << "XlaCompilationCache::Compile " << DebugString(); diff --git a/tensorflow/compiler/jit/xla_compilation_cache_test.cc b/tensorflow/compiler/jit/xla_compilation_cache_test.cc index 7227615d2bb..5578925b790 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache_test.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache_test.cc @@ -15,7 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/jit/xla_compilation_cache.h" +#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" @@ -52,6 +54,30 @@ TEST(XlaCompilationCacheTest, SignatureEquality) { } } +TEST(XlaCompilationCacheTest, TestDisabledXlaCompilation) { + NameAttrList fn; + fn.set_name("afunction"); + + DisableXlaCompilation(); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + DeviceType device_type = DeviceType(DEVICE_CPU_XLA_JIT); + + const XlaCompiler::CompilationResult* compilation_result; + xla::LocalExecutable* executable; + + auto cache = new XlaCompilationCache(client, device_type); + core::ScopedUnref cache_ref(cache); + + Status status = cache->Compile(XlaCompiler::Options{}, fn, {}, + XlaCompiler::CompileOptions{}, + XlaCompilationCache::CompileMode::kStrict, + &compilation_result, &executable); + EXPECT_FALSE(status.ok()); + EXPECT_TRUE( + absl::StrContains(status.error_message(), "XLA compilation disabled")); +} + static void BM_BuildSignature(int iters, int n_args) { NameAttrList fn; fn.set_name("afunction"); diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index da251c2c8f3..d092508eccf 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -49,8 +49,10 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, xla::LocalClient* client = static_cast(cache->client()); absl::optional tf_allocator_adapter; - se::DeviceMemoryAllocator* allocator = - GetAllocator(&tf_allocator_adapter, ctx, platform_info_); + se::DeviceMemoryAllocator* allocator = GetAllocator( + &tf_allocator_adapter, ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_); XlaComputationLaunchContext launch_context( client, allocator, client->default_device_ordinal(), /*allocate_xla_tensors=*/platform_info_.xla_device_metadata() != nullptr, @@ -101,53 +103,16 @@ Status XlaCompileOnDemandOp::Compile( OpKernelContext* ctx, const XlaCompiler::CompilationResult** result, XlaCompilationCache** cache, ResourceVarsSnapshot* variable_args, xla::LocalExecutable** executable) { - std::map constant_arguments; std::vector constant_input_indices; TF_RETURN_IF_ERROR(GetCompileTimeConstInputs( &ctx->op_kernel(), &constant_input_indices, ctx->function_library())); - CHECK(absl::c_is_sorted(constant_input_indices)); - - for (int64 i = 0; i < ctx->num_inputs(); ++i) { - const Tensor& device_tensor = ctx->input(i); - - if (const XlaTensor* xla_tensor = XlaTensor::FromTensor(&device_tensor)) { - if (xla_tensor->has_host_tensor()) { - if (absl::c_binary_search(constant_input_indices, i)) { - constant_arguments[i] = xla_tensor->host_tensor(); - } - } - } - - if (!constant_arguments.count(i)) { - if (absl::c_binary_search(constant_input_indices, i)) { - if (ctx->input_memory_type(i) != HOST_MEMORY && - ctx->op_device_context()) { - // Slow path; the argument is not available as a host constant so we - // must fetch it synchronously. 
- Tensor host_tensor; - AllocatorAttributes attrs; - attrs.set_on_host(true); - TF_RETURN_IF_ERROR(ctx->allocate_temp(device_tensor.dtype(), - device_tensor.shape(), - &host_tensor, attrs)); - Status status = ctx->op_device_context()->CopyDeviceTensorToCPUSync( - &device_tensor, "ConstantArgument", - reinterpret_cast(ctx->device()), &host_tensor); - if (!status.ok()) { - LOG(ERROR) << "Copying tensor of shape " - << device_tensor.shape().DebugString() << " from " - << ctx->device()->name() << "to CPU failed with " - << status.ToString(); - return status; - } - constant_arguments[i] = host_tensor; - } else { - constant_arguments[i] = device_tensor; - } - } - } + if (!absl::c_all_of(constant_input_indices, [&](int idx) { + return ctx->input_memory_type(idx) == HOST_MEMORY; + })) { + return errors::Internal("Unexpected device placement for a constant input"); } + std::vector inputs = InputsFromContext(ctx); // We store information about the JIT-compiled XLA computation // in the ResourceMgr. @@ -157,13 +122,16 @@ Status XlaCompileOnDemandOp::Compile( TF_RETURN_IF_ERROR(rm->LookupOrCreate( rm->default_container(), "xla_cache", cache, [&](XlaCompilationCache** write_into_cache) { - return BuildXlaCompilationCache(ctx, platform_info_, write_into_cache); + return BuildXlaCompilationCache(ctx->device(), platform_info_, + write_into_cache); })); absl::optional tf_allocator_adapter; - XlaCompiler::Options options = - GenerateCompilerOptions(**cache, ctx, platform_info_, - /*has_ref_vars=*/true, &tf_allocator_adapter); + XlaCompiler::Options options = GenerateCompilerOptions( + **cache, *ctx->function_library(), ctx->device(), + ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr, + platform_info_, + /*has_ref_vars=*/true, &tf_allocator_adapter); XlaCompiler::CompileOptions compile_options; compile_options.is_entry_computation = true; @@ -172,19 +140,23 @@ Status XlaCompileOnDemandOp::Compile( compile_options.always_return_tuple = false; std::vector variables_indices = GetResourceVariableIndices(ctx); - std::vector args; + xla::StatusOr> args; { std::vector variable_infos; TF_RETURN_IF_ERROR( - GetVariableInfosFromCtxInputs(ctx, variables_indices, &variable_infos)); + GetVariableInfosFromInputs(ctx->resource_manager(), ctx->device(), + inputs, variables_indices, &variable_infos)); + TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos))); TF_RETURN_IF_ERROR(SnapshotResourceVariables( ctx, variables_indices, variable_infos, variable_args)); - TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments( - constant_arguments, variable_infos, ctx, &args)); + + args = XlaComputationLaunchContext::BuildXlaCompilerArguments( + constant_input_indices, inputs, variable_infos); + TF_RETURN_IF_ERROR(args.status()); } - return (*cache)->CompileSingleOp(options, args, ctx, compile_options, result, + return (*cache)->CompileSingleOp(options, *args, ctx, compile_options, result, executable); } diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h index 095d3427d41..bb8ab889ce9 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -37,7 +37,8 @@ namespace tensorflow { class XlaCompileOnDemandOp : public OpKernel { public: explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) - : OpKernel(ctx), platform_info_(XlaPlatformInfoFromContext(ctx)) {} + : OpKernel(ctx), + platform_info_(XlaPlatformInfoFromDevice(ctx->device())) {} void 
Compute(OpKernelContext* ctx) override; private: diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index 446cd8944de..dd1ddb616f5 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -51,7 +51,7 @@ Status XlaCpuDeviceFactory::CreateDevices( std::vector>* devices) { XlaDeviceFlags* flags = GetXlaDeviceFlags(); if (!flags->tf_xla_enable_xla_devices) { - LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; + VLOG(1) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; return Status::OK(); } bool compile_on_demand = flags->tf_xla_compile_on_demand; diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index c47c9a29c1a..089d22dca03 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -573,8 +573,7 @@ XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, // Any op assigned to the device that isn't rewritten by the graph rewriter // gets executed by an XlaCompileOnDemandOp, which compiles it and executes // it just-in-time. - OpKernel* (*factory)(OpKernelConstruction*) = - [](OpKernelConstruction* context) -> OpKernel* { + auto factory = [](OpKernelConstruction* context) -> OpKernel* { return new XlaCompileOnDemandOp(context); }; XlaOpRegistry::RegisterCompilationKernels(); @@ -583,6 +582,13 @@ XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device, jit_device, /*include_compilation_only_kernels=*/false)) { KernelDef* def = new KernelDef(*jit_def); + const std::unordered_set* constant_inputs = + XlaOpRegistry::CompileTimeConstantInputArgNames(def->op()); + + for (const std::string& arg_name : *constant_inputs) { + def->add_host_memory_arg(arg_name); + } + def->set_device_type(device); registrations->op_kernel_registrars.emplace_back( new kernel_factory::OpKernelRegistrar(def, "XlaCompileOnDemandOp", diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index f7e7ee9cf95..6d6086ce0fa 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -94,6 +94,11 @@ class XlaDevice : public LocalDevice { static Status GetMetadata(OpKernelConstruction* ctx, const Metadata** metadata); + // Sets `*metadata` to the XlaDevice Metadata in the XLA device used by + // `device`. + static Status GetMetadataFromDevice(DeviceBase* device, + const XlaDevice::Metadata** metadata); + struct Options { // The StreamExecutor platform. Not owned. Must be non-null. 
se::Platform* platform = nullptr; @@ -196,8 +201,6 @@ class XlaDevice : public LocalDevice { xla::StatusOr> GetDeviceContextLocked() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); - static Status GetMetadataFromDevice(DeviceBase* device, - const XlaDevice::Metadata** metadata); Status MakeTensorFromProto(XlaDeviceContext* device_context, const TensorProto& tensor_proto, diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index e1cef25e33e..7bdd0aecb34 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -294,4 +294,12 @@ se::Stream* XlaDeviceContext::GetDeviceToDeviceStream() { return device_to_device_stream(stream); } +Status XlaDeviceContext::ThenExecute(Device* device, + stream_executor::Stream* stream, + std::function func) { + VLOG(2) << "XlaDeviceContext::ThenExecute"; + stream->ThenDoHostCallback(std::move(func)); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index 05d8dfa7556..5689e815a99 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -86,6 +86,9 @@ class XlaDeviceContext : public DeviceContext { // Returns a device-to-device stream, in round-robin fashion. se::Stream* GetDeviceToDeviceStream(); + Status ThenExecute(Device* device, stream_executor::Stream* stream, + std::function func) override; + private: bool UseMultipleStreams() const { return stream_ != host_to_device_stream_; } diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index 16f496d51a3..99ba5658819 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -66,7 +66,7 @@ class XlaGpuDeviceFactory : public DeviceFactory { Status XlaGpuDeviceFactory::ListPhysicalDevices(std::vector* devices) { XlaDeviceFlags* flags = GetXlaDeviceFlags(); if (!flags->tf_xla_enable_xla_devices) { - LOG(INFO) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; + VLOG(1) << "Not creating XLA devices, tf_xla_enable_xla_devices not set"; return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_kernel_creator.cc b/tensorflow/compiler/jit/xla_kernel_creator.cc index 3a6345afe9f..b90f8b7b990 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator.cc @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/kernels/xla_ops.h" -#include "tensorflow/compiler/jit/mark_for_compilation_pass.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" @@ -30,53 +29,51 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/ptr_util.h" -namespace { +namespace tensorflow { -// Utility which searches for values in a sorted list by scanning over it once. -// No matter how many times ScanForValue is called, the list is scanned at most -// once. However, if a call to ScanForValue skips over a value, that value is -// not revisited in future calls to ScanForValue, so callers must take -// care to order their calls. -// -// Useful for merging multiple sorted lists in O(n) time. 
-class SinglePassSearch { - public: - // Creates a SinglePassSearch object that can be used to search in `values`. - // Does not take ownership of `values`. `values` must outlive this. - // `values` must be sorted. - explicit SinglePassSearch(const std::vector* values) - : current_index_(0), values_(values) {} +// Returns true iff 'ndef' is a call to a function that is compilable. A +// function is compilable iff every operator in the function body is +// compilable. If 'ndef' is not compilable and 'uncompilable_node_info' is not +// null, we will populate 'uncompilable_node_info' with uncompilable node info. +static bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef, + RecursiveCompilabilityChecker::UncompilableNodesMap* + uncompilable_node_info) { + Device* device = flr->device(); + const XlaOpRegistry::DeviceRegistration* registration; + CHECK(XlaOpRegistry::GetCompilationDevice(device->device_type(), + ®istration)); - // Scans forward in the vector looking for "value", updating the internal - // position in to the vector. - // Returns true iff the vector contains the given value at or after current - // position. - // Not thread-safe. - bool ScanForValue(int value) { - while (current_index_ < values_->size() && - (*values_)[current_index_] <= value) { - if ((*values_)[current_index_] == value) { - current_index_++; - return true; - } - current_index_++; - } - return false; + // We can always *compile* resource operations, stateful RNGs and dummy ops, + // even if we are sometimes unable to auto-cluster them. + RecursiveCompilabilityChecker::OperationFilter op_filter; + op_filter.allow_resource_ops_in_called_functions = true; + op_filter.allow_stack_ops = true; + op_filter.allow_tensor_array_ops = true; + op_filter.allow_stateful_rng_ops = true; + op_filter.allow_control_trigger = true; + op_filter.allow_eliding_assert_and_checknumerics_ops = true; + op_filter.allow_ops_producing_or_consuming_variant = true; + op_filter.allow_slow_ops = true; + op_filter.allow_inaccurate_ops = true; + + RecursiveCompilabilityChecker checker{ + op_filter, DeviceType{registration->compilation_device_name}}; + if (!uncompilable_node_info) { + // We do not need uncompilable node info. Just return the result. + return checker.IsCompilableCall(ndef, flr); } - private: - int current_index_; - const std::vector* values_; -}; - -} // end namespace - -namespace tensorflow { + RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_node_result = + checker.FindUncompilableNodes(ndef, flr); + uncompilable_node_info->swap(uncompilable_node_result); + return uncompilable_node_info->empty(); +} bool XlaKernelCreator::CanCreateKernel( const FunctionLibraryRuntime& flr, const std::shared_ptr& props) const { - return CanCreateXlaKernel(props->node_def); + return CanCreateXlaKernel(props->node_def) && + !XlaOpRegistry::IsCompilationDevice(flr.device()->device_type()); } static Status CreateXlaKernel(FunctionLibraryRuntime* flr, @@ -92,7 +89,8 @@ static Status CreateXlaKernel(FunctionLibraryRuntime* flr, XlaOpRegistry::RegisterCompilationKernels(); // Only check for compilability if the MLIR bridge is not enabled. 
- if (!GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { + if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge != + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED) { RecursiveCompilabilityChecker::UncompilableNodesMap uncompilable_nodes_map; if (!IsCompilable(flr, node_def, &uncompilable_nodes_map)) { std::vector @@ -122,62 +120,19 @@ static Status CreateXlaKernel(FunctionLibraryRuntime* flr, } // Get function body, constant args, and resource args. + NameAttrList function; + TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); const FunctionBody* fbody = nullptr; std::vector constant_arg_indices; std::vector resource_arg_indices; TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( - flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); + flr, function, &fbody, &constant_arg_indices, &resource_arg_indices)); - // Set input and output memory types. - MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); - // These indices are used only for optimization purposes. They allow us - // to loop over constant_arg_indices and resource_arg_indices only once - // while iterating over all the function arguments checking if it is a - // resource or a constant. - // The reason we optimized this code is because functions can have a lot of - // captured arguments. For example, the backward pass of ResNet50 takes in all - // 214 variables and a similar number of activations. - SinglePassSearch constants_search(&constant_arg_indices); - SinglePassSearch resources_search(&resource_arg_indices); - for (size_t i = 0; i < fbody->arg_types.size(); ++i) { - if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { - // Compile-time constants and resource handles are expected to be in - // host memory. - input_memory_types[i] = HOST_MEMORY; - } - } - // One might wonder, about the case where a compile-time constant argument - // (which must be in host memory) is also used as an input into an op, - // e.g. Add, that expects its inputs in device memory. Here is how it - // works now. - // First, what do we mean by "op expects an input in XYZ memory"? - // There are two types of "ops" here: the tf2xla kernel and the HLO - // computation it builds. The tf2xla kernel needs to retrieve the actual - // numeric value of the compile-time constant tensors, so it really expects - // them to be on in host memory. However, for other inputs, it refers to them - // using xla::ComputationDataHandle, which is just a symbolic handle that - // xla::ComputationBuilder assigns. How does this handle gets assigned for - // constant arguments? Even constant arguments get an _Arg node in the graph - // instantiated for Function compilation. The tf2xla kernel for constant _Arg - // nodes takes the constant value, converts it to XlaLiteral, and feeds it - // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This - // constant XlaLiteral is included in the HLO graph, and subsequently, in - // the actual executable, which is copied to the device before being - // executed. Thus, when this executable runs, the constant is available in - // device memory. - - // XlaLaunch kernel keeps all outputs (including constants, which it copies), - // in device memory except for resources. 
- MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); - for (size_t i = 0; i < fbody->ret_types.size(); ++i) { - if (fbody->ret_types[i] == DT_RESOURCE) { - output_memory_types[i] = HOST_MEMORY; - } - } + MemoryTypeVector input_memory_types = + GetInputMemoryTypes(fbody, constant_arg_indices, resource_arg_indices); + MemoryTypeVector output_memory_types = GetOutputMemoryTypes(fbody); // Create the kernel. - NameAttrList function; - TF_RETURN_IF_ERROR(NameAndAttrsFromFunctionCall(node_def, &function)); Device* dev = flr->device(); Status s; auto props = std::make_shared( diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 19e2b5a2bb5..a0e60b1eafe 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -44,12 +44,6 @@ namespace { using xla::ScopedShapedBuffer; using xla::ShapedBuffer; -const char kPossibleNonVariableResourceHintMessage[] = - "If the error is similar to `Trying to access resource using the wrong " - "type`, this is likely because XLA only accepts Resource Variables as " - "inputs by snapshotting their values. Other TensorFlow resource types like " - "TensorList/TensorArray/Stack are not supported. Try removing non-variable " - "resource inputs to XLA."; } // anonymous namespace VariableInfo::VariableInfo(int index, absl::string_view name, Var* var) @@ -85,19 +79,22 @@ VariableInfo::~VariableInfo() { } } -// Returns a vector of VariableInfo instances for the resource variable inputs -// to the kernel with context `ctx`. The input indices for the resource -// variable inputs are in `variable_indices`. -Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, - absl::Span variable_indices, - std::vector* result) { +Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, + absl::Span inputs, + absl::Span variable_indices, + std::vector* result) { result->clear(); result->reserve(variable_indices.size()); for (int var_idx : variable_indices) { Var* variable = nullptr; - ResourceHandle handle = HandleFromInput(ctx, var_idx); - TF_RETURN_IF_ERROR( - LookupOrCreateResource(ctx, handle, &variable, [&](Var** ptr) { + ResourceHandle handle = inputs[var_idx]->flat()(0); + if (handle.device() != dev->attributes().name()) { + return errors::InvalidArgument( + "Trying to access resource ", handle.name(), " located in device ", + handle.device(), " from device ", dev->attributes().name()); + } + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + handle.container(), handle.name(), &variable, [](Var** ptr) { // This var is uninitialized for now. 
*ptr = new Var(DT_INVALID); return Status::OK(); @@ -107,6 +104,15 @@ Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, return Status::OK(); } +std::vector InputsFromContext(OpKernelContext* ctx) { + std::vector inputs; + inputs.reserve(ctx->num_inputs()); + for (int input_idx = 0; input_idx < ctx->num_inputs(); input_idx++) { + inputs.push_back(&ctx->input(input_idx)); + } + return inputs; +} + Status LockVariables(absl::Span variables) { std::vector lock_order(variables.size()); std::iota(lock_order.begin(), lock_order.end(), 0); @@ -358,9 +364,6 @@ static Status SetOutputForConstant( ctx->set_output(output_num, const_tensor); output_tensor = ctx->mutable_output(output_num); } - if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) { - xla_tensor->set_host_tensor(const_tensor); - } return Status::OK(); } @@ -557,11 +560,14 @@ Status XlaComputationLaunchContext::PopulateOutputs( return Status::OK(); } -Status XlaComputationLaunchContext::BuildXlaCompilerArguments( - const std::map& must_be_constant_args, - absl::Span variable_args, OpKernelContext* ctx, - std::vector* args) { - args->resize(ctx->num_inputs()); +xla::StatusOr> +XlaComputationLaunchContext::BuildXlaCompilerArguments( + absl::Span must_be_constant_idxs, + absl::Span inputs, + absl::Span variable_args) { + CHECK(absl::c_is_sorted(must_be_constant_idxs)); + std::vector out; + out.resize(inputs.size()); absl::flat_hash_map variable_info_lookup; for (const VariableInfo& info : variable_args) { @@ -571,33 +577,20 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( variable_info_lookup.emplace(info.index(), &info); } - for (int64 input_num = 0; input_num < ctx->num_inputs(); ++input_num) { - XlaCompiler::Argument& arg = (*args)[input_num]; + for (int64 input_num = 0; input_num < inputs.size(); ++input_num) { + const Tensor* input = inputs[input_num]; - if (must_be_constant_args.count(input_num) > 0) { + XlaCompiler::Argument& arg = out[input_num]; + if (absl::c_binary_search(must_be_constant_idxs, input_num)) { // Handles compile-time constants. - const Tensor& input = must_be_constant_args.at(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); + TF_RET_CHECK(input->dtype() != DT_RESOURCE); arg.kind = XlaCompiler::Argument::kConstant; - arg.type = input.dtype(); - arg.shape = input.shape(); - arg.constant_value = input; - } else if (variable_info_lookup.count(input_num) == 0) { - // Handles the non-constant arguments. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() != DT_RESOURCE); - if (input.NumElements() > 0) { - arg.kind = XlaCompiler::Argument::kParameter; - } else { - arg.kind = XlaCompiler::Argument::kConstant; - arg.constant_value = input; - } - arg.type = input.dtype(); - arg.shape = input.shape(); - } else { + arg.type = input->dtype(); + arg.shape = input->shape(); + arg.constant_value = *input; + } else if (variable_info_lookup.count(input_num)) { // Handles resource variables. - const Tensor& input = ctx->input(input_num); - TF_RET_CHECK(input.dtype() == DT_RESOURCE); + TF_RET_CHECK(input->dtype() == DT_RESOURCE); const VariableInfo& variable = *variable_info_lookup[input_num]; arg.name = std::string(variable.name()); arg.kind = XlaCompiler::Argument::kResource; @@ -616,10 +609,21 @@ Status XlaComputationLaunchContext::BuildXlaCompilerArguments( arg.type = DT_INVALID; arg.shape = TensorShape(); } + } else { + // Normal inputs. 
+ TF_RET_CHECK(input->dtype() != DT_RESOURCE); + if (input->NumElements() > 0) { + arg.kind = XlaCompiler::Argument::kParameter; + } else { + arg.kind = XlaCompiler::Argument::kConstant; + arg.constant_value = *input; + } + arg.type = input->dtype(); + arg.shape = input->shape(); } } - return Status::OK(); + return out; } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index b34b3059a4f..ac085a022c8 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -109,12 +109,16 @@ Status SnapshotResourceVariables(OpKernelContext* ctx, Status LockVariables(absl::Span variables) TF_EXCLUSIVE_LOCK_FUNCTION(); -// Returns a vector of VariableInfo instances for the resource variable inputs -// to the kernel with context `ctx`. The input indices for the resource +// Returns a vector of VariableInfo instances for the resource variable inputs, +// given that *all* inputs are in `inputs`. The input indices for the resource // variable inputs are in `variable_indices`. -Status GetVariableInfosFromCtxInputs(OpKernelContext* ctx, - absl::Span variable_indices, - std::vector* result); +Status GetVariableInfosFromInputs(ResourceMgr* rm, DeviceBase* dev, + absl::Span inputs, + absl::Span variable_indices, + std::vector* result); + +// Returns pointers to inputs stored in `ctx`. +std::vector InputsFromContext(OpKernelContext* ctx); // Helper class to perform the marshalling of TensorFlow inputs and outputs to // ShapedBuffers suitable for passing to an XLA computation. @@ -136,10 +140,10 @@ class XlaComputationLaunchContext { // Builds a XlaCompiler::Argument vector from the arguments to an XlaLaunch // op. // Precondition: variables in `variable_args` are locked. - static Status BuildXlaCompilerArguments( - const std::map& constant_args, - absl::Span variable_args, OpKernelContext* ctx, - std::vector* args); + static xla::StatusOr> + BuildXlaCompilerArguments(absl::Span must_be_constant_idxs, + absl::Span inputs, + absl::Span variable_args); // Add all inputs within `ctx` as XLA arguments (returned by arguments()). // `variables` is a map from TensorFlow argument number to resource variable. 
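The xla_launch_util refactor above replaces the OpKernelContext-centric helpers with functions that take the inputs, ResourceMgr, and DeviceBase explicitly, and BuildXlaCompilerArguments now returns a StatusOr instead of filling an output parameter. As a reading aid, here is a minimal sketch of how the new helpers chain together, assembled from the call sites in this patch; the wrapper function name `BuildArgumentsForCompilation` is hypothetical, and the template parameters are reconstructed from the surrounding signatures, so treat them as an approximation rather than the exact upstream spelling.

```c++
#include "absl/types/span.h"
#include "tensorflow/compiler/jit/xla_launch_util.h"
#include "tensorflow/core/framework/op_kernel.h"

namespace tensorflow {

// Hypothetical helper mirroring the new flow in XlaCompileOnDemandOp::Compile.
Status BuildArgumentsForCompilation(
    OpKernelContext* ctx, const std::vector<int>& constant_input_indices,
    std::vector<XlaCompiler::Argument>* out_args) {
  // Inputs are now passed around explicitly instead of being pulled from the
  // kernel context inside each helper.
  std::vector<const Tensor*> inputs = InputsFromContext(ctx);
  std::vector<int> variables_indices = GetResourceVariableIndices(ctx);

  // Variable lookup takes the ResourceMgr and device rather than the context,
  // and the VariableInfos must be locked before building arguments.
  std::vector<VariableInfo> variable_infos;
  TF_RETURN_IF_ERROR(GetVariableInfosFromInputs(
      ctx->resource_manager(), ctx->device(), inputs, variables_indices,
      &variable_infos));
  TF_RETURN_IF_ERROR(LockVariables(absl::MakeSpan(variable_infos)));

  // The argument builder now returns a StatusOr<std::vector<...>>.
  xla::StatusOr<std::vector<XlaCompiler::Argument>> args =
      XlaComputationLaunchContext::BuildXlaCompilerArguments(
          constant_input_indices, inputs, variable_infos);
  TF_RETURN_IF_ERROR(args.status());
  *out_args = std::move(*args);
  return Status::OK();
}

}  // namespace tensorflow
```

Taking DeviceBase, ResourceMgr, and Stream parameters instead of an OpKernelContext is what lets the same helpers serve both the regular launch path and XlaCompileOnDemandOp in this patch.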
diff --git a/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc b/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc index 82510a4926b..6c6c490e032 100644 --- a/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc +++ b/tensorflow/compiler/jit/xla_ops_on_regular_devices.cc @@ -38,18 +38,29 @@ namespace tensorflow { XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaDot").Device(DEVICE), \ XlaCompileOnDemandOp); \ - REGISTER_KERNEL_BUILDER(Name("XlaDynamicSlice").Device(DEVICE), \ - XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("XlaDynamicSlice").HostMemory("size_indices").Device(DEVICE), \ + XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaDynamicUpdateSlice").Device(DEVICE), \ XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaIf").Device(DEVICE), XlaCompileOnDemandOp); \ - REGISTER_KERNEL_BUILDER(Name("XlaPad").Device(DEVICE), \ + REGISTER_KERNEL_BUILDER(Name("XlaPad") \ + .HostMemory("padding_low") \ + .HostMemory("padding_high") \ + .HostMemory("padding_interior") \ + .Device(DEVICE), \ XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaRecv").Device(DEVICE), \ XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaReduce").Device(DEVICE), \ XlaCompileOnDemandOp); \ - REGISTER_KERNEL_BUILDER(Name("XlaReduceWindow").Device(DEVICE), \ + REGISTER_KERNEL_BUILDER(Name("XlaReduceWindow") \ + .HostMemory("window_dimensions") \ + .HostMemory("window_strides") \ + .HostMemory("base_dilations") \ + .HostMemory("window_dilations") \ + .HostMemory("padding") \ + .Device(DEVICE), \ XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaSelectAndScatter") \ .HostMemory("window_dimensions") \ @@ -75,11 +86,9 @@ namespace tensorflow { XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaReplicaId").Device(DEVICE), \ XlaCompileOnDemandOp); \ - REGISTER_KERNEL_BUILDER(Name("XlaGather") \ - .HostMemory("start_indices") \ - .HostMemory("slice_sizes") \ - .Device(DEVICE), \ - XlaCompileOnDemandOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("XlaGather").HostMemory("slice_sizes").Device(DEVICE), \ + XlaCompileOnDemandOp); \ REGISTER_KERNEL_BUILDER(Name("XlaScatter").Device(DEVICE), \ XlaCompileOnDemandOp); diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc index a5e12b37563..b38bf9282b1 100644 --- a/tensorflow/compiler/jit/xla_platform_info.cc +++ b/tensorflow/compiler/jit/xla_platform_info.cc @@ -19,7 +19,7 @@ limitations under the License. 
namespace tensorflow { -Status BuildXlaCompilationCache(OpKernelContext* ctx, +Status BuildXlaCompilationCache(DeviceBase* device, const XlaPlatformInfo& platform_info, XlaCompilationCache** cache) { if (platform_info.xla_device_metadata()) { @@ -59,7 +59,7 @@ Status BuildXlaCompilationCache(OpKernelContext* ctx, xla::LocalClientOptions client_options; client_options.set_platform(platform.ValueOrDie()); client_options.set_intra_op_parallelism_threads( - ctx->device()->tensorflow_cpu_worker_threads()->num_threads); + device->tensorflow_cpu_worker_threads()->num_threads); auto client = xla::ClientLibrary::GetOrCreateLocalClient(client_options); if (!client.ok()) { return client.status(); @@ -75,21 +75,21 @@ Status BuildXlaCompilationCache(OpKernelContext* ctx, return Status::OK(); } -XlaPlatformInfo XlaPlatformInfoFromContext(OpKernelConstruction* ctx) { - DeviceType device_type = ctx->device_type(); +XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device_base) { + auto device = static_cast(device_base); se::Platform::Id platform_id = nullptr; const XlaDevice::Metadata* xla_device_metadata = nullptr; se::DeviceMemoryAllocator* custom_allocator = nullptr; - if (ctx->device_type() == DeviceType(DEVICE_CPU)) { + if (device->device_type() == DEVICE_CPU) { platform_id = se::host::kHostPlatformId; - } else if (ctx->device_type() == DeviceType(DEVICE_GPU)) { - platform_id = ctx->device() - ->tensorflow_gpu_device_info() + } else if (device->device_type() == DEVICE_GPU) { + platform_id = device->tensorflow_gpu_device_info() ->stream->parent() ->platform() ->id(); - } else if (XlaDevice::GetMetadata(ctx, &xla_device_metadata).ok()) { + } else if (XlaDevice::GetMetadataFromDevice(device, &xla_device_metadata) + .ok()) { // If we are on an XlaDevice, use the underlying XLA platform's allocator // directly. We could use the StreamExecutor's allocator which may // theoretically be more correct, but XLA returns a nice OOM message in a @@ -104,47 +104,46 @@ XlaPlatformInfo XlaPlatformInfoFromContext(OpKernelConstruction* ctx) { xla_device_metadata->client()->backend().memory_allocator(); } - return XlaPlatformInfo(device_type, platform_id, xla_device_metadata, - custom_allocator); + return XlaPlatformInfo(DeviceType(device->device_type()), platform_id, + xla_device_metadata, custom_allocator); } se::DeviceMemoryAllocator* GetAllocator( absl::optional* tf_allocator_adapter, - OpKernelContext* ctx, const XlaPlatformInfo& platform_info) { + DeviceBase* device, se::Stream* stream, + const XlaPlatformInfo& platform_info) { if (platform_info.custom_allocator()) { return platform_info.custom_allocator(); } - if (!ctx->op_device_context()) { + if (!stream) { // Stream is not set for the host platform. 
se::Platform* platform = se::MultiPlatformManager::PlatformWithId(platform_info.platform_id()) .ValueOrDie(); - tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), platform); + tf_allocator_adapter->emplace(device->GetAllocator({}), platform); return &tf_allocator_adapter->value(); } - tf_allocator_adapter->emplace(ctx->device()->GetAllocator({}), - ctx->op_device_context()->stream()); + tf_allocator_adapter->emplace(device->GetAllocator({}), stream); return &tf_allocator_adapter->value(); } XlaCompiler::Options GenerateCompilerOptions( - const XlaCompilationCache& cache, OpKernelContext* ctx, - const XlaPlatformInfo& platform_info, bool has_ref_vars, + const XlaCompilationCache& cache, + const FunctionLibraryRuntime& function_library, DeviceBase* device, + se::Stream* stream, const XlaPlatformInfo& platform_info, bool has_ref_vars, absl::optional* tf_allocator_adapter) { - CHECK(ctx->function_library()); XlaCompiler::Options options; options.client = static_cast(cache.client()); - if (ctx->op_device_context() != nullptr) { - options.device_ordinal = - ctx->op_device_context()->stream()->parent()->device_ordinal(); + if (stream != nullptr) { + options.device_ordinal = stream->parent()->device_ordinal(); } options.device_type = cache.device_type(); - options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); - options.graph_def_version = ctx->function_library()->graph_def_version(); + options.flib_def = function_library.GetFunctionLibraryDefinition(); + options.graph_def_version = function_library.graph_def_version(); options.allow_cpu_custom_calls = (platform_info.platform_id() == se::host::kHostPlatformId); options.device_allocator = - GetAllocator(tf_allocator_adapter, ctx, platform_info); + GetAllocator(tf_allocator_adapter, device, stream, platform_info); if (platform_info.xla_device_metadata()) { options.shape_representation_fn = platform_info.xla_device_metadata()->shape_representation_fn(); diff --git a/tensorflow/compiler/jit/xla_platform_info.h b/tensorflow/compiler/jit/xla_platform_info.h index d58b32a996f..bfb438cc398 100644 --- a/tensorflow/compiler/jit/xla_platform_info.h +++ b/tensorflow/compiler/jit/xla_platform_info.h @@ -80,27 +80,31 @@ class XlaPlatformInfo { }; // Returns created XLA compilation cache. -Status BuildXlaCompilationCache(OpKernelContext* ctx, +Status BuildXlaCompilationCache(DeviceBase* dev, const XlaPlatformInfo& platform_info, XlaCompilationCache** cache); // Returns information about the platform from kernel context. -XlaPlatformInfo XlaPlatformInfoFromContext(OpKernelConstruction* ctx); +XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device); // Returns allocator from platform info if non-null, or populate and return a // pointer to the allocator adapter with allocator from context. // // This is necessary because for XLA devices the underlying TF allocator returns // dummy tensors. +// +// `stream` parameter is nullable when running on host. se::DeviceMemoryAllocator* GetAllocator( absl::optional* tf_allocator_adapter, - OpKernelContext* ctx, const XlaPlatformInfo& platform_info); + DeviceBase* device, se::Stream* stream, + const XlaPlatformInfo& platform_info); // Returns created options for the XLA compiler, and writes the used allocator // into `tf_allocator_adapter`. 
XlaCompiler::Options GenerateCompilerOptions( - const XlaCompilationCache& cache, OpKernelContext* ctx, - const XlaPlatformInfo& platform_info, bool has_ref_vars, + const XlaCompilationCache& cache, + const FunctionLibraryRuntime& function_library, DeviceBase* device, + se::Stream* stream, const XlaPlatformInfo& platform_info, bool has_ref_vars, absl::optional* tf_allocator_adapter); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index dc358760534..2da1501819c 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -71,18 +71,6 @@ class XlaTensor { shaped_buffer_ = std::move(shaped_buffer); } - // Some tensors on the device may have known values on the host. We use these - // in on-demand mode to avoid re-copying values from the device if we know the - // host value already. - - // Return true if this XlaTensor contains a host tensor. - bool has_host_tensor() const { return host_tensor_.has_value(); } - // Return the contained host tensor. - // REQUIRES: has_host_tensor() - const Tensor& host_tensor() const { return *host_tensor_; } - // Sets the contained host tensor. - void set_host_tensor(const Tensor& tensor) { host_tensor_.emplace(tensor); } - // Adds synchronization events to 'stream' that wait for this tensor to be // defined on 'stream'. Does nothing if the tensor is already defined on that // stream. diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index fd9953de1e2..18d05bdaace 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -1,6 +1,8 @@ # Description: # TensorFlow/TensorFlow Lite/XLA MLIR dialects and tools. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( @@ -24,11 +26,40 @@ filegroup( srcs = glob(["**/*.td"]), ) +cc_library( + name = "string_container_utils", + hdrs = ["utils/string_container_utils.h"], + deps = [ + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + ], +) + +cc_library( + name = "array_container_utils", + hdrs = ["utils/array_container_utils.h"], + deps = [ + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:Support", + ], +) + +cc_library( + name = "name_utils", + srcs = ["utils/name_utils.cc"], + hdrs = ["utils/name_utils.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "op_or_arg_name_mapper", srcs = ["op_or_arg_name_mapper.cc"], hdrs = ["op_or_arg_name_mapper.h"], deps = [ + ":name_utils", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", @@ -40,17 +71,15 @@ cc_library( srcs = ["tf_mlir_opt_main.cc"], deps = [ ":init_mlir", + "//tensorflow/compiler/mlir/hlo:all_passes", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "//tensorflow/core:lib", - "//tensorflow/core/platform:logging", - "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", - "@llvm-project//mlir:IR", "@llvm-project//mlir:MlirOptLib", - "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", - "@llvm-project//mlir:Support", ], ) @@ -67,14 +96,13 @@ cc_library( # xla-legalize-tf-with-tf2xla pass. 
"//tensorflow/compiler/jit", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/lite:tensorflow_lite_legalize_tf", "//tensorflow/compiler/mlir/lite:tensorflow_lite_optimize", "//tensorflow/compiler/mlir/lite:tensorflow_lite_quantize", "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_to_quant", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_pass", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_test_passes", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", @@ -141,10 +169,9 @@ tf_cc_binary( srcs = ["tf_mlir_translate_main.cc"], deps = [ ":init_mlir", - "//tensorflow/compiler/mlir/lite:tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow:tf_xla_mlir_translate", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow:translate_registration", @@ -157,7 +184,7 @@ tf_cc_binary( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", diff --git a/tensorflow/compiler/mlir/README.md b/tensorflow/compiler/mlir/README.md index cbb0b08503a..c415edceb8c 100644 --- a/tensorflow/compiler/mlir/README.md +++ b/tensorflow/compiler/mlir/README.md @@ -9,3 +9,31 @@ dialects and utilities for 3. TF Lite See [MLIR's website](https://mlir.llvm.org) for complete documentation. + +## Getting started + +Building dialects and utilities here follow the standard approach using +`bazel` as the rest of TensorFlow. + +### Using local LLVM repo + +To develop across MLIR core and TensorFlow, it is useful to override the repo +to use a local version instead of fetching from head. This can be achieved as +below but note, the BUILD files are not automatically generated from or CMake +used, so if your change requires a BUILD file change (or you are using a +different version of LLVM than set in tensorflow/workspace.bzl's LLVM_COMMIT) +then manual BUILD file changes may be required. + +```sh +LLVM_SRC=... + +# Create basic workspace file +echo 'workspace(name = "llvm-project")' > $LLVM_SRC/WORKSPACE +# and copy over the bazel BUILD files. 
+cp third_party/llvm/llvm.autogenerated.BUILD $LLVM_SRC/llvm/BUILD +cp third_party/mlir/BUILD $LLVM_SRC/mlir +cp third_party/mlir/test.BUILD $LLVM_SRC/mlir/test/BUILD + +bazel build --override_repository=llvm-project=$LLVM_SRC \ + -c opt tensorflow/compiler/mlir:tf-opt +``` diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD index 7be39aef9da..1636bbb89ee 100644 --- a/tensorflow/compiler/mlir/hlo/BUILD +++ b/tensorflow/compiler/mlir/hlo/BUILD @@ -1,3 +1,10 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") load("//third_party/mlir:tblgen.bzl", "gentbl") # TODO(b/160617323): Decouple MLIR HLO from TensorFlow/XLA @@ -17,6 +24,7 @@ package_group( "//learning/brain/experimental/mlir/...", "//learning/brain/google/xla/kernels/...", "//learning/brain/google/xla/mlir/...", + "//learning/deepmind/partir/...", "//learning/pathways/data_parallel/tf2xla/...", "//platforms/xla/...", "//tensorflow/compiler/mlir/...", @@ -37,10 +45,13 @@ filegroup( "include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td", + "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td", "include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td", "include/mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td", "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:include/mlir/Interfaces/CopyOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", @@ -57,7 +68,8 @@ filegroup( gentbl( name = "MhloPassIncGen", - strip_include_prefix = "include/mlir-hlo/Dialect/mhlo/transforms/", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "include", tbl_outs = [ ( "-gen-pass-decls -name MHLO", @@ -73,7 +85,8 @@ gentbl( gentbl( name = "LmhloPassIncGen", - strip_include_prefix = "include/mlir-hlo/Dialect/mhlo/transforms/", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "include", tbl_outs = [ ( "-gen-pass-decls -name LMHLO", @@ -89,6 +102,7 @@ gentbl( gentbl( name = "chlo_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "include", tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h.inc"), @@ -104,12 +118,11 @@ gentbl( gentbl( name = "hlo_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "include", tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc"), ("-gen-op-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc"), - ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc"), - ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.cc.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td", @@ -128,6 +141,7 @@ gentbl( gentbl( name = "hlo_ops_base_inc_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "include", tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.h.inc"), @@ -135,11 +149,30 @@ gentbl( ], tblgen = 
"@llvm-project//mlir:mlir-tblgen", td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td", + td_relative_includes = [ + "include", + ], + td_srcs = [":hlo_ops_td_files"], +) + +gentbl( + name = "hlo_ops_base_structs_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h.inc"), + ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td", + td_relative_includes = [ + "include", + ], td_srcs = [":hlo_ops_td_files"], ) gentbl( name = "hlo_ops_pattern_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "lib/Dialect/mhlo/IR/", tbl_outs = [ ( @@ -162,6 +195,7 @@ gentbl( gentbl( name = "lhlo_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "include", tbl_outs = [ ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc"), @@ -177,9 +211,67 @@ gentbl( td_srcs = [":hlo_ops_td_files"], ) +gentbl( + name = "lhlo_gpu_ops_structs_inc_gen", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "include", + tbl_outs = [ + ("-gen-struct-attr-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h.inc"), + ("-gen-struct-attr-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td", + ], +) + +cc_library( + name = "lhlo_gpu_ops_structs", + srcs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc.inc", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h.inc", + "lib/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc", + ], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h", + ], + includes = ["include"], + deps = [ + ":lhlo_gpu_ops_structs_inc_gen", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +gentbl( + name = "lhlo_gpu_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "include", + tbl_outs = [ + ("-gen-op-decls", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h.inc"), + ("-gen-op-defs", "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.cc.inc"), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td", + ], +) + #TODO(aminim): revisit the naming and grouping of these rules post-move. 
gentbl( name = "canonicalize_inc_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "lib/Dialect/mhlo/IR/", tbl_outs = [ ("-gen-rewriters", "lib/Dialect/mhlo/IR/mhlo_canonicalize.inc"), @@ -194,6 +286,7 @@ gentbl( gentbl( name = "infer_fusibility_op_interface_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-interface-decls", @@ -232,6 +325,23 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "hlo_ops_base_structs", + srcs = [ + "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h.inc", + "lib/Dialect/mhlo/IR/hlo_ops_base_structs.cc", + ], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h", + ], + includes = ["include"], + deps = [ + ":hlo_ops_base_structs_inc_gen", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "convert_op_folder", srcs = ["lib/utils/convert_op_folder.cc"], @@ -265,6 +375,7 @@ cc_library( ":chlo_ops_inc_gen", ":convert_op_folder", ":hlo_ops_base_inc_gen", + ":hlo_ops_base_structs", ":hlo_ops_inc_gen", ":infer_fusibility_op_interface", "@llvm-project//llvm:Support", @@ -295,9 +406,11 @@ cc_library( includes = ["include"], deps = [ ":hlo_ops_base_inc_gen", + ":hlo_ops_base_structs", ":lhlo_ops_inc_gen", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:CopyOpInterface", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SideEffects", @@ -311,12 +424,34 @@ cc_library( ) cc_library( - name = "hlo_dialect_force_registration", - srcs = ["lib/Dialect/mhlo/IR/dialect_registration.cc"], + name = "lhlo_gpu", + srcs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.cc.inc", + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h.inc", + "lib/Dialect/mhlo/IR/lhlo_gpu_ops.cc", + ], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h", + ], + includes = ["include"], deps = [ ":hlo", - ":lhlo", + ":hlo_ops_base_structs", + ":infer_fusibility_op_interface", + ":lhlo_gpu_ops_inc_gen", + ":lhlo_gpu_ops_structs", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Analysis", + "@llvm-project//mlir:CopyOpInterface", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:ViewLikeInterface", ], alwayslink = 1, ) @@ -328,24 +463,45 @@ cc_library( deps = [ ":hlo", ":lhlo", + ":lhlo_gpu", "@llvm-project//mlir:IR", ], ) cc_library( name = "sink_constants_to_control_flow", - srcs = ["lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc"], + srcs = [ + "lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc", + ], + hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], + deps = [ + ":hlo", + ":pass_details", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + +cc_library( + name = "mhlo_control_flow_to_scf", + srcs = ["lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc"], hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], deps = [ ":hlo", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", 
"@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], - alwayslink = 1, ) cc_library( @@ -356,6 +512,7 @@ cc_library( ":lhlo", ":map_hlo_to_lhlo_op", "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", "@llvm-project//mlir:StandardOps", ], ) @@ -420,7 +577,10 @@ cc_library( cc_library( name = "legalize_to_linalg", srcs = ["lib/Dialect/mhlo/transforms/legalize_to_linalg.cc"], - hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", + "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", + ], deps = [ ":hlo", ":lhlo", @@ -439,9 +599,13 @@ cc_library( cc_library( name = "transform_unranked_hlo", srcs = ["lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc"], - hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", + "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", + ], deps = [ ":hlo", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", @@ -459,6 +623,7 @@ cc_library( ":lhlo", ":map_lmhlo_to_scalar_op", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", @@ -477,27 +642,15 @@ cc_library( deps = [ ":lhlo", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", - ], - alwayslink = 1, -) - -cc_library( - name = "lhlo_copy_removal", - srcs = ["lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc"], - hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], - deps = [ - ":lhlo", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", + "@llvm-project//mlir:ViewLikeInterface", ], alwayslink = 1, ) @@ -516,6 +669,8 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:ShapeTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", @@ -564,6 +719,7 @@ cc_library( gentbl( name = "legalize_to_standard_inc_gen", + compatible_with = get_compatible_with_cloud(), strip_include_prefix = "lib/Dialect/mhlo/transforms/", tbl_outs = [ ("-gen-rewriters", "lib/Dialect/mhlo/transforms/generated_legalize_to_standard.inc"), @@ -601,8 +757,8 @@ cc_library( hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/passes.h"], deps = [ ":hlo", - ":legalize_tanh_to_approximation", ":legalize_to_standard_inc_gen", + ":legalize_trigonometric_to_approximation", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -632,8 +788,8 @@ cc_library( ) cc_library( - name = "legalize_tanh_to_approximation", - srcs = ["lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc"], + name = "legalize_trigonometric_to_approximation", + srcs = ["lib/Dialect/mhlo/transforms/legalize_trigonometric_to_approximation.cc"], hdrs = [ "include/mlir-hlo/Dialect/mhlo/transforms/passes.h", "include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h", @@ -652,6 +808,7 @@ cc_library( gentbl( name = "lower_complex_inc_gen", + compatible_with = get_compatible_with_cloud(), 
strip_include_prefix = "lib/Dialect/mhlo/transforms/", tbl_outs = [ ("-gen-rewriters", "lib/Dialect/mhlo/transforms/generated_lower_complex.inc"), @@ -682,7 +839,6 @@ cc_library( ], deps = [ ":hlo", - ":hlo_dialect_force_registration", ":lower_complex_inc_gen", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", @@ -733,6 +889,7 @@ cc_library( srcs = ["lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc"], hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h"], deps = [ + ":chlo_legalize_to_hlo_inc_gen", ":hlo", "@llvm-project//mlir:IR", "@llvm-project//mlir:SCFDialect", @@ -742,6 +899,40 @@ cc_library( ], ) +gentbl( + name = "chlo_legalize_to_hlo_inc_gen", + compatible_with = get_compatible_with_cloud(), + strip_include_prefix = "lib/Dialect/mhlo/transforms/", + tbl_outs = [ + ( + "-gen-rewriters", + "lib/Dialect/mhlo/transforms/generated_chlo_legalize_to_hlo.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td", + td_relative_includes = [ + "include", + ], + td_srcs = [ + ":hlo_ops_td_files", + ], +) + +cc_library( + name = "pass_details", + hdrs = [ + "include/mlir-hlo/Dialect/mhlo/transforms/PassDetail.h", + ], + visibility = [ + "//visibility:private", # This target is a private detail of pass implementations + ], + deps = [ + ":MhloPassIncGen", + "@llvm-project//mlir:Pass", + ], +) + cc_library( name = "test_passes", srcs = [ @@ -759,6 +950,7 @@ cc_library( ":lhlo", ":lhlo_legalize_to_llvm", # build-cleaner: keep ":materialize_broadcasts", # build-cleaner: keep + ":pass_details", ":unfuse_batch_norm", # build-cleaner: keep "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", @@ -788,15 +980,15 @@ cc_library( ":hlo_legalize_to_lhlo", ":legalize_control_flow", ":legalize_gather_to_torch_index_select", - ":legalize_tanh_to_approximation", ":legalize_to_linalg", ":legalize_to_standard", + ":legalize_trigonometric_to_approximation", ":lhlo", - ":lhlo_copy_removal", ":lhlo_fuse_linalg", ":lhlo_legalize_to_affine", ":lhlo_legalize_to_gpu", ":lhlo_legalize_to_parallel_loops", + ":mhlo_control_flow_to_scf", ":mhlo_fusion", ":mhlo_to_mhlo_lowering_patterns", ":sink_constants_to_control_flow", @@ -815,6 +1007,7 @@ cc_binary( ":all_passes", ":hlo", ":lhlo", + ":lhlo_gpu", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/hlo/README.md b/tensorflow/compiler/mlir/hlo/README.md index 9eaa14031fd..61517cd9fca 100644 --- a/tensorflow/compiler/mlir/hlo/README.md +++ b/tensorflow/compiler/mlir/hlo/README.md @@ -106,7 +106,7 @@ pipeline using MLIR: * `mhlo`: "meta"-HLO dialect ; similar to `xla_hlo`, but with extensions for dynamic shape support. * `lmhlo`: "late"-"meta"-HLO, it is the IR after buffer allocation is - performed. In XLA the buffer allocation is a side-datastructure which keeps + performed. In XLA the buffer allocation is a side-data structure which keeps track of these informations, while this separate dialect materializes it in the IR. @@ -114,7 +114,7 @@ We describe these in more details below. ### HLO Client Dialect: `chlo`. 
-* It was originaly designed to map the +* It was originally designed to map the [XLA client APIs](https://www.tensorflow.org/xla/operation_semantics) (e.g., ops supports implicit broadcast and roughly modeled on XlaBuilder API) modulo support for dynamic shapes and additional ops required to support diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt index 09bdca84cd3..3fa2b908d9c 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/CMakeLists.txt @@ -25,7 +25,22 @@ function(add_mlir_hlo_dialect dialect dialect_namespace) endfunction() add_mlir_hlo_dialect(chlo_ops chlo) -add_mlir_hlo_dialect(hlo_ops mhlo) add_mlir_hlo_dialect(lhlo_ops lmhlo) +set(LLVM_TARGET_DEFINITIONS hlo_ops.td) +mlir_tablegen(hlo_ops.h.inc -gen-op-decls) +mlir_tablegen(hlo_ops.cc.inc -gen-op-defs) +mlir_tablegen(hlo_ops_base_structs.h.inc -gen-struct-attr-decls) +mlir_tablegen(hlo_ops_base_structs.cc.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRhlo_opsIncGen) + +set(LLVM_TARGET_DEFINITIONS lhlo_gpu_ops.td) +mlir_tablegen(lhlo_gpu_ops.h.inc -gen-op-decls) +mlir_tablegen(lhlo_gpu_ops.cc.inc -gen-op-defs) +set(LLVM_TARGET_DEFINITIONS lhlo_gpu_ops_structs.td) +mlir_tablegen(lhlo_gpu_ops_structs.h.inc -gen-struct-attr-decls) +mlir_tablegen(lhlo_gpu_ops_structs.cc.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRlhlo_gpu_opsIncGen) +add_dependencies(mlir-headers MLIRlhlo_gpu_opsIncGen) + add_mlir_interface(infer_fusibility_op_interface) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h index 9704f34a4d6..05b22770401 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h @@ -44,11 +44,18 @@ class HloClientDialect : public Dialect { static StringRef getDialectNamespace() { return "chlo"; } }; +} // namespace chlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h.inc" +namespace mlir { +namespace chlo { + template -static Value getConstantLike(OpBuilder& b, T constant, Value val) { +static Value getConstantLike(OpBuilder& b, Location loc, T constant, + Value val) { Type ty = getElementTypeOrSelf(val.getType()); auto getAttr = [&]() -> Attribute { @@ -56,8 +63,7 @@ static Value getConstantLike(OpBuilder& b, T constant, Value val) { if (ty.isa()) return b.getFloatAttr(ty, constant); llvm_unreachable("unhandled element type"); }; - // TODO(jpienaar): Add ability to pass loc via native call and update. 
- return b.create(b.getUnknownLoc(), getAttr(), val); + return b.create(loc, getAttr(), val); } } // namespace chlo diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td index 2f3bbefb5ab..13d5f02368b 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.td @@ -37,7 +37,7 @@ include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td" def HLOClient_Dialect : Dialect { let name = "chlo"; - let cppNamespace = "chlo"; + let cppNamespace = "::mlir::chlo"; let summary = [{ Client HLO Ops }]; @@ -79,7 +79,8 @@ class HLOClient_BroadcastBinaryElementwiseOp< string mnemonic, list traits> : HLOClient_Op])> { + DeclareOpInterfaceMethods])> { let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, @@ -99,13 +100,6 @@ class HLOClient_BroadcastBinaryElementwiseOp< $lhs `,` $rhs attr-dict `:` `(` type($lhs) `,` type($rhs) `)` `->` type(results) }]; - - let extraClassDeclaration = [{ - // TODO(laurenzo): It isn't clear to me why reifyReturnShapes does not - // have its declaration generated by DeclareOpInterfaceMethods. - LogicalResult reifyReturnTypeShapes( - OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes); - }]; } def HLOClient_BroadcastAddOp : HLOClient_BroadcastBinaryElementwiseOp<"broadcast_add", @@ -344,14 +338,16 @@ def HLOClient_BroadcastComplexOp : HLOClient_BroadcastBinaryElementwiseOp< //===----------------------------------------------------------------------===// class HLOClient_UnaryElementwiseOp traits, - Type TensorType>: HLOClient_Op { + Type TensorType> : HLOClient_Op { let arguments = (ins TensorType:$operand); - let results = (outs TensorType); + let results = (outs TensorType:$result); + + let assemblyFormat = "$operand attr-dict `:` type($operand)"; } -def HLOClient_AcosOp: HLOClient_UnaryElementwiseOp<"acos", - [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor> { +def HLOClient_AcosOp : HLOClient_UnaryElementwiseOp<"acos", [], + HLO_FpOrComplexTensor> { let summary = "Acos operator"; let description = [{ @@ -364,7 +360,47 @@ def HLOClient_AcosOp: HLOClient_UnaryElementwiseOp<"acos", }]; } -def HLOClient_ConstantLikeOp: HLOClient_Op<"constant_like", +def HLOClient_AtanOp : HLOClient_UnaryElementwiseOp<"atan", [], + HLO_FpOrComplexTensor> { + let summary = "Atan operator"; + + let description = [{ + Returns `Atan(operand)` element-wise. + + $$ + \atan(x) = \atan2(x, 1) + $$ + }]; +} + +def HLOClient_SinhOp : HLOClient_UnaryElementwiseOp<"sinh", [], + HLO_FpOrComplexTensor> { + let summary = "Sinh operation"; + + let description = [{ + Returns `Sinh(operand)` element-wise. + + $$ + \sinh(x) = (e^x - e^-x) / 2 if |x| < 1 + = e^(x + log(1/2)) - e^(-x + log(1/2)) otherwise. + $$ + }]; +} + +def HLOClient_TanOp : HLOClient_UnaryElementwiseOp<"tan", [], + HLO_FpOrComplexTensor> { + let summary = "Tan operation"; + + let description = [{ + Returns `Tan(operand)` element-wise. 
+ + $$ + \tan(x) = \sin(x) / \cos(x) + $$ + }]; +} + +def HLOClient_ConstantLikeOp : HLOClient_Op<"constant_like", [NoSideEffect, SameOperandsAndResultShape, InferTypeOpInterface, DeclareOpInterfaceMethods, diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h index 4286c837a24..b354189c12a 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h @@ -19,7 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_ #include "llvm/ADT/StringRef.h" -#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/DialectImplementation.h" @@ -32,11 +32,14 @@ limitations under the License. #include "mlir/Interfaces/InferTypeOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" +// clang-format off +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +// clang-format on + namespace mlir { class OpBuilder; -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.h.inc" - namespace mhlo { class MhloDialect : public Dialect { @@ -77,10 +80,10 @@ LogicalResult deriveShapeFromFirstOperand( OpBuilder *builder, Operation *op, SmallVectorImpl *reifiedReturnShapes); -#define GET_OP_CLASSES -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" - } // end namespace mhlo } // end namespace mlir +#define GET_OP_CLASSES +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td index d0abbe043ea..3defb65adf8 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td @@ -25,11 +25,6 @@ include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td" include "mlir-hlo/Dialect/mhlo/IR/hlo_utils.td" include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.td" -def HLO_Dialect : Dialect { - let name = "mhlo"; - let cppNamespace = "mhlo"; -} - class HLO_Op traits> : Op { // Whether this operation has a custom conversion to HLO or not. 
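// Illustrative only, not part of the patch: with the assembly format
// "$operand attr-dict `:` type($operand)" declared for the CHLO unary
// elementwise ops, the chlo.atan, chlo.sinh and chlo.tan ops added in
// chlo_ops.td above print as, e.g. (shapes arbitrary):
//   %0 = chlo.atan %x : tensor<2x2xf32>
//   %1 = chlo.sinh %0 : tensor<2x2xf32>
//   %2 = chlo.tan %1 : tensor<2x2xf32>
// Only the operand type is printed; the result type matches it.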
@@ -136,8 +131,8 @@ class HLO_UnaryElementwiseOp traits, } LogicalResult reifyReturnTypeShapes( OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { - return deriveShapeFromFirstOperand(&builder, getOperation(), - &reifiedReturnShapes); + return ::mlir::mhlo::deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); } bool inferInputOutputShapeEquality(int input, int output) { return true; @@ -153,10 +148,13 @@ def HLO_AbsOp: HLO_UnaryElementwiseOp<"abs", [NoSideEffect, SameOperandsAndResultShape], TensorOf<[HLO_SInt, AnyFloat, HLO_Complex]>>, BASE_HLO_AbsOp { let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value operand" + "Value operand" >]; } +def HLO_CbrtOp: HLO_UnaryElementwiseOp<"cbrt", + [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_CbrtOp; + def HLO_CeilOp: HLO_UnaryElementwiseOp<"ceil", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_CeilOp; @@ -165,8 +163,7 @@ def HLO_ConvertOp : HLO_UnaryElementwiseOp< BASE_HLO_ConvertOp { let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value operand, " - "Type result_element_ty" + "Value operand, Type result_element_ty" >]; let hasFolder = 1; @@ -193,12 +190,10 @@ def HLO_Expm1Op: HLO_UnaryElementwiseOp<"exponential_minus_one", def HLO_FloorOp: HLO_UnaryElementwiseOp<"floor", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_FloorOp; -def HLO_ImagOp: HLO_Op< - "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); +def HLO_ImagOp: HLO_UnaryElementwiseOp<"imag", + [NoSideEffect, SameOperandsAndResultShape, + DeclareOpInterfaceMethods], + HLO_ComplexTensor>, BASE_HLO_ImagOp { let results = (outs HLO_FpTensor); let hasFolder = 1; } @@ -224,22 +219,23 @@ def HLO_LogisticOp: HLO_UnaryElementwiseOp<"logistic", def HLO_NotOp: HLO_UnaryElementwiseOp<"not", [NoSideEffect, SameOperandsAndResultType], HLO_PredOrIntTensor>, - BASE_HLO_NotOp; + BASE_HLO_NotOp { +} def HLO_NegOp: HLO_UnaryElementwiseOp<"negate", [NoSideEffect, SameOperandsAndResultType], HLO_IntFpOrComplexTensor>, - BASE_HLO_NegOp; + BASE_HLO_NegOp { + let hasFolder = 1; +} def HLO_PopulationCountOp: HLO_UnaryElementwiseOp<"popcnt", [NoSideEffect, SameOperandsAndResultType], HLO_IntTensor>, BASE_HLO_PopulationCountOp; -def HLO_RealOp: HLO_Op< - "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); +def HLO_RealOp: HLO_UnaryElementwiseOp<"real", + [NoSideEffect, SameOperandsAndResultShape, + DeclareOpInterfaceMethods], + HLO_ComplexTensor>, BASE_HLO_RealOp { let results = (outs HLO_FpTensor); let hasFolder = 1; } @@ -262,7 +258,9 @@ def HLO_SinOp: HLO_UnaryElementwiseOp<"sine", def HLO_SqrtOp: HLO_UnaryElementwiseOp<"sqrt", [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, - BASE_HLO_SqrtOp; + BASE_HLO_SqrtOp { + let hasFolder = 1; +} def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", [NoSideEffect, SameOperandsAndResultType], @@ -289,8 +287,8 @@ class HLO_BinaryElementwiseOp traits> : } LogicalResult reifyReturnTypeShapes( OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { - return deriveShapeFromFirstOperand(&builder, getOperation(), - &reifiedReturnShapes); + return ::mlir::mhlo::deriveShapeFromFirstOperand(&builder, getOperation(), + 
&reifiedReturnShapes); } bool inferInputsShapeEquality(int lhs, int rhs) { return true; @@ -316,12 +314,10 @@ def HLO_AddOp : HLO_BinaryElementwiseOp<"add", def HLO_Atan2Op : HLO_BinaryElementwiseOp<"atan2", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_Atan2Op; -def HLO_ComplexOp: HLO_Op<"complex", - [NoSideEffect, SameOperandsAndResultShape]>, +def HLO_ComplexOp: HLO_BinaryElementwiseOp<"complex", + [NoSideEffect, SameOperandsAndResultShape, + DeclareOpInterfaceMethods]>, BASE_HLO_ComplexOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value lhs, Value rhs">]; - let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); let results = (outs HLO_ComplexTensor); let hasFolder = 1; @@ -351,7 +347,9 @@ def HLO_PowOp : HLO_BinaryElementwiseOp<"power", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_PowOp; def HLO_RemOp : HLO_BinaryElementwiseOp<"remainder", - [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_RemOp; + [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_RemOp { + let hasFolder = 1; +} def HLO_ShiftLeftOp : HLO_BinaryElementwiseOp<"shift_left", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_ShiftLeftOp; @@ -379,6 +377,8 @@ class HLO_BinaryLogicalElementwiseOp : HLO_PredOrIntTensor:$lhs, HLO_PredOrIntTensor:$rhs ); + + let hasFolder = 1; } def HLO_AndOp: HLO_BinaryLogicalElementwiseOp<"and">, BASE_HLO_AndOp; @@ -452,7 +452,7 @@ def HLO_SendOp : HLO_Op<"send", []> { let arguments = (ins HLO_TensorOrTuple:$operand, HLO_Token:$token, - ChannelHandle:$channel_id, + ChannelHandle:$channel_id, DefaultValuedAttr:$is_host_transfer ); @@ -477,7 +477,7 @@ def HLO_RecvOp : HLO_Op<"recv", []> { let arguments = (ins HLO_Token:$token, - ChannelHandle:$channel_id, + ChannelHandle:$channel_id, DefaultValuedAttr:$is_host_transfer ); @@ -491,9 +491,7 @@ def HLO_RecvOp : HLO_Op<"recv", []> { def HLO_ReplicaIdOp : HLO_Op<"replica_id", [NoSideEffect]>, BASE_HLO_ReplicaIdOp { - // TODO(prakalps): The output should unsigned 32-bit integer but mlir does - // not differentiate between signed and unsigned int. - let results = (outs I32Tensor); + let results = (outs TensorOf<[UI32]>); } //===----------------------------------------------------------------------===// @@ -583,7 +581,7 @@ def HLO_AllReduceOp : HLO_Op<"all_reduce", let arguments = (ins HLO_Tensor:$operand, I64ElementsAttr:$replica_groups, - OptionalAttr>:$channel_id + OptionalAttr:$channel_id ); let regions = (region SizedRegion<1>:$computation); let results = (outs HLO_Tensor); @@ -673,9 +671,10 @@ def HLO_TupleOp : HLO_Op<"tuple", [NoSideEffect]>, BASE_HLO_TupleOp { let hasCanonicalizer = 1; } -def HLO_CompareOp: HLO_Op<"compare", - [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]>, - BASE_HLO_CompareOp { +def HLO_CompareOp: HLO_Op<"compare", [NoSideEffect, SameTypeOperands, + SameOperandsAndResultShape, + DeclareOpInterfaceMethods]>, BASE_HLO_CompareOp { let arguments = (ins HLO_Tensor:$lhs, HLO_Tensor:$rhs, @@ -683,6 +682,8 @@ def HLO_CompareOp: HLO_Op<"compare", ); let results = (outs HLO_PredTensor); + let hasFolder = 1; + let builders = [OpBuilder< "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " "StringAttr comparison_direction" @@ -905,39 +906,12 @@ def HLO_CollectivePermuteOp: HLO_Op<"collective_permute", let results = (outs HLO_Tensor); } -// TODO(hinsu): Make this struct dialect independent so that it can be shared -// between HLO and LHLO dialect. 
-def ConvDimensionNumbers : StructAttr<"ConvDimensionNumbers", HLO_Dialect, [ - StructFieldAttr<"input_batch_dimension",I64Attr>, - StructFieldAttr<"input_feature_dimension", I64Attr>, - StructFieldAttr<"input_spatial_dimensions", I64ElementsAttr>, - StructFieldAttr<"kernel_input_feature_dimension", I64Attr>, - StructFieldAttr<"kernel_output_feature_dimension", I64Attr>, - StructFieldAttr<"kernel_spatial_dimensions", I64ElementsAttr>, - StructFieldAttr<"output_batch_dimension", I64Attr>, - StructFieldAttr<"output_feature_dimension", I64Attr>, - StructFieldAttr<"output_spatial_dimensions", I64ElementsAttr>] > { - - let description = "Structure of dimension information for conv op"; -} - def HLO_ConvOp : HLO_Op<"convolution", [NoSideEffect]>, BASE_HLO_ConvOp { - let arguments = (ins - HLO_Tensor:$lhs, - HLO_Tensor:$rhs, - // Default value: one for each of the spatial dimension. - OptionalAttr:$window_strides, - // Default value: zero for each of the spatial dimension. - OptionalAttr:$padding, - // Default value: one for each of the spatial dimension. - OptionalAttr:$lhs_dilation, - // Default value: one for each of the spatial dimension. - OptionalAttr:$rhs_dilation, - ConvDimensionNumbers:$dimension_numbers, - I64Attr:$feature_group_count, - I64Attr:$batch_group_count, - HLO_PrecisionConfigAttr:$precision_config - ); + let arguments = !con( + (ins + HLO_Tensor:$lhs, + HLO_Tensor:$rhs), + ConvolutionAttributes.attributes); let results = (outs HLO_Tensor); } @@ -979,15 +953,6 @@ def HLO_DotOp: HLO_Op<"dot", [NoSideEffect]>, BASE_HLO_DotOp { let results = (outs HLO_Tensor); } -def DotDimensionNumbers : StructAttr<"DotDimensionNumbers", HLO_Dialect, [ - StructFieldAttr<"lhs_batching_dimensions", I64ElementsAttr>, - StructFieldAttr<"rhs_batching_dimensions", I64ElementsAttr>, - StructFieldAttr<"lhs_contracting_dimensions", I64ElementsAttr>, - StructFieldAttr<"rhs_contracting_dimensions", I64ElementsAttr> - ]> { - let description = "Structure of dimension information for dot product"; -} - def HLO_DotGeneralOp: HLO_Op<"dot_general", [NoSideEffect]>, BASE_HLO_DotGeneralOp { let arguments = (ins HLO_Tensor:$lhs, @@ -1049,14 +1014,6 @@ def HLO_FftOp: HLO_Op<"fft", [NoSideEffect]>, BASE_HLO_FftOp { let results = (outs HLO_Tensor); } -def GatherDimensionNumbers : StructAttr<"GatherDimensionNumbers", HLO_Dialect, - [StructFieldAttr<"offset_dims", I64ElementsAttr>, - StructFieldAttr<"collapsed_slice_dims", I64ElementsAttr>, - StructFieldAttr<"start_index_map", I64ElementsAttr>, - StructFieldAttr<"index_vector_dim", I64Attr>]> { - let description = "Structure of dimension information for gather"; -} - def HLO_GatherOp: HLO_Op<"gather", [NoSideEffect]>, BASE_HLO_GatherOp { let arguments = (ins HLO_Tensor:$operand, @@ -1067,6 +1024,8 @@ def HLO_GatherOp: HLO_Op<"gather", [NoSideEffect]>, BASE_HLO_GatherOp { ); let results = (outs HLO_Tensor); + + let hasCanonicalizer = 1; } def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, @@ -1079,6 +1038,8 @@ def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, // XLA semantics is available. This limitation is because of the current XLA // implementation. 
let results = (outs I32Tensor); + + let hasFolder = 1; } def HLO_MapOp: HLO_Op<"map", @@ -1130,7 +1091,7 @@ def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, HLO_Tensor:$operand, HLO_Tensor:$scatter_indices, HLO_Tensor:$updates, - ScatterDimensionNumbers:$scatter_dimension_numbers, + ScatterDimensionNumbers:$scatter_dimension_numbers, DefaultValuedAttr:$indices_are_sorted, DefaultValuedAttr:$unique_indices ); @@ -1140,10 +1101,15 @@ def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, let results = (outs HLO_Tensor); let hasCustomHLOConverter = 1; + + let hasFolder = 1; } // TODO(jpienaar): Add broadcastable trait. -def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods]>, BASE_HLO_SelectOp { +def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, + DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, + ]>, BASE_HLO_SelectOp { let arguments = (ins HLO_PredTensor:$pred, HLO_Tensor:$on_true, @@ -1151,6 +1117,8 @@ def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods, let results = (outs HLO_Tensor); } -def HLO_SortOp : HLO_Op<"sort", [RecursiveSideEffects]>, BASE_HLO_SortOp { +def HLO_SortOp : HLO_Op<"sort", [RecursiveSideEffects, SameOperandsAndResultShape]>, BASE_HLO_SortOp { let arguments = (ins Variadic:$operands, DefaultValuedAttr:$dimension, DefaultValuedAttr:$is_stable ); - let results = (outs HLO_TensorOrTuple); + let results = (outs Variadic); let regions = (region SizedRegion<1>:$comparator); @@ -1412,4 +1380,21 @@ def HLO_FusionOp : HLO_Op<"fusion", []> { let hasCustomHLOConverter = 1; } +// This is an op for purposes internal to XLA/GPU. +def HLO_BitcastOp : HLO_Op<"bitcast", [NoSideEffect]>, BASE_HLO_BitcastOp { + let arguments = (ins HLO_Tensor:$operand); + let results = (outs HLO_Tensor); + let hasCustomHLOConverter = 1; +} + +def HLO_ReducePrecisionOp: HLO_Op<"reduce_precision", [SameOperandsAndResultShape]>, + BASE_HLO_ReducePrecisionOp { + let arguments = (ins + HLO_FpTensor:$operand, + I32Attr:$exponent_bits, + I32Attr:$mantissa_bits + ); + let results = (outs HLO_FpTensor:$output); +} + #endif // HLO_OPS diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td index 2f80545ad19..da8c921a47b 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td @@ -18,6 +18,13 @@ limitations under the License. include "mlir/IR/OpBase.td" +def HLO_Dialect : Dialect { + let name = "mhlo"; + let cppNamespace = "::mlir::mhlo"; +} + +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td" + def HLO_Pred : TypeAlias; // TODO(hinsu): Use signed integers instead of signless integer which is being @@ -45,9 +52,7 @@ def HLO_Token : Type()">, "token">; def HLO_IntTensor : TensorOf<[HLO_Int]>; // Any integer tensor type with rank 0 (i.e. representing a single integer). -def HLO_ScalarIntTensor : ShapedContainerType< - [HLO_Int], And<[IsTensorTypePred, HasAnyRankOfPred<[0]>]>, - "a 0-dim integer tensor">; +def HLO_ScalarIntTensor : 0DTensorOf<[HLO_Int]>; // Any floating-point tensor types def HLO_FpTensor : TensorOf<[AnyFloat]>; @@ -67,10 +72,7 @@ def HLO_TensorOrTokenOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Token, HLO_Tuple]>; def HLO_DimensionValue : AnyTypeOf<[Index, HLO_Pred, HLO_Int]>; // Dynamic representation of a shape vector as a tensor. 
-def HLO_DimensionTensor : ShapedContainerType< - [HLO_DimensionValue], - And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, - "a 1D tensor of dimensions">; +def HLO_DimensionTensor : 1DTensorOf<[HLO_DimensionValue]>; // In general, static shaped tensor constraints should be avoided unless // it is for a legacy op which is only correct with static shapes. @@ -132,6 +134,17 @@ class BASE_HLO_AbsOp { }]; } +class BASE_HLO_CbrtOp { + string summary = "Cubic root operator"; + + string description = [{ + Returns element-wise cubic root of the operand. + + See + https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions. + }]; +} + class BASE_HLO_CeilOp { string summary = "Ceil operator"; @@ -608,15 +621,6 @@ class BASE_HLO_CaseOp { // XLA parallelism related op definitions. //===----------------------------------------------------------------------===// -// Represents a unique identifier for each Send/Recv instruction pair or -// optionally for collective instructions (AllReduce, CollectivePermute, -// AllToAll). Non-positive channel_id handle is equivalent to no channel id. -class ChannelHandle : StructAttr<"ChannelHandle", dialect, [ - StructFieldAttr<"handle", I64Attr>, - StructFieldAttr<"type", I64Attr>]> { - let description = "two 64-bit integers 'handle' and 'type'"; -} - class BASE_HLO_ReplicaIdOp { string summary = "ReplicaId operator"; @@ -706,6 +710,7 @@ def HLO_PrecisionConfigAttr: OptionalAttr< TypedArrayAttrBase>; + //===----------------------------------------------------------------------===// // Fast Fourier Transform Type enum definitions. //===----------------------------------------------------------------------===// @@ -1001,6 +1006,27 @@ class BASE_HLO_ConcatenateOp { }]; } +//===----------------------------------------------------------------------===// +// Common convolution attributes +//===----------------------------------------------------------------------===// + +class ConvolutionAttributes { + dag attributes = (ins + // Default value: one for each of the spatial dimension. + OptionalAttr:$window_strides, + // Default value: zero for each of the spatial dimension. + OptionalAttr:$padding, + // Default value: one for each of the spatial dimension. + OptionalAttr:$lhs_dilation, + // Default value: one for each of the spatial dimension. + OptionalAttr:$rhs_dilation, + ConvDimensionNumbers:$dimension_numbers, + I64Attr:$feature_group_count, + I64Attr:$batch_group_count, + HLO_PrecisionConfigAttr:$precision_config + ); +} + class BASE_HLO_ConvOp { string summary = "Convolution operator"; @@ -1122,15 +1148,6 @@ class BASE_HLO_ReshapeOp { }]; } -class ScatterDimensionNumbers : StructAttr< - "ScatterDimensionNumbers", dialect, [ - StructFieldAttr<"update_window_dims", I64ElementsAttr>, - StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, - StructFieldAttr<"scatter_dims_to_operand_dims", I64ElementsAttr>, - StructFieldAttr<"index_vector_dim", I64Attr>]> { - let description = "Structure of dimension information for scatter"; -} - class BASE_HLO_ScatterOp { string summary = "Scatter operator"; @@ -1341,4 +1358,17 @@ class BASE_HLO_WhileOp { }]; } +class BASE_HLO_BitcastOp { + string summary = "Bitcast operator"; + + string description = [{ + This op changes the shape of the input in the way that the physical + arranggment of elements are unchanged. + + However, the op needs layout information to make sense of "physical + arrangement of elements". Layout support in MHLO is currently under + exploration. 
+ }]; +} + #endif // HLO_OPS_BASE diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h new file mode 100644 index 00000000000..3b78ff8a367 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines structures used in MHLO and LMHLO. + +#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_BASE_STRUCTS_H_ +#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_BASE_STRUCTS_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Identifier.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" + +// Order matters, this .inc header is not self-contained, and relies on the +// #includes above. +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_HLO_OPS_BASE_STRUCTS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td new file mode 100644 index 00000000000..d25eb5104c6 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.td @@ -0,0 +1,73 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef HLO_OPS_BASE_STRUCTS +#define HLO_OPS_BASE_STRUCTS + +//===----------------------------------------------------------------------===// +// Dot dimensions enum definitions. 
+//===----------------------------------------------------------------------===// + +def DotDimensionNumbers : StructAttr<"DotDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"lhs_batching_dimensions", I64ElementsAttr>, + StructFieldAttr<"rhs_batching_dimensions", I64ElementsAttr>, + StructFieldAttr<"lhs_contracting_dimensions", I64ElementsAttr>, + StructFieldAttr<"rhs_contracting_dimensions", I64ElementsAttr> + ]> { + let description = "Structure of dimension information for dot product"; +} + +def ScatterDimensionNumbers : StructAttr< + "ScatterDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"update_window_dims", I64ElementsAttr>, + StructFieldAttr<"inserted_window_dims", I64ElementsAttr>, + StructFieldAttr<"scatter_dims_to_operand_dims", I64ElementsAttr>, + StructFieldAttr<"index_vector_dim", I64Attr>]> { + let description = "Structure of dimension information for scatter"; +} + +def ConvDimensionNumbers : StructAttr<"ConvDimensionNumbers", HLO_Dialect, [ + StructFieldAttr<"input_batch_dimension",I64Attr>, + StructFieldAttr<"input_feature_dimension", I64Attr>, + StructFieldAttr<"input_spatial_dimensions", I64ElementsAttr>, + StructFieldAttr<"kernel_input_feature_dimension", I64Attr>, + StructFieldAttr<"kernel_output_feature_dimension", I64Attr>, + StructFieldAttr<"kernel_spatial_dimensions", I64ElementsAttr>, + StructFieldAttr<"output_batch_dimension", I64Attr>, + StructFieldAttr<"output_feature_dimension", I64Attr>, + StructFieldAttr<"output_spatial_dimensions", I64ElementsAttr>] > { + + let description = "Structure of dimension information for conv op"; +} + +def GatherDimensionNumbers : StructAttr<"GatherDimensionNumbers", HLO_Dialect, + [StructFieldAttr<"offset_dims", I64ElementsAttr>, + StructFieldAttr<"collapsed_slice_dims", I64ElementsAttr>, + StructFieldAttr<"start_index_map", I64ElementsAttr>, + StructFieldAttr<"index_vector_dim", I64Attr>]> { + let description = "Structure of dimension information for gather"; +} + + +// Represents a unique identifier for each Send/Recv instruction pair or +// optionally for collective instructions (AllReduce, CollectivePermute, +// AllToAll). Non-positive channel_id handle is equivalent to no channel id. +def ChannelHandle : StructAttr<"ChannelHandle", HLO_Dialect, [ + StructFieldAttr<"handle", I64Attr>, + StructFieldAttr<"type", I64Attr>]> { + let description = "two 64-bit integers 'handle' and 'type'"; +} + +#endif // HLO_OPS_BASE_STRUCTS diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td index c201aeff8ec..32940cbc623 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_utils.td @@ -28,7 +28,7 @@ class ConstantSplat : NativeCodeCall< "hlo::getSplat(&$_builder, $0, " # value # ")">; class HLO_ConstantLike : NativeCodeCall< - "chlo::getConstantLike($_builder, " # value # ", $0)">; + "chlo::getConstantLike($_builder, $_loc, " # value # ", $0)">; def NullDenseIntElementsAttr : NativeCodeCall<"DenseIntElementsAttr()">; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h new file mode 100644 index 00000000000..effa9ecc83b --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the LHLO dialect. + +#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h" +#include "mlir-hlo/Dialect/mhlo/IR/infer_fusibility_op_interface.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" +#include "mlir/Interfaces/CopyOpInterface.h" +#include "mlir/Interfaces/InferTypeOpInterface.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Interfaces/ViewLikeInterface.h" + +namespace mlir { +class OpBuilder; +} // namespace mlir + + +namespace mlir { +namespace lmhlo_gpu { + +class LmhloGpuDialect : public Dialect { + public: + explicit LmhloGpuDialect(MLIRContext *context); + static StringRef getDialectNamespace() { return "lmhlo_gpu"; } +}; + +} // namespace lmhlo_gpu +} // end namespace mlir + +#define GET_OP_CLASSES +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td new file mode 100644 index 00000000000..b3708bf4ff1 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.td @@ -0,0 +1,210 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the operation definition file for LHMLO level GPU operations. +// Because these are LMHLO level operations, they operate on memrefs. 
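// Illustrative only: the ops defined below declare no custom assembly format,
// so they print in the generic op form. A gemm over pre-allocated buffers
// might look roughly like this (buffer shapes and attribute values are
// hypothetical):
//   "lmhlo_gpu.gemm"(%lhs, %rhs, %output) {
//     dot_dimension_numbers = {
//       lhs_batching_dimensions = dense<> : tensor<0xi64>,
//       rhs_batching_dimensions = dense<> : tensor<0xi64>,
//       lhs_contracting_dimensions = dense<[1]> : tensor<1xi64>,
//       rhs_contracting_dimensions = dense<[0]> : tensor<1xi64>},
//     alpha = 1.0 : f64, batch_size = 1 : i64, algorithm = 0 : i64
//   } : (memref<16x16xf32>, memref<16x16xf32>, memref<16x16xf32>) -> ()
// The op writes into %output in place instead of returning a tensor result.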
+ +#ifndef LHLO_GPU_OPS +#define LHLO_GPU_OPS + +include "mlir/IR/OpBase.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td" +include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td" +include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td" + + +class LHLOGPU_Op traits = []> : + Op], traits)>; + +// Type for scratch buffers used by GPU library calls (memref) +def UntypedBuffer : MemRefRankOf<[I8], [1]>; + +// Cholesky info output buffer type. +def I32Buffer : MemRefOf<[I32]>; + +//===----------------------------------------------------------------------===// +// LMHLO ops representing batch norm library functions. +//===----------------------------------------------------------------------===// + +// Note: these are semantically different from similar LHLO as the GPU library +// calls generate or consume standard deviation, whereas LHLO ops generate or +// consume variance (= std-dev ^ 2). + +def LHLOGPU_BatchNormGradOp : LHLOGPU_Op<"batch_norm_grad">, + BASE_HLO_BatchNormGradOp { + let arguments = (ins + Arg:$operand, + Arg:$scale, + Arg:$mean, + Arg:$stddev, + Arg:$grad_output, + Arg:$grad_operand, // gradient of $operand. + Arg:$grad_scale, + Arg:$grad_offset, + F32Attr:$epsilon, + I64Attr:$feature_index + ); +} + +def LHLOGPU_BatchNormInferenceOp : LHLOGPU_Op<"batch_norm_inference">, + BASE_HLO_BatchNormInferenceOp { + let arguments = (ins + Arg:$operand, + Arg:$scale, + Arg:$offset, + Arg:$mean, + Arg:$stddev, + Arg:$output, + F32Attr:$epsilon, + I64Attr:$feature_index); +} + +def LHLOGPU_BatchNormTrainingOp : LHLOGPU_Op<"batch_norm_training">, + BASE_HLO_BatchNormTrainingOp { + + let arguments = (ins + Arg:$operand, + Arg:$scale, + Arg:$offset, + Arg:$output, + Arg:$batch_mean, + Arg:$batch_stddev, + F32Attr:$epsilon, + I64Attr:$feature_index + ); +} + +//===----------------------------------------------------------------------===// +// LMHLO ops representing convolution library functions. 
+//===----------------------------------------------------------------------===// + +def ActivationModeNone : StrEnumAttrCase<"None">; +def ActivationModeSigmoid : StrEnumAttrCase<"Sigmoid">; +def ActivationModeTanh : StrEnumAttrCase<"Relu">; +def ActivationModeRelu : StrEnumAttrCase<"Relu">; +def ActivationModeRelu6 : StrEnumAttrCase<"Relu6">; +def ActivationModeReluX : StrEnumAttrCase<"ReluX">; +def ActivationModeBandPass : StrEnumAttrCase<"BandPass">; + +def ActivationAttr : StrEnumAttr<"Activation", + "Activation applied with fused convolution", + [ActivationModeNone, ActivationModeSigmoid, ActivationModeTanh, + ActivationModeRelu, ActivationModeRelu6, ActivationModeReluX, + ActivationModeBandPass]>; + +def GpuConvolutionAttributes { + dag attributes = !con( + ConvolutionAttributes.attributes, + (ins F64Attr:$result_scale), + (ins ConvolutionBackendConfigAttr:$backend_config)); +} + +def GpuFusedConvolutionAttributes { + dag attributes = !con( + ConvolutionAttributes.attributes, + (ins F64Attr:$result_scale, + ActivationAttr:$activation_mode, + F64Attr:$side_input_scale), + (ins ConvolutionBackendConfigAttr:$backend_config)); +} + +def LHLOGPU_ConvForwardOp : LHLOGPU_Op<"conv_forward"> { + let arguments = !con( + (ins + Arg:$input, + Arg:$filter, + Arg:$output, + Arg:$scratch), + GpuConvolutionAttributes.attributes); +} + +def LHLOGPU_ConvBackwardInputOp : LHLOGPU_Op<"conv_backwardinput"> { + let arguments = !con( + (ins + Arg:$d_output, + Arg:$filter, + Arg:$d_input, + Arg:$scratch), + GpuConvolutionAttributes.attributes); +} + +def LHLOGPU_ConvBackwardFilterOp : LHLOGPU_Op<"conv_backwardfilter"> { + let arguments = !con( + (ins + Arg:$input, + Arg:$d_output, + Arg:$d_filter, + Arg:$scratch), + GpuConvolutionAttributes.attributes); +} + +// output = activation(result_scale * conv(input, filter) + +// side_input * side_input_scale + +// bias) +def LHLOGPU_ConvForwardFusedOp : LHLOGPU_Op<"conv_forward_fused"> { + let arguments = !con( + (ins + Arg:$input, + Arg:$filter, + Arg:$bias, + Arg:$side_input, + Arg:$output, + Arg:$scratch), + GpuFusedConvolutionAttributes.attributes); +} + +//===----------------------------------------------------------------------===// +// LMHLO ops representing other library functions. +//===----------------------------------------------------------------------===// + +// output = alpha * (lhs * rhs) +// Verify: beta = 0.0 +def LHLOGPU_GEMMOp : LHLOGPU_Op<"gemm"> { + let arguments = (ins + Arg:$lhs, + Arg:$rhs, + Arg:$output, + DotDimensionNumbers:$dot_dimension_numbers, + F64Attr:$alpha, + I64Attr:$batch_size, + I64Attr:$algorithm); +} + +// output = alpha(lhs * rhs) + beta * bias +def LHLOGPU_GEMM_BiasOp : LHLOGPU_Op<"gemm_bias"> { + let arguments = (ins + Arg:$lhs, + Arg:$rhs, + Arg:$bias, + Arg:$output, + DotDimensionNumbers:$dot_dimension_numbers, + F64Attr:$alpha, + F64Attr:$beta, + I64Attr:$batch_size, + I64Attr:$algorithm); +} + +def LHLOGPU_CholeskyOp : LHLOGPU_Op<"cholesky"> { + let arguments = (ins + Arg:$input, + Arg:$output, + Arg:$scratch, + Arg:$info, + BoolAttr:$is_upper); +} + +#endif // LHLO_GPU_OPS diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td new file mode 100644 index 00000000000..820e4ce64b0 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// We define the dialect here so that both structs and ops can refer to it. + +#ifndef LHLO_GPU_OPS_BASE +#define LHLO_GPU_OPS_BASE + +include "mlir/IR/OpBase.td" + +def LHLO_GPU_Dialect : Dialect { + let name = "lmhlo_gpu"; + let cppNamespace = "::mlir::lmhlo_gpu"; +} + +#endif // LHLO_GPU_OPS_BASE diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h new file mode 100644 index 00000000000..ff642b82c22 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ==============================================================================*/ + +// This file defines structures used in the LMHLO_GPU dialect. + +#ifndef THIRD_PARTY_TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_STRUCTS_H_ +#define THIRD_PARTY_TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_STRUCTS_H_ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Identifier.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Types.h" + +// Order matters, this .inc header is not self-contained, and relies on the +// #includes above. +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h.inc" + +#endif // THIRD_PARTY_TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_GPU_OPS_STRUCTS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td new file mode 100644 index 00000000000..2236fc38e29 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.td @@ -0,0 +1,29 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef LHLO_GPU_OPS_STRUCTS +#define LHLO_GPU_OPS_STRUCTS + +include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_base.td" + +def ConvolutionBackendConfigAttr : StructAttr<"ConvolutionBackendConfig", + LHLO_GPU_Dialect, [ + StructFieldAttr<"algorithm", I64Attr>, + StructFieldAttr<"tensor_ops_enabled", BoolAttr>]> { + let description = "GPU Convolution backend configuration"; +} + +#endif // LHLO_GPU_OPS_STRUCTS diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h index bb9b29096f3..9dc6d7aa0c0 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h @@ -13,12 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This file defines the operations used in the LXLA dialect. +// This file defines the operations used in the LHLO dialect. #ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_ #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_ #include "llvm/ADT/StringRef.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/Location.h" @@ -27,14 +28,12 @@ limitations under the License. #include "mlir/IR/Operation.h" #include "mlir/IR/StandardTypes.h" #include "mlir/IR/Types.h" +#include "mlir/Interfaces/CopyOpInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #include "mlir/Interfaces/ViewLikeInterface.h" namespace mlir { class OpBuilder; - -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.h.inc" - namespace lmhlo { class LmhloDialect : public Dialect { @@ -43,10 +42,10 @@ class LmhloDialect : public Dialect { static StringRef getDialectNamespace() { return "lmhlo"; } }; -#define GET_OP_CLASSES -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" - } // namespace lmhlo } // end namespace mlir +#define GET_OP_CLASSES +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_IR_LHLO_OPS_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td index 750cce65b62..25d5e50af7d 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.td @@ -16,58 +16,34 @@ limitations under the License. // This is the operation definition file for LMHLO, the "late" MHLO variant of // the dialect, which operates on buffers instead of tensors. // -// This file largely overlaps with mhlo_ops.td at a logic level. It's tempting to -// merge these two files together, but we need to consider the following +// This file largely overlaps with hlo_ops.td at a logical level. It's tempting +// to merge these two files together, but we need to consider the following // obstacles: // * We need to have a common representation for arguments. That is to say, -// HLO_Array translates to HLO_Tensor in HLO dialect, and -// Arg, "", [Mem(Read|Write)]> in LHLO. 
Array types within tuples -// also need to be transformed. +// HLO_Array translates to HLO_Tensor in HLO dialect, and +// Arg, "", [Mem(Read|Write)]> in LHLO. Array types within +// tuples also need to be transformed. // * As of now, TableGen's dag functions are not sufficient to accomplish the -// one above. -// * Traits aren't identical, but need to be coped. For example, -// SameOperandAndResultType in HLO corresponds to SameTypeOperands in LHLO. +// one above. +// * Traits aren't identical, but need to be copied. For example, +// SameOperandAndResultType in HLO corresponds to SameTypeOperands in LHLO. // * Also, currently HLO describes the API in XLA's client side, not service -// side. LHLO aims for the service side. +// side. LHLO aims for the service side. #ifndef LHLO_OPS #define LHLO_OPS include "mlir/IR/OpBase.td" +include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" -include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td" +include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td" def LHLO_Dialect : Dialect { let name = "lmhlo"; - let cppNamespace = "lmhlo"; + let cppNamespace = "::mlir::lmhlo"; } -//===----------------------------------------------------------------------===// -// LMHLO type definitions. -//===----------------------------------------------------------------------===// - -// Any integer tensor types -def LHLO_IntBuffer : MemRefOf<[HLO_Int]>; - -// Any floating-point tensor types -def LHLO_FpBuffer : MemRefOf<[AnyFloat]>; - -def LHLO_ComplexBuffer : MemRefOf<[AnyComplex]>; - -def LHLO_FpOrComplexBuffer : MemRefOf<[AnyFloat, AnyComplex]>; - -def LHLO_PredBuffer : MemRefOf<[HLO_Pred]>; - -// Any integer or floating-point tensor types -def LHLO_IntOrFpBuffer : MemRefOf<[HLO_Int, AnyFloat]>; - -def LHLO_PredOrIntBuffer : MemRefOf<[HLO_Int, HLO_Pred]>; - -def LHLO_Buffer : MemRefOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; - -def LHLO_ExtentBuffer : MemRefRankOf<[AnySignlessInteger, Index], [1]>; - //===----------------------------------------------------------------------===// // LMHLO nullary op definitions. //===----------------------------------------------------------------------===// @@ -288,6 +264,16 @@ def LHLO_WhileOp: LHLO_Op<"while", [SameVariadicOperandSize]>, let regions = (region SizedRegion<1>:$cond, SizedRegion<1>:$body); } +def LHLO_CustomCallOp : LHLO_Op<"custom_call", []>, BASE_HLO_CustomCallOp { + let arguments = (ins + Arg, "", [MemRead]>:$args, + Arg:$output, + StrAttr:$call_target_name, + DefaultValuedAttr:$has_side_effect, + DefaultValuedAttr:$backend_config + ); +} + //===----------------------------------------------------------------------===// // LMHLO tuple op definitions. //===----------------------------------------------------------------------===// @@ -334,10 +320,11 @@ def HLO_DynamicUpdateSliceOp: LHLO_Op<"dynamic-update-slice", []> { def HLO_StaticMemRefCastOp: Op]> { let summary = [{ - "modifies the offset, sizes and strides of a statically shaped memref. + modifies the offset, sizes and strides of a statically shaped memref }]; let description = [{ - Allows to modify the offset, sizes and strides of a statically shaped memref. + Casts the statically shaped memref operand to a memref with optionally + modified offsets, sizes and strides. 
Example: ```mlir @@ -353,12 +340,11 @@ def HLO_StaticMemRefCastOp: Op:$operand); let results = (outs Res:$result); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, MemRefType resultType, " # - "Value operand", [{ - result.addOperands(operand); - result.types.push_back(resultType); - }]>]; + let builders = [OpBuilder<"MemRefType resultType, Value operand", + [{ + $_state.addOperands(operand); + $_state.types.push_back(resultType); + }]>]; let extraClassDeclaration = [{ MemRefType getType() { return getResult().getType().cast(); } @@ -399,13 +385,13 @@ def HLO_DynamicMemRefCastOp: Op:$result); - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, MemRefType resultType, " # - "Value operand, ValueRange sizes, ValueRange strides", [{ - result.addOperands(operand); - result.addOperands(sizes); - result.addOperands(strides); - result.types.push_back(resultType); + let builders = [ + OpBuilder<"MemRefType resultType, Value operand, ValueRange sizes, " + "ValueRange strides", [{ + $_state.addOperands(operand); + $_state.addOperands(sizes); + $_state.addOperands(strides); + $_state.types.push_back(resultType); }]>]; let extraClassDeclaration = [{ @@ -476,7 +462,8 @@ def ReshapeMemRefCastOp: Op(); } + BaseMemRefType getType() { + return getResult().getType().cast(); } }]; let verifier = [{ return Verify(*this); }]; @@ -580,53 +567,32 @@ def LHLO_ConcatenateOp : LHLO_Op<"concatenate", []>, BASE_HLO_ConcatenateOp { ); } -// TODO(bondhugula): Make this struct dialect independent so that it can be -// shared between the HLO and LHLO dialects. -def ConvDimensionNumbers : StructAttr<"ConvDimensionNumbers", LHLO_Dialect, [ - StructFieldAttr<"input_batch_dimension",I64Attr>, - StructFieldAttr<"input_feature_dimension", I64Attr>, - StructFieldAttr<"input_spatial_dimensions", I64ElementsAttr>, - StructFieldAttr<"kernel_input_feature_dimension", I64Attr>, - StructFieldAttr<"kernel_output_feature_dimension", I64Attr>, - StructFieldAttr<"kernel_spatial_dimensions", I64ElementsAttr>, - StructFieldAttr<"output_batch_dimension", I64Attr>, - StructFieldAttr<"output_feature_dimension", I64Attr>, - StructFieldAttr<"output_spatial_dimensions", I64ElementsAttr>] > { - - let description = "Structure of dimension information for conv op"; -} - def LHLO_ConvOp : LHLO_Op<"convolution", []>, BASE_HLO_ConvOp { - let arguments = (ins - Arg:$lhs, - Arg:$rhs, - Arg:$output, - // Default value: one for each of the spatial dimension. - OptionalAttr:$window_strides, - // Default value: zero for each of the spatial dimension. - OptionalAttr:$padding, - // Default value: one for each of the spatial dimension. - OptionalAttr:$lhs_dilation, - // Default value: one for each of the spatial dimension. 
- OptionalAttr:$rhs_dilation, - ConvDimensionNumbers:$dimension_numbers, - I64Attr:$feature_group_count, - I64Attr:$batch_group_count, - HLO_PrecisionConfigAttr:$precision_config - ); + let arguments = !con( + (ins + Arg:$lhs, + Arg:$rhs, + Arg:$output), + ConvolutionAttributes.attributes); } -def LHLO_CopyOp: LHLO_Op<"copy", []>, BASE_HLO_CopyOp { +def LHLO_CopyOp: LHLO_Op<"copy", [CopyOpInterface]>, BASE_HLO_CopyOp { let arguments = (ins Arg:$operand, Arg:$output ); + + let extraClassDeclaration = [{ + Value getSource() { return operand();} + Value getTarget() { return output(); } + }]; } def LHLO_DotOp: LHLO_Op<"dot", []>, BASE_HLO_DotOp { let arguments = (ins Arg:$lhs, Arg:$rhs, + DotDimensionNumbers:$dot_dimension_numbers, HLO_PrecisionConfigAttr:$precision_config, Arg:$output ); @@ -658,7 +624,7 @@ def LHLO_ScatterOp: LHLO_Op<"scatter", []>, BASE_HLO_ScatterOp { Arg:$scatter_indices, Arg:$updates, Arg:$output, - ScatterDimensionNumbers:$scatter_dimension_numbers, + ScatterDimensionNumbers:$scatter_dimension_numbers, DefaultValuedAttr:$indices_are_sorted, DefaultValuedAttr:$unique_indices ); @@ -734,7 +700,7 @@ def LHLO_AllReduceOp : LHLO_Op<"all_reduce", [SameTypeOperands]>, Arg:$output, I64ElementsAttr:$replica_groups, DefaultValuedAttr:$constrain_layout, - OptionalAttr>:$channel_id, + OptionalAttr:$channel_id, DefaultValuedAttr:$use_global_device_ids ); let regions = (region SizedRegion<1>:$computation); @@ -747,7 +713,7 @@ def LHLO_CollectivePermuteOp: LHLO_Op<"collective_permute", [SameTypeOperands]>, Arg:$operand, Arg:$output, I64ElementsAttr:$source_target_pairs, - OptionalAttr>:$channel_id + OptionalAttr:$channel_id ); } @@ -849,9 +815,8 @@ def FusionOp : LHLO_Op<"fusion", [SingleBlockImplicitTerminator<"TerminatorOp">] let skipDefaultBuilders = 1; let builders = [ - OpBuilder<"OpBuilder &builder, OperationState &result, " - "ArrayRef attributes"> - ]; + OpBuilder<"ArrayRef attributes"> + ]; } def TerminatorOp : @@ -860,9 +825,8 @@ def TerminatorOp : let description = [{ Terminator operation for the LHLO dialect. }]; - let builders = [OpBuilder< - "OpBuilder &b, OperationState &result, ValueRange operands", - [{ build(b, result, llvm::None, operands, llvm::None); }] + let builders = [OpBuilder<"ValueRange operands", + [{ build($_builder, $_state, llvm::None, operands, llvm::None); }] >]; } diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td new file mode 100644 index 00000000000..9cd77417ffd --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops_base.td @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef LHLO_OPS_BASE +#define LHLO_OPS_BASE + +include "mlir/IR/OpBase.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td" + +//===----------------------------------------------------------------------===// +// LMHLO type definitions. +//===----------------------------------------------------------------------===// + +// Any integer tensor types +def LHLO_IntBuffer : MemRefOf<[HLO_Int]>; + +// Any floating-point tensor types +def LHLO_FpBuffer : MemRefOf<[AnyFloat]>; + +def LHLO_ComplexBuffer : MemRefOf<[AnyComplex]>; + +def LHLO_FpOrComplexBuffer : MemRefOf<[AnyFloat, AnyComplex]>; + +def LHLO_PredBuffer : MemRefOf<[HLO_Pred]>; + +// Any integer or floating-point tensor types +def LHLO_IntOrFpBuffer : MemRefOf<[HLO_Int, AnyFloat]>; + +def LHLO_PredOrIntBuffer : MemRefOf<[HLO_Int, HLO_Pred]>; + +def LHLO_Buffer : MemRefOf<[AnyFloat, AnySignlessInteger, AnyComplex]>; + +def LHLO_ExtentBuffer : MemRefRankOf<[AnySignlessInteger, Index], [1]>; + +#endif // LHLO_OPS_BASE diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h index 90ff6c99751..cb0af3a159d 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h @@ -20,8 +20,6 @@ namespace mlir { class DialectRegistry; namespace mhlo { -void registerAllDialects(); - // Add chlo, mhlo, lmhlo dialects to the provided registry. void registerAllMhloDialects(DialectRegistry ®istry); } diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/PassDetail.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/PassDetail.h new file mode 100644 index 00000000000..5f18eeb6ecc --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/PassDetail.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSDETAIL_H_ +#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSDETAIL_H_ + +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace mhlo { + +#define GEN_PASS_CLASSES +#include "mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.h.inc" + +} // end namespace mhlo +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_PASSDETAIL_H_ diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td index 963ff5dbacf..39b4ca65043 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td @@ -15,12 +15,6 @@ limitations under the License. include "mlir/Pass/PassBase.td" -def LhloCopyRemovalPass : Pass<"lhlo-copy-removal", "FuncOp"> { - let summary = "Removes redundant LHLO copy operations."; - let constructor = "createLhloCopyRemovalPass()"; -} - - def LhloLegalizeToLinalgPass : Pass<"lhlo-legalize-to-linalg", "FuncOp"> { let summary = "Legalize from LHLO dialect to Linalg dialect."; let constructor = "createLegalizeLhloToLinalgPass()"; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h index c51bcfcfe89..ac67619e6e3 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h @@ -40,6 +40,7 @@ using HloToLhloOp = typename HloToLhloOpImpl::Type; MAP_HLO_TO_LHLO(AbsOp); MAP_HLO_TO_LHLO(AddOp); MAP_HLO_TO_LHLO(AndOp); +MAP_HLO_TO_LHLO(Atan2Op); MAP_HLO_TO_LHLO(BroadcastInDimOp); MAP_HLO_TO_LHLO(CeilOp); MAP_HLO_TO_LHLO(ConstOp); @@ -49,17 +50,21 @@ MAP_HLO_TO_LHLO(ConvOp); MAP_HLO_TO_LHLO(ConvertOp); MAP_HLO_TO_LHLO(CopyOp); MAP_HLO_TO_LHLO(CosOp); +MAP_HLO_TO_LHLO(CustomCallOp); MAP_HLO_TO_LHLO(DivOp); MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); +MAP_HLO_TO_LHLO(FloorOp); MAP_HLO_TO_LHLO(GatherOp); MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); +MAP_HLO_TO_LHLO(IsFiniteOp); MAP_HLO_TO_LHLO(LogOp); MAP_HLO_TO_LHLO(MaxOp); MAP_HLO_TO_LHLO(MinOp); MAP_HLO_TO_LHLO(MulOp); MAP_HLO_TO_LHLO(NegOp); +MAP_HLO_TO_LHLO(NotOp); MAP_HLO_TO_LHLO(RealOp); MAP_HLO_TO_LHLO(ReduceOp); MAP_HLO_TO_LHLO(ReshapeOp); @@ -68,9 +73,11 @@ MAP_HLO_TO_LHLO(RsqrtOp); MAP_HLO_TO_LHLO(SelectOp); MAP_HLO_TO_LHLO(SignOp); MAP_HLO_TO_LHLO(SinOp); +MAP_HLO_TO_LHLO(SliceOp); MAP_HLO_TO_LHLO(SqrtOp); MAP_HLO_TO_LHLO(SubOp); MAP_HLO_TO_LHLO(TanhOp); +MAP_HLO_TO_LHLO(TransposeOp); #undef MAP_HLO_TO_LHLO diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h index 2bb5ab2888d..d59dfd43d1b 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/TypeUtilities.h" namespace mlir { namespace lmhlo { @@ -96,7 +97,7 @@ template struct MapLhloOpToStdScalarOpImpl { Value operator()(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - Type element_type = args.front().getType(); + Type element_type = getElementTypeOrSelf(args.front().getType()); if (element_type.isa()) { return b->template create(loc, result_types, args, mlir::None); @@ -120,7 +121,7 @@ inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - Type element_type = args.front().getType(); + Type element_type = getElementTypeOrSelf(args.front().getType()); if (element_type.isa()) { return MapLhloOpToStdScalarOpImpl{}( loc, result_types, args, b); @@ -130,8 +131,11 @@ inline Value MapLhloOpToStdScalarOp(Location loc, Value lhs = args[0]; auto integer_type = element_type.dyn_cast(); - auto zero_intval = + Value zero_intval = b->create<::mlir::ConstantIntOp>(loc, 0, integer_type.getWidth()); + if (VectorType vec_type = args.front().getType().dyn_cast()) { + zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval); + } auto lhs_gt_zero = b->create>(loc, CmpIPredicate::sge, lhs, zero_intval); auto neg_val = b->create>(loc, zero_intval, lhs); @@ -149,6 +153,15 @@ inline Value MapLhloOpToStdScalarOp(Location loc, loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp(Location loc, + ArrayRef result_types, + ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + template inline Optional getCmpPredicate(StringRef comparison_direction) { return llvm::None; @@ -187,7 +200,7 @@ inline Value MapCompareOpToStdScalarOp(Location loc, ArrayRef args, OpBuilder* b) { const auto& lhs = args[0]; const auto& rhs = args[1]; - Type element_type = lhs.getType(); + Type element_type = getElementTypeOrSelf(lhs.getType()); if (element_type.isSignlessInteger()) { Optional predicate = getCmpPredicate(comparison_direction); @@ -259,8 +272,8 @@ template <> inline Value MapLhloOpToStdScalarOp( Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - Type sourceType = args.front().getType(); - Type targetType = result_types.front(); + Type sourceType = getElementTypeOrSelf(args.front().getType()); + Type targetType = getElementTypeOrSelf(result_types.front()); if (mlir::SIToFPOp::areCastCompatible(sourceType, targetType)) { return b->create(loc, result_types, args, mlir::None); @@ -336,6 +349,31 @@ inline Value MapLhloOpToStdScalarOp(Location loc, loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp(Location loc, + ArrayRef result_types, + ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}( + loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + if (args[0].getType().isa()) { + auto pos_inf = APFloat::getInf( + args[0].getType().cast().getFloatSemantics()); + auto const_pos_inf = + b->create(loc, b->getFloatAttr(args[0].getType(), pos_inf)); + Value abs_x = b->create<::mlir::AbsFOp>(loc, args[0]); + return b->create<::mlir::CmpFOp>(loc, CmpFPredicate::ONE, abs_x, + const_pos_inf); + } + return nullptr; +} + /// Implements the conversion of HLO op to scalar op (to use within region of a /// linalg.generic op) for 
compare-select style operations like min/max. template @@ -356,7 +394,7 @@ struct CompareSelectOpToStdScalarOp result_types, ArrayRef args, OpBuilder* b) { - Type element_type = args.front().getType(); + Type element_type = getElementTypeOrSelf(args.front().getType()); if (element_type.isa()) { auto predicate = getCmpPredicate(comparison_direction); assert(predicate.hasValue() && "expected valid comparison direction"); @@ -405,7 +443,7 @@ inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - Type element_type = args.front().getType(); + Type element_type = getElementTypeOrSelf(args.front().getType()); if (element_type.isa()) { return MapLhloOpToStdScalarOpImpl{}( loc, result_types, args, b); @@ -415,13 +453,34 @@ inline Value MapLhloOpToStdScalarOp(Location loc, Value lhs = args[0]; auto integer_type = element_type.dyn_cast(); - auto zero_intval = + Value zero_intval = b->create<::mlir::ConstantIntOp>(loc, 0, integer_type.getWidth()); + if (VectorType vec_type = args.front().getType().dyn_cast()) { + zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval); + } return b->create>(loc, zero_intval, lhs); } return nullptr; } +template <> +inline Value MapLhloOpToStdScalarOp(Location loc, + ArrayRef result_types, + ArrayRef args, + OpBuilder* b) { + Type element_type = getElementTypeOrSelf(args.front().getType()); + if (auto integer_type = element_type.dyn_cast()) { + // lmhlo.not(x) -> x ^ -1 + Value all_ones = + b->create<::mlir::ConstantIntOp>(loc, -1, integer_type.getWidth()); + if (VectorType vec_type = args.front().getType().dyn_cast()) { + all_ones = b->create<::mlir::SplatOp>(loc, vec_type, all_ones); + } + return b->create<::mlir::XOrOp>(loc, all_ones, args[0]); + } + return nullptr; +} + template <> inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef result_types, @@ -444,12 +503,37 @@ inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef result_types, ArrayRef args, OpBuilder* b) { - Type element_type = args.front().getType(); - if (element_type.isa()) { - FloatType float_type = element_type.cast(); - APFloat const_value = float_type.isF32() ? APFloat(1.0f) : APFloat(1.0); - Value one = b->create(loc, const_value, float_type); + Type element_type = getElementTypeOrSelf(args.front().getType()); + if (auto float_type = element_type.dyn_cast()) { + bool ignored; + APFloat one_apfloat(1.0f); + one_apfloat.convert(float_type.getFloatSemantics(), + APFloat::rmNearestTiesToEven, &ignored); + Value one = b->create(loc, one_apfloat, float_type); + if (VectorType vec_type = args.front().getType().dyn_cast()) { + one = b->create<::mlir::SplatOp>(loc, vec_type, one); + } return b->create<::mlir::CopySignOp>(loc, result_types, one, args[0]); + } else if (auto integer_type = element_type.dyn_cast()) { + // sign(x) = x == 0 ? 
0 : ((x s>> 31) | 1) + Value zero = + b->create<::mlir::ConstantIntOp>(loc, 0, integer_type.getWidth()); + Value bitwidth_minus_one = b->create<::mlir::ConstantIntOp>( + loc, integer_type.getWidth() - 1, integer_type.getWidth()); + Value one = + b->create<::mlir::ConstantIntOp>(loc, 1, integer_type.getWidth()); + if (VectorType vec_type = args.front().getType().dyn_cast()) { + zero = b->create<::mlir::SplatOp>(loc, vec_type, zero); + bitwidth_minus_one = + b->create<::mlir::SplatOp>(loc, vec_type, bitwidth_minus_one); + one = b->create<::mlir::SplatOp>(loc, vec_type, one); + } + Value cmp = + b->create<::mlir::CmpIOp>(loc, CmpIPredicate::eq, args[0], zero); + Value ashr = + b->create<::mlir::SignedShiftRightOp>(loc, args[0], bitwidth_minus_one); + Value or_op = b->create<::mlir::OrOp>(loc, ashr, one); + return b->create<::mlir::SelectOp>(loc, cmp, zero, or_op); } return nullptr; } @@ -518,6 +602,27 @@ struct HloOpToStdScalarOp { return impl::MapCompareOpToStdScalarOp( op.getLoc(), comparison_direction, result_types, args, b); } + + // Implementation for LHLO ops except lmhlo::CompareOp. + template ::value && + std::is_same, + std::false_type>::value>> + static Value map(Location loc, ArrayRef result_types, + ArrayRef args, OpBuilder* b, unsigned i = 0) { + return impl::MapLhloOpToStdScalarOp(loc, result_types, args, b); + } + + // Implementation for lmhlo::CompareOp. + template ::value>> + static Value map(Location loc, StringRef comparison_direction, + ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return impl::MapCompareOpToStdScalarOp( + loc, comparison_direction, result_types, args, b); + } }; } // namespace lmhlo diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td index fa3bde24df1..4348464fa74 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/mhlo_passes.td @@ -15,9 +15,9 @@ limitations under the License. 
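Note on the integer sign lowering above: the identity `sign(x) = x == 0 ? 0 : ((x s>> 31) | 1)` can be sanity-checked with a small standalone C++ sketch (illustrative only, not part of the patch; assumes 32-bit two's complement and an arithmetic right shift for signed values, which mainstream compilers provide and C++20 guarantees):

#include <cassert>
#include <cstdint>

// Scalar model of the lowering: the arithmetic shift yields -1 for negative
// inputs and 0 otherwise, so OR-ing with 1 produces -1 / +1, and the compare
// handles the zero case.
int32_t SignOfInt32(int32_t x) {
  return x == 0 ? 0 : ((x >> 31) | 1);
}

int main() {
  assert(SignOfInt32(0) == 0);
  assert(SignOfInt32(42) == 1);
  assert(SignOfInt32(-7) == -1);
  return 0;
}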
include "mlir/Pass/PassBase.td" -def TestChloLegalizeToHloPass : Pass<"mhlo-test-chlo-legalize-to-hlo", "FuncOp"> { - let summary = "Test pass for applying chlo -> hlo legalization patterns."; - let constructor = "createTestChloLegalizeToHloPass()"; +def ChloLegalizeToHloPass : Pass<"chlo-legalize-to-hlo", "FuncOp"> { + let summary = "Legalize CHLO to HLO."; + let constructor = "createChloLegalizeToHloPass()"; } def HloLegalizeToLhloPass : Pass<"hlo-legalize-to-lhlo", "ModuleOp"> { @@ -30,15 +30,20 @@ def LegalizeControlFlowPass : Pass<"mhlo-legalize-control-flow", "FuncOp"> { let constructor = "createLegalizeControlFlowPass()"; } +def LegalizeControlFlowToScfPass : Pass<"mhlo-control-flow-to-scf", "FuncOp"> { + let summary = "Legalize from MHLO control flow to SCF control flow."; + let constructor = "createControlFlowToScfPass()"; +} + def LegalizeGatherToTorchIndexSelectPass : Pass<"mhlo-legalize-gather-to-torch-index-select", "FuncOp"> { let summary = "Legalizes gathers to a torch index select."; let constructor = "createLegalizeGatherToTorchIndexSelectPass()"; } -def LegalizeTanhToApproximationPass : Pass<"mhlo-legalize-tanh-to-approximation", "FuncOp"> { - let summary = "Legalize tanh from standard dialect to an approximation."; - let constructor = "createLegalizeTanhToApproximationPass()"; +def LegalizeTanhToApproximationPass : Pass<"mhlo-legalize-trigonometric-to-approximation", "FuncOp"> { + let summary = "Legalize trigonometric operations from standard dialect to an approximation."; + let constructor = "createLegalizeTrigonometricToApproximationPass()"; } @@ -83,7 +88,7 @@ def OptimizeMhloPass : Pass<"mhlo-test-optimize", "FuncOp"> { } -def SinkConstantsToControlFlowPass : Pass<"mhlo-sink-constants-to-control-flow", "FuncOp"> { +def SinkConstantsToControlFlowPass : FunctionPass<"mhlo-sink-constants-to-control-flow"> { let summary = "Sink constants implicitly captured in control flow regions. This " "is necessary to export to XLA."; let constructor = "createSinkConstantsToControlFlowPass()"; diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h index efa116f3f0d..b1933f6686b 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h @@ -30,14 +30,23 @@ template class OperationPass; class Pass; +// Transforms unranked HLO operations to ranked ones where possible. +std::unique_ptr createTransformUnrankedHloPass(); + namespace mhlo { /// Lowers HLO control flow ops to the Standard dialect. std::unique_ptr> createLegalizeControlFlowPass(); +/// Lowers MHLO control flow ops to the SCF dialect. +std::unique_ptr> createControlFlowToScfPass(); + /// Lowers from HLO dialect to Standard dialect. std::unique_ptr> createLegalizeToStdPass(); +/// Lowers from the CHLO dialect to the HLO dialect. +std::unique_ptr createChloLegalizeToHloPass(); + /// Lowers from HLO dialect to LHLO dialect allocating/deallocating temporary /// buffers if necessary. If `results_escape_functions` is set to true, /// allocated buffers for function results will be returned and escape the @@ -49,9 +58,6 @@ std::unique_ptr> createLegalizeToLhloPass( // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); -// Transforms unranked HLO operations to ranked ones where possible. 
-std::unique_ptr> createTransformUnrankedHloPass(); - // Sinks constants implicitly captured in control flow regions. This is // necessary to export to XLA. std::unique_ptr> createSinkConstantsToControlFlowPass(); @@ -59,8 +65,10 @@ std::unique_ptr> createSinkConstantsToControlFlowPass(); // fuse mhlo ops to kLoop/kInput fusion patterns std::unique_ptr> createMhloFusionPass(); -/// Lowers the standard TanhOp to an approximation that does not use intrinsics. -std::unique_ptr> createLegalizeTanhToApproximationPass(); +/// Lowers trigonometric operations from the standard dialect to approximations +/// that do not use intrinsics. +std::unique_ptr> +createLegalizeTrigonometricToApproximationPass(); std::unique_ptr createOptimizeMhloPass(); std::unique_ptr createLowerComplexPass(); @@ -92,12 +100,6 @@ std::unique_ptr createLegalizeToGpuPass(); std::unique_ptr createLhloFuseLinalgPass( bool use_parallel_loops = false, llvm::ArrayRef tile_sizes = {}); -// Removes unnecessary LHLO copies which copy from the allocated buffers to the -// block arguments. The block arguments are used instead of all uses of these -// buffers. The buffers are freed. This pass only works in regions that contain -// a single block. -std::unique_ptr createLhloCopyRemovalPass(); - // Lowers from LHLO dialect to parallel loops. std::unique_ptr> createLegalizeLhloToParallelLoopsPass(); diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h index 8f70f64359b..e9418f0e7a0 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h @@ -22,7 +22,6 @@ limitations under the License. namespace mlir { namespace mhlo { -std::unique_ptr createTestChloLegalizeToHloPass(); std::unique_ptr createTestInferShapedTypeMethodsPass(); std::unique_ptr createTestMaterializeBroadcastsPass(); std::unique_ptr createTestUnfuseBatchNormPass(); diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h index 725155e9403..b6706187d50 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/Bufferize.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { @@ -27,6 +28,12 @@ class LLVMTypeConverter; class LowerToLLVMOptions; class OwningRewritePatternList; class BufferAssignmentPlacer; + +// Populates a collection of rewrite patterns to realize element-wise operations +// on ranked tensors where possible. +void PopulateTransformUnrankedHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + namespace mhlo { // Collection of rewrite patterns for lowering a general dot product. @@ -49,9 +56,10 @@ void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns, MLIRContext *ctx); // Collection of rewrite patterns for lowering of HLO to LHLO dialect. 
-void populateHLOToLHLOConversionPattern( - MLIRContext *context, BufferAssignmentPlacer *bufferAssignment, - TypeConverter *converter, OwningRewritePatternList *patterns); +void populateHLOToLHLOConversionPattern(MLIRContext *context, + BufferizeTypeConverter *converter, + OwningRewritePatternList *patterns); + // Collection of rewrite patterns for lowering of HLO to Linalg dialect. void populateHLOToLinalgConversionPattern(MLIRContext *context, OwningRewritePatternList *patterns); @@ -80,10 +88,10 @@ void PopulateTransformUnrankedHloPatterns(MLIRContext *context, void PopulateUnfuseBatchNormPatterns(MLIRContext *context, OwningRewritePatternList *patterns); -// Populates a pattern that translates the standard TanhOp to an approximation -// that does not use intrinsics. -void PopulateTanhToApproximationPatterns(MLIRContext *context, - OwningRewritePatternList *patterns); +// Populates patterns that translate the trigonometric operations from the +// standard dialect to approximations that do not use intrinsics. +void PopulateTrigonometricToApproximationPatterns( + MLIRContext *context, OwningRewritePatternList *patterns); } // namespace mhlo diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt index d7bb5057b00..7c0c11b1edd 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/CMakeLists.txt @@ -43,6 +43,7 @@ add_mlir_library(MhloInferFusibilityOpInterface add_mlir_dialect_library(MhloDialect hlo_ops.cc + hlo_ops_base_structs.cc DEPENDS MLIRhlo_opsIncGen @@ -66,6 +67,15 @@ add_mlir_dialect_library(LmhloDialect ) target_link_libraries(LmhloDialect PUBLIC MLIRIR) +add_mlir_dialect_library(LmhloGPUDialect + lhlo_gpu_ops.cc + lhlo_gpu_ops_structs.cc + + DEPENDS + MLIRlhlo_gpu_opsIncGen +) +target_link_libraries(LmhloGPUDialect PUBLIC MLIRIR) + add_mlir_dialect_library(MhloRegisterDialects init.cc @@ -73,10 +83,12 @@ DEPENDS MLIRchlo_opsIncGen MLIRhlo_opsIncGen MLIRlhlo_opsIncGen + MLIRlhlo_gpu_opsIncGen ) target_link_libraries(MhloRegisterDialects PUBLIC ChloDialect MhloDialect LmhloDialect + LmhloGPUDialect ) diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc index b5eacd686bd..99b22a75a14 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/chlo_ops.cc @@ -303,9 +303,15 @@ void ConstantLikeOp::getCanonicalizationPatterns( results.insert(context); } +} // namespace chlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.cc.inc" +namespace mlir { +namespace chlo { + //===----------------------------------------------------------------------===// // chlo Dialect Constructor //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc index f5deb94e3a4..241b5938017 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -60,7 +61,9 @@ limitations under the License. namespace mlir { #include "hlo_patterns.cc.inc" -#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_structs.cc.inc" +} // namespace mlir + +namespace mlir { namespace mhlo { Operation* MhloDialect::materializeConstant(OpBuilder& builder, Attribute value, @@ -165,6 +168,94 @@ static LogicalResult Verify(DotGeneralOp op) { return success(); } +//===----------------------------------------------------------------------===// +// GatherOp +//===----------------------------------------------------------------------===// + +// Converts gather ops to slice ops in case we have a single set of constant +// indices. +struct GatherSlice : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GatherOp gather, + PatternRewriter& rewriter) const override { + DenseIntElementsAttr index; + if (!matchPattern(gather.start_indices(), m_Constant(&index))) + return failure(); + + const auto& dnums = gather.dimension_numbers(); + if (dnums.index_vector_dim().getInt() != 0 || index.getType().getRank() > 1) + return failure(); + + // TODO(tberghammer): Remove when the verifier catches this case what is + // invalid if all previous condition holds. + if (index.getNumElements() != dnums.start_index_map().getNumElements()) + return failure(); + + auto slice_end = + llvm::to_vector<8>(gather.slice_sizes().getValues()); + llvm::SmallVector slice_start(slice_end.size(), 0); + for (auto it : llvm::zip(dnums.start_index_map().getIntValues(), + index.getIntValues())) { + int64_t map_index = std::get<0>(it).getSExtValue(); + int64_t offset = std::get<1>(it).getSExtValue(); + slice_start[map_index] += offset; + slice_end[map_index] += offset; + } + + llvm::SmallVector slice_stride(slice_end.size(), 1); + llvm::SmallVector slice_shape(slice_end.size()); + for (int64_t i = 0; i < slice_end.size(); ++i) { + slice_shape[i] = slice_end[i] - slice_start[i]; + } + Type element_type = gather.getType().cast().getElementType(); + auto slice_type = RankedTensorType::get(slice_shape, element_type); + Value result = rewriter.create( + gather.getLoc(), slice_type, gather.getOperand(0), + GetI64ElementsAttr(slice_start, &rewriter), + GetI64ElementsAttr(slice_end, &rewriter), + GetI64ElementsAttr(slice_stride, &rewriter)); + + if (dnums.collapsed_slice_dims().getNumElements() > 0) { + auto collapsed_slice_dims = llvm::to_vector<8>(llvm::map_range( + dnums.collapsed_slice_dims().getIntValues(), + [](const llvm::APInt& i) { return i.getSExtValue(); })); + llvm::SmallVector reshape_shape; + for (int64_t i = 0; i < slice_shape.size(); ++i) { + if (llvm::count(collapsed_slice_dims, i) == 0) { + reshape_shape.push_back(slice_shape[i]); + } + } + auto reshape_type = RankedTensorType::get(reshape_shape, element_type); + result = + rewriter.create(gather.getLoc(), reshape_type, result); + } + + result.setType(gather.getType()); + rewriter.replaceOp(gather, result); + return success(); + } +}; + +void GatherOp::getCanonicalizationPatterns(OwningRewritePatternList& results, + MLIRContext* context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// GetDimensionSizeOp +//===----------------------------------------------------------------------===// + +/// Fold get_dimension_size 
when the said shape dimension is a constant. +OpFoldResult GetDimensionSizeOp::fold(ArrayRef attrs) { + RankedTensorType type = operand().getType().cast(); + int32_t dim = dimension(); + if (type.isDynamic(dim)) return {}; + // The result type is always is a 0-d i32 tensor. + return DenseIntElementsAttr::get( + getResult().getType().cast(), type.getDimSize(dim)); +} + //===----------------------------------------------------------------------===// // IotaOp //===----------------------------------------------------------------------===// @@ -176,7 +267,7 @@ static LogicalResult Verify(IotaOp op) { if (shape.getRank() == 0) return op.emitOpError() << "does not support scalars."; - auto iota_dimension = op.iota_dimension().getSExtValue(); + auto iota_dimension = op.iota_dimension(); if (iota_dimension >= shape.getRank() || iota_dimension < 0) return op.emitOpError() << "iota dimension cannot go beyond the output " "rank or be negative."; @@ -198,8 +289,7 @@ struct IotaBroadcast : public OpRewritePattern { auto iota_dimension = iota.iota_dimension(); auto iota_type = RankedTensorType::get( - {result_ty.getDimSize(iota_dimension.getLimitedValue())}, - result_ty.getElementType()); + {result_ty.getDimSize(iota_dimension)}, result_ty.getElementType()); auto new_iota = rewriter.create(iota.getLoc(), iota_type, rewriter.getI64IntegerAttr(0)); @@ -219,7 +309,7 @@ void IotaOp::getCanonicalizationPatterns(OwningRewritePatternList& results, } OpFoldResult IotaOp::fold(ArrayRef operands) { - auto dimension = iota_dimension().getLimitedValue(); + auto dimension = iota_dimension(); auto result_ty = getResult().getType().cast(); if (result_ty.hasRank() && result_ty.getDimSize(dimension) == 1) { Builder builder(getContext()); @@ -263,7 +353,7 @@ struct DynamicIotaBroadcast : public OpRewritePattern { } auto iota_dimension = iota.iota_dimension(); - auto iota_dimension_int = iota_dimension.getLimitedValue(); + auto iota_dimension_int = iota_dimension; auto converted_shape = rewriter.create( iota.getLoc(), @@ -462,7 +552,7 @@ static LogicalResult Verify(DequantizeOp op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(GetTupleElementOp op) { - auto indexVal = op.index().getZExtValue(); + auto indexVal = op.index(); auto operandType = op.getOperand().getType().cast(); if (indexVal >= operandType.size()) { return op.emitOpError( @@ -481,7 +571,7 @@ static LogicalResult Verify(GetTupleElementOp op) { OpFoldResult GetTupleElementOp::fold(ArrayRef operands) { if (auto tupleOp = dyn_cast_or_null(getOperand().getDefiningOp())) { - return tupleOp.getOperand(index().getLimitedValue()); + return tupleOp.getOperand(index()); } return {}; @@ -551,8 +641,8 @@ static LogicalResult Verify(AllToAllOp op) { // count. 
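For the GatherSlice canonicalization above, a toy standalone sketch of how the slice bounds are derived from a constant index vector: the slice_sizes window is shifted by the start indices along the dimensions named in start_index_map. All shapes and dimension numbers below are hypothetical, chosen only for illustration.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> slice_sizes = {1, 4};      // gather.slice_sizes()
  std::vector<int64_t> start_index_map = {0, 1};  // dnums.start_index_map()
  std::vector<int64_t> index = {2, 3};            // constant start_indices

  // Mirror of the pattern's bound computation: start at zero, end at the
  // window size, then shift both by the constant start index.
  std::vector<int64_t> slice_end = slice_sizes;
  std::vector<int64_t> slice_start(slice_end.size(), 0);
  for (size_t i = 0; i < start_index_map.size(); ++i) {
    slice_start[start_index_map[i]] += index[i];
    slice_end[start_index_map[i]] += index[i];
  }

  assert(slice_start == (std::vector<int64_t>{2, 3}));
  assert(slice_end == (std::vector<int64_t>{3, 7}));
  return 0;
}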
auto type = op.getOperand().getType().dyn_cast(); if (!type) return success(); - auto split_dim_size = type.getDimSize(op.split_dimension().getSExtValue()); - auto split_count = op.split_count().getSExtValue(); + auto split_dim_size = type.getDimSize(op.split_dimension()); + auto split_count = op.split_count(); if (split_dim_size % split_count != 0) { return op.emitError() << "split dimension has size " << split_dim_size << ", expected to be a multiple of split_count " @@ -821,9 +911,10 @@ static LogicalResult Verify(ClampOp op) { // ComplexOp //===----------------------------------------------------------------------===// -void ComplexOp::build(OpBuilder& builder, OperationState& state, Value lhs, - Value rhs) { - auto type = lhs.getType(); +LogicalResult ComplexOp::inferReturnTypes( + MLIRContext*, Optional, ValueRange operands, DictionaryAttr, + RegionRange, SmallVectorImpl& inferredReturnTypes) { + auto type = operands[0].getType(); auto element_ty = ComplexType::get(getElementTypeOrSelf(type)); Type result_ty; if (auto ranked_type = type.dyn_cast()) { @@ -833,8 +924,8 @@ void ComplexOp::build(OpBuilder& builder, OperationState& state, Value lhs, } else { result_ty = element_ty; } - - build(builder, state, result_ty, lhs, rhs); + inferredReturnTypes.push_back(result_ty); + return success(); } OpFoldResult ComplexOp::fold(ArrayRef operands) { @@ -864,8 +955,11 @@ Type CreateRealType(Type type) { } } // namespace -void ImagOp::build(OpBuilder& builder, OperationState& state, Value val) { - build(builder, state, CreateRealType(val.getType()), val); +LogicalResult ImagOp::inferReturnTypes( + MLIRContext*, Optional, ValueRange operands, DictionaryAttr, + RegionRange, SmallVectorImpl& inferredReturnTypes) { + inferredReturnTypes.push_back(CreateRealType(operands[0].getType())); + return success(); } OpFoldResult ImagOp::fold(ArrayRef operands) { @@ -877,8 +971,11 @@ OpFoldResult ImagOp::fold(ArrayRef operands) { return {}; } -void RealOp::build(OpBuilder& builder, OperationState& state, Value val) { - build(builder, state, CreateRealType(val.getType()), val); +LogicalResult RealOp::inferReturnTypes( + MLIRContext*, Optional, ValueRange operands, DictionaryAttr, + RegionRange, SmallVectorImpl& inferredReturnTypes) { + inferredReturnTypes.push_back(CreateRealType(operands[0].getType())); + return success(); } OpFoldResult RealOp::fold(ArrayRef operands) { @@ -900,7 +997,7 @@ class ConcatenateOperandRemoval : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(ConcatenateOp op, PatternRewriter& rewriter) const override { - auto axis = op.dimension().getLimitedValue(); + auto axis = op.dimension(); llvm::SmallVector new_operands; for (auto operand : op.getOperands()) { auto ty = operand.getType().cast(); @@ -941,13 +1038,41 @@ LogicalResult ConcatenateOp::inferReturnTypes( } } - // If an input is unranked the output shape is unranked. + // Find the first ranked input to determine the output rank. + for (auto type : operands.getTypes()) { + auto shaped_type = type.cast(); + if (shaped_type.hasRank()) { + first_type = shaped_type; + break; + } + } + + // If all inputs are unranked, the result must be unranked. 
if (!first_type.hasRank()) { inferredReturnTypes.push_back(UnrankedTensorType::get(out_element)); return success(); } + if (first_type.getRank() == 0) + return emitOptionalError(location, "rank-0 values cannot be concatenated"); + auto out_shape = llvm::to_vector<6>(first_type.getShape()); + + // Determine what the non-concatenate dimensions should be. + for (auto type : operands.getTypes()) { + auto shaped_ty = type.cast(); + if (!shaped_ty.hasRank()) { + continue; + } + + for (auto it : llvm::enumerate(shaped_ty.getShape())) { + // If a dimension is not dynamic, the output shape should match. + if (ShapedType::isDynamic(out_shape[it.index()])) { + out_shape[it.index()] = it.value(); + } + } + } + out_shape[dimension] = 0; for (auto operand : operands.getTypes()) { @@ -980,7 +1105,7 @@ void ConcatenateOp::getCanonicalizationPatterns( template static Attribute foldConcatenateHelper(ConcatenateOp* op, ArrayRef operands) { - auto axis = op->dimension().getLimitedValue(); + auto axis = op->dimension(); auto type = op->getType().cast(); SmallVector values; @@ -1028,7 +1153,7 @@ OpFoldResult ConcatenateOp::fold(ArrayRef operands) { ShapedType type = getResult().getType().cast(); if (!type.hasStaticShape()) return {}; - auto axis = dimension().getLimitedValue(); + auto axis = dimension(); if (auto attr = foldConcatenate(this, operands)) { return attr; } @@ -1203,6 +1328,131 @@ static LogicalResult Verify(InfeedOp op) { return success(); } +//===----------------------------------------------------------------------===// +// Logical Ops +//===----------------------------------------------------------------------===// + +OpFoldResult AndOp::fold(ArrayRef operands) { + if (lhs() == rhs()) return lhs(); + + auto rType = getType().cast(); + auto lhsVal = operands[0].dyn_cast_or_null(); + auto rhsVal = operands[1].dyn_cast_or_null(); + + if (lhsVal && lhsVal.isSplat()) { + if (lhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return rhs(); + } + + if (lhsVal.getSplatValue().cast().getValue().isNullValue()) { + return lhsVal; + } + } + + if (rhsVal && rhsVal.isSplat()) { + if (rhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return lhs(); + } + + if (rhsVal.getSplatValue().cast().getValue().isNullValue()) { + return rhsVal; + } + } + + if (!rhsVal || !lhsVal) return {}; + + llvm::SmallVector values; + values.reserve(rhsVal.getNumElements()); + for (auto it : llvm::zip(rhsVal.getIntValues(), lhsVal.getIntValues())) { + values.push_back(std::get<0>(it) & std::get<1>(it)); + } + + return DenseIntElementsAttr::get(rType, values); +} + +OpFoldResult OrOp::fold(ArrayRef operands) { + if (lhs() == rhs()) return lhs(); + + auto rType = getType().cast(); + auto lhsVal = operands[0].dyn_cast_or_null(); + auto rhsVal = operands[1].dyn_cast_or_null(); + + if (lhsVal && lhsVal.isSplat()) { + if (lhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return lhsVal; + } + + if (lhsVal.getSplatValue().cast().getValue().isNullValue()) { + return rhs(); + } + } + + if (rhsVal && rhsVal.isSplat()) { + if (rhsVal.getSplatValue() + .cast() + .getValue() + .isAllOnesValue()) { + return rhsVal; + } + + if (rhsVal.getSplatValue().cast().getValue().isNullValue()) { + return lhs(); + } + } + + if (!rhsVal || !lhsVal) return {}; + + llvm::SmallVector values; + values.reserve(rhsVal.getNumElements()); + for (auto it : llvm::zip(rhsVal.getIntValues(), lhsVal.getIntValues())) { + values.push_back(std::get<0>(it) | std::get<1>(it)); + } + + return 
DenseIntElementsAttr::get(rType, values); +} + +OpFoldResult XorOp::fold(ArrayRef operands) { + auto rType = getType().cast(); + if (lhs() == rhs()) { + Builder builder(getContext()); + return builder.getZeroAttr(rType); + } + + auto lhsVal = operands[0].dyn_cast_or_null(); + auto rhsVal = operands[1].dyn_cast_or_null(); + + if (lhsVal && lhsVal.isSplat()) { + if (lhsVal.getSplatValue().cast().getValue().isNullValue()) { + return rhs(); + } + } + + if (rhsVal && rhsVal.isSplat()) { + if (rhsVal.getSplatValue().cast().getValue().isNullValue()) { + return lhs(); + } + } + + if (!rhsVal || !lhsVal) return {}; + + llvm::SmallVector values; + values.reserve(rhsVal.getNumElements()); + for (auto it : llvm::zip(rhsVal.getIntValues(), lhsVal.getIntValues())) { + values.push_back(std::get<0>(it) ^ std::get<1>(it)); + } + + return DenseIntElementsAttr::get(rType, values); +} + //===----------------------------------------------------------------------===// // MapOp //===----------------------------------------------------------------------===// @@ -1396,6 +1646,29 @@ static LogicalResult Verify(SelectOp op) { return success(); } +OpFoldResult SelectOp::fold(ArrayRef operands) { + if (on_true() == on_false()) { + return on_true(); + } + + auto predicate = operands[0].dyn_cast_or_null(); + if (!predicate) { + return {}; + } + + auto predicateTy = predicate.getType().cast(); + if (!predicateTy.getElementType().isInteger(1)) { + return {}; + } + + if (predicate.isSplat()) { + return predicate.getSplatValue().getBoolValue() ? on_true() + : on_false(); + } + + return {}; +} + // Makes it such that a SelectOp that is a non-root operation in a DRR infers // the return type based on operand type. LogicalResult SelectOp::inferReturnTypes( @@ -1437,6 +1710,20 @@ LogicalResult SelectOp::inferReturnTypes( return success(); } +LogicalResult SelectOp::inferReturnTypeComponents( + mlir::MLIRContext*, llvm::Optional, mlir::ValueRange, + mlir::DictionaryAttr, mlir::RegionRange, + llvm::SmallVectorImpl&) { + // TODO(b/168772852) + return failure(); +} + +LogicalResult SelectOp::reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); +} + //===----------------------------------------------------------------------===// // PadOp //===----------------------------------------------------------------------===// @@ -1584,6 +1871,79 @@ static LogicalResult Verify(CaseOp op) { return success(); } +//===----------------------------------------------------------------------===// +// SqrtOp +//===----------------------------------------------------------------------===// + +OpFoldResult SqrtOp::fold(ArrayRef operands) { + auto val = operands[0].dyn_cast_or_null(); + if (!val) return {}; + + auto type = getElementTypeOrSelf(getType()); + if (!type.isF32() && !type.isF64()) return {}; + + auto shaped_type = getType().cast(); + if (!shaped_type.hasStaticShape()) return {}; + + int bit_width = type.getIntOrFloatBitWidth(); + llvm::SmallVector values; + values.reserve(val.getNumElements()); + for (auto it : val.getFloatValues()) { + double value = bit_width == 32 ? 
it.convertToFloat() : it.convertToDouble(); + if (value < 0) return {}; + value = std::sqrt(value); + if (bit_width == 32) + values.emplace_back(static_cast(value)); + else + values.emplace_back(value); + } + return DenseFPElementsAttr::get(shaped_type, values); +} + +//===----------------------------------------------------------------------===// +// UnaryOps +//===----------------------------------------------------------------------===// + +template +static Attribute UnaryFolder(Op* op, ArrayRef attrs) { + if (!attrs[0]) return {}; + + DenseElementsAttr val = attrs[0].dyn_cast(); + if (!val) return {}; + + ShapedType type = op->getType().template cast(); + if (!type.hasStaticShape()) { + return {}; + } + + Type etype = type.getElementType(); + + // Evaluate for integer values. + if (!etype.isa()) { + return {}; + } + + SmallVector values; + values.reserve(val.getNumElements()); + for (const auto v : val.getValues()) { + values.push_back(Convert()(v)); + } + + return DenseElementsAttr::get(type, values); +} + +#define UNARY_FOLDER(Op, Func) \ + OpFoldResult Op::fold(ArrayRef attrs) { \ + if (getElementTypeOrSelf(getType()).isa()) \ + return UnaryFolder>(this, attrs); \ + if (getElementTypeOrSelf(getType()).isa()) \ + return UnaryFolder>(this, attrs); \ + return {}; \ + } + +UNARY_FOLDER(NegOp, std::negate); + //===----------------------------------------------------------------------===// // BinaryOps //===----------------------------------------------------------------------===// @@ -1643,6 +2003,23 @@ struct divide { APInt operator()(const APInt& a, const APInt& b) const { return a.sdiv(b); } }; +template +struct remainder : std::modulus {}; + +template <> +struct remainder { + APInt operator()(const APInt& a, const APInt& b) const { return a.srem(b); } +}; + +template <> +struct remainder { + APFloat operator()(const APFloat& a, const APFloat& b) const { + APFloat result(a); + result.remainder(b); + return result; + } +}; + template struct max { T operator()(const T& a, const T& b) const { return std::max(a, b); } @@ -1684,6 +2061,7 @@ BINARY_FOLDER(AddOp, std::plus); BINARY_FOLDER(SubOp, std::minus); BINARY_FOLDER(MulOp, std::multiplies); BINARY_FOLDER(DivOp, divide); +BINARY_FOLDER(RemOp, remainder); BINARY_FOLDER(MaxOp, max); BINARY_FOLDER(MinOp, min); @@ -1758,11 +2136,11 @@ static Attribute FoldSlice(SliceOp* op, I values) { OpFoldResult SliceOp::fold(ArrayRef operands) { // Check if the SliceOp is a NoOp operation. 
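The And/Or/Xor folds above special-case splat all-ones and all-zeros operands; a minimal standalone C++ check of the underlying bitwise identities they rely on (illustrative only; the folders themselves operate on DenseElementsAttr splats):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t ones = 0xFFFFFFFFu;
  const uint32_t x = 0xDEADBEEFu;
  assert((x & ones) == x);     // and(x, all-ones) folds to x
  assert((x & 0u) == 0u);      // and(x, zeros)    folds to zeros
  assert((x | ones) == ones);  // or(x, all-ones)  folds to all-ones
  assert((x | 0u) == x);       // or(x, zeros)     folds to x
  assert((x ^ x) == 0u);       // xor(x, x)        folds to zeros
  assert((x ^ 0u) == x);       // xor(x, zeros)    folds to x
  return 0;
}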
- auto operand_shape = getOperand().getType().cast().getShape(); + auto operand_type = getOperand().getType().cast(); auto result_type = getResult().getType().cast(); - auto result_shape = result_type.getShape(); - if (result_type.hasStaticShape() && (operand_shape == result_shape)) { + if (operand_type.hasStaticShape() && result_type.hasStaticShape() && + (operand_type.getShape() == result_type.getShape())) { return getOperand(); } @@ -1808,7 +2186,7 @@ struct SimplifyConcatSlice : public OpRewritePattern { return failure(); } - auto dimension = concat.dimension().getSExtValue(); + auto dimension = concat.dimension(); auto start = slice.start_indices().getIntValues(); auto limit = slice.limit_indices().getIntValues(); @@ -1933,10 +2311,7 @@ void SortOp::build(OpBuilder& builder, OperationState& state, state.addAttribute("dimension", builder.getI64IntegerAttr(dimension)); state.addAttribute("is_stable", builder.getBoolAttr(dimension)); - SmallVector element_types; - element_types.reserve(operands.size()); - for (Value operand : operands) element_types.push_back(operand.getType()); - state.addTypes(builder.getTupleType(element_types)); + for (Value operand : operands) state.addTypes(operand.getType()); state.addRegion(); } @@ -1958,7 +2333,7 @@ static LogicalResult Verify(SortOp op) { return op.emitOpError("requires all inputs to have the same dimensions"); int64_t rank = input_shape.size(); - int64_t cmp_dim = op.dimension().getSExtValue(); + int64_t cmp_dim = op.dimension(); if (cmp_dim < -rank || cmp_dim >= rank) return op.emitOpError("dimension attribute value must be in range [-") << rank << ", " << rank << "), but found " << cmp_dim; @@ -2159,9 +2534,267 @@ void CompareOp::build(OpBuilder& builder, OperationState& result, Value lhs, build(builder, result, new_type, lhs, rhs, comparison_direction); } +LogicalResult CompareOp::inferReturnTypeComponents( + mlir::MLIRContext*, llvm::Optional, mlir::ValueRange, + mlir::DictionaryAttr, mlir::RegionRange, + llvm::SmallVectorImpl&) { + // TODO(b/168772852) + return failure(); +} + +LogicalResult CompareOp::reifyReturnTypeShapes( + OpBuilder& builder, SmallVectorImpl& reifiedReturnShapes) { + return deriveShapeFromFirstOperand(&builder, getOperation(), + &reifiedReturnShapes); +} + +template +struct less : std::less {}; + +template <> +struct less { + bool operator()(const APInt& a, const APInt& b) const { return a.slt(b); } +}; + +template +struct less_equal : std::less_equal {}; + +template <> +struct less_equal { + bool operator()(const APInt& a, const APInt& b) const { return a.sle(b); } +}; + +template +struct greater : std::greater {}; + +template <> +struct greater { + bool operator()(const APInt& a, const APInt& b) const { return a.sgt(b); } +}; + +template +struct greater_equal : std::greater_equal {}; + +template <> +struct greater_equal { + bool operator()(const APInt& a, const APInt& b) const { return a.sge(b); } +}; + +template +static Attribute CompareFolder(CompareOp op, ArrayRef attrs) { + if (!attrs[0] || !attrs[1]) return {}; + + DenseElementsAttr lhs = attrs[0].dyn_cast(); + DenseElementsAttr rhs = attrs[1].dyn_cast(); + if (!lhs || !rhs) return {}; + + ShapedType operand_type = + op.getOperand(0).getType().template cast(); + if (!operand_type.hasStaticShape()) { + return {}; + } + + if (!operand_type.getElementType().isa()) { + return {}; + } + + SmallVector values; + values.reserve(lhs.getNumElements()); + for (const auto zip : + llvm::zip(lhs.getValues(), rhs.getValues())) { + values.push_back(Convert()(std::get<0>(zip), 
std::get<1>(zip))); + } + + auto result_ty = op.getType().cast(); + return DenseElementsAttr::get(result_ty, values); +} + +OpFoldResult CompareOp::fold(ArrayRef operands) { + auto result_ty = getType().cast(); + if (!result_ty.hasStaticShape()) return {}; + + auto direction = comparison_direction(); + if (lhs() == rhs()) { + if (direction == "LE" || direction == "EQ" || direction == "GE") { + return DenseIntElementsAttr::get(result_ty, {true}); + } + + return DenseIntElementsAttr::get(result_ty, {false}); + } + + if (!operands[0] || !operands[1]) { + return {}; + } + +#define COMPARE_FOLDER(Op, comparison, Func) \ + if (direction == comparison) { \ + if (auto folded = CompareFolder>( \ + *this, operands)) \ + return folded; \ + if (auto folded = CompareFolder>( \ + *this, operands)) \ + return folded; \ + } + + COMPARE_FOLDER(CompareOp, "EQ", std::equal_to); + COMPARE_FOLDER(CompareOp, "NE", std::not_equal_to); + COMPARE_FOLDER(CompareOp, "LT", less); + COMPARE_FOLDER(CompareOp, "LE", less_equal); + COMPARE_FOLDER(CompareOp, "GT", greater); + COMPARE_FOLDER(CompareOp, "GE", greater_equal); +#undef COMPARE_FOLDER + + return {}; +} + +//===----------------------------------------------------------------------===// +// ScatterOp +//===----------------------------------------------------------------------===// + +llvm::SmallVector evaluateMhloRegion(Region& region, + ArrayRef inputs) { + if (region.getNumArguments() != inputs.size()) return {}; + + llvm::DenseMap values; + values.reserve(region.getNumArguments()); + for (auto it : llvm::zip(region.getArguments(), inputs)) { + values.try_emplace(std::get<0>(it), std::get<1>(it)); + } + + for (auto& op : region.getOps()) { + llvm::SmallVector inputs; + for (auto& operand : op.getOpOperands()) { + inputs.push_back(values.lookup(operand.get())); + } + if (isa(op)) return inputs; + + llvm::SmallVector results; + if (failed(op.fold(inputs, results))) return {}; + for (auto it : llvm::zip(op.getResults(), results)) { + if (!std::get<1>(it).is()) return {}; + values.insert({std::get<0>(it), std::get<1>(it).get()}); + } + } + return {}; +} + +OpFoldResult ScatterOp::fold(ArrayRef operands) { + auto base = operands[0].dyn_cast_or_null(); + auto index = operands[1].dyn_cast_or_null(); + auto update = operands[2].dyn_cast_or_null(); + if (!base || !index || !update) return {}; + + auto base_type = base.getType().dyn_cast(); + auto index_type = index.getType().dyn_cast(); + auto update_type = update.getType().dyn_cast(); + if (!base_type || !index_type || !update_type) return {}; + + // Add the virtual trailing dimension of size 1 if index_vector_dim equals to + // index_type.rank. + const int64_t index_vector_dim = + scatter_dimension_numbers().index_vector_dim().getInt(); + if (index_vector_dim == index_type.getRank()) { + auto index_shape = index_type.getShape().vec(); + index_shape.push_back(1); + index_type = + RankedTensorType::get(index_shape, index_type.getElementType()); + index = index.reshape(index_type).cast(); + } + + // Increment the multi-dimensional index vector based on the limits for each + // dimension specified by shape and returns false if the index rolled around + // with true otherwise. 
+ auto next_index = [](llvm::SmallVector& index, + llvm::ArrayRef shape) { + for (int64_t i = index.size() - 1; i >= 0; --i) { + ++index[i]; + if (index[i] < shape[i]) return true; + index[i] = 0; + } + return false; + }; + + // Iterate over all elements of the update tensor, then find the corresponding + // value in the indices tensor to determine which location we have to update + // in the base/result tensor. + llvm::SmallVector results(base.getValues()); + llvm::SmallVector update_index(update_type.getRank(), 0); + llvm::SmallVector index_index; + index_index.reserve(index_type.getRank()); + llvm::SmallVector base_index; + base_index.reserve(base_type.getRank()); + do { + // Compute the index for the slice of the indices tensor for this update + // value. + index_index.clear(); + if (index_vector_dim == 0) index_index.push_back(0); + for (int64_t i = 0; i < update_index.size(); ++i) { + if (llvm::count(scatter_dimension_numbers().update_window_dims(), i) == 0) + index_index.push_back(update_index[i]); + if (index_index.size() == index_vector_dim) index_index.push_back(0); + } + + // Compute the index for the given update value in the base tensor. + base_index.assign(base_type.getRank(), 0); + uint64_t index_count = index_type.getShape()[index_vector_dim]; + for (uint64_t i = 0; i < index_count; ++i) { + uint64_t operand_dim = scatter_dimension_numbers() + .scatter_dims_to_operand_dims() + .getValue({i}) + .getSExtValue(); + index_index[index_vector_dim] = i; + base_index[operand_dim] += + index.getValue(index_index).getSExtValue(); + } + uint64_t update_window_dim_index = 0; + for (uint64_t i = 0; i < base_index.size(); ++i) { + if (llvm::count(scatter_dimension_numbers().inserted_window_dims(), i)) + continue; + base_index[i] += + update_index[scatter_dimension_numbers() + .update_window_dims() + .getValue({update_window_dim_index}) + .getSExtValue()]; + update_window_dim_index++; + } + + // Compute the linear index for the index into the base tensor. + int64_t linear_base_index = 0; + int64_t linear_base_index_multiplyer = 1; + for (int64_t i = base_index.size() - 1; i >= 0; --i) { + // Out of bound index have backend specific behaviour so avoid folding it. + if (base_index[i] < 0 || base_index[i] >= base_type.getShape()[i]) + return {}; + linear_base_index += base_index[i] * linear_base_index_multiplyer; + linear_base_index_multiplyer *= base_type.getShape()[i]; + } + + // Evaluate update computation and update the value with the newly computed + // attribute in the base tensor. 
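A minimal standalone sketch of the index iteration and row-major linearization used by the scatter folder above; NextIndex and Linearize are illustrative stand-ins rather than code from this change:

#include <cassert>
#include <cstdint>
#include <vector>

// Advance a multi-dimensional index in row-major order; returns false once
// the index wraps around past the end of the shape.
bool NextIndex(std::vector<int64_t>& index, const std::vector<int64_t>& shape) {
  for (int64_t i = static_cast<int64_t>(index.size()) - 1; i >= 0; --i) {
    if (++index[i] < shape[i]) return true;
    index[i] = 0;
  }
  return false;
}

// Map a multi-dimensional index to a flat row-major offset.
int64_t Linearize(const std::vector<int64_t>& index,
                  const std::vector<int64_t>& shape) {
  int64_t linear = 0, stride = 1;
  for (int64_t i = static_cast<int64_t>(index.size()) - 1; i >= 0; --i) {
    linear += index[i] * stride;
    stride *= shape[i];
  }
  return linear;
}

int main() {
  std::vector<int64_t> shape = {2, 3};
  std::vector<int64_t> index = {0, 0};
  int64_t visited = 0;
  do {
    assert(Linearize(index, shape) == visited);
    ++visited;
  } while (NextIndex(index, shape));
  assert(visited == 6);
  return 0;
}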
+ auto lhs = DenseElementsAttr::get( + RankedTensorType::get({}, base_type.getElementType()), + results[linear_base_index]); + auto rhs = DenseElementsAttr::get( + RankedTensorType::get({}, base_type.getElementType()), + update.getValue(update_index)); + auto new_value = evaluateMhloRegion(update_computation(), {lhs, rhs}); + if (new_value.size() != 1 || !new_value[0]) return {}; + results[linear_base_index] = + new_value[0].cast().getValue({}); + } while (next_index(update_index, update_type.getShape())); + + return DenseElementsAttr::get(base_type, results); +} + +} // namespace mhlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.cc.inc" +namespace mlir { +namespace mhlo { + //===----------------------------------------------------------------------===// // mhlo Dialect Interfaces //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/xla/service/gpu/ir/dialect_registration.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops_base_structs.cc similarity index 75% rename from tensorflow/compiler/xla/service/gpu/ir/dialect_registration.cc rename to tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops_base_structs.cc index 2e3461951d8..90da1251ea0 100644 --- a/tensorflow/compiler/xla/service/gpu/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops_base_structs.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.h" -// Static initialization for GPU thunks op registration. -static mlir::DialectRegistration - xla_thunks_ops; +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_structs.cc.inc" diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc index cf8bd257d20..ca8c6a8d150 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/init.cc @@ -15,27 +15,15 @@ limitations under the License. #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/register.h" -// Static initialization for *HLO dialects registration. - -void mlir::mhlo::registerAllDialects() { - static bool init_once = []() { - registerDialect(); - registerDialect(); - registerDialect(); - return true; - }(); - (void)init_once; - - // Dependent dialects -} - void mlir::mhlo::registerAllMhloDialects(mlir::DialectRegistry ®istry) { // clang-format off registry.insert(); + mlir::lmhlo_gpu::LmhloGpuDialect>(); // clang-format on } diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_gpu_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_gpu_ops.cc new file mode 100644 index 00000000000..10c5c0c2f9d --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_gpu_ops.cc @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file defines the operations used in the LMHLO GPU dialect. + +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h" + +#include +#include +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Dialect.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" +#include "mlir/IR/Types.h" +#include "mlir/IR/Value.h" + +namespace mlir { +namespace lmhlo_gpu { + +LmhloGpuDialect::LmhloGpuDialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context, TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.cc.inc" + >(); +} + +// TODO(jurahul): Add verification for operand shapes and ranks. + +} // namespace lmhlo_gpu +} // namespace mlir + +#define GET_OP_CLASSES +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.cc.inc" diff --git a/tensorflow/python/util/tf32.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc similarity index 73% rename from tensorflow/python/util/tf32.cc rename to tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc index 7dece6ccdae..cd2cfc58836 100644 --- a/tensorflow/python/util/tf32.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "pybind11/pybind11.h" -#include "tensorflow/core/platform/tf32_utils.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h" -PYBIND11_MODULE(_pywrap_tf32_execution, m) { - m.def("allow", &tensorflow::allow_tf32_execution); - m.def("is_allowed", &tensorflow::tf32_execution_allowed); -} +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc.inc" diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc index 81407c89204..4524cf3ec1f 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops.cc @@ -46,7 +46,6 @@ limitations under the License. 
#include "mlir/IR/Value.h" namespace mlir { -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops_structs.cc.inc" namespace lmhlo { LmhloDialect::LmhloDialect(MLIRContext *context) @@ -159,9 +158,15 @@ static LogicalResult Verify(ReshapeMemRefCastOp op) { return success(); } +} // namespace lmhlo +} // namespace mlir + #define GET_OP_CLASSES #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.cc.inc" +namespace mlir { +namespace lmhlo { + // TODO(cheshire): Support folding, reuse code from hlo_ops.cc. void FusionOp::build(OpBuilder &builder, OperationState &result, diff --git a/tensorflow/compiler/mlir/lite/ir/dialect_registration.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops_structs.cc similarity index 70% rename from tensorflow/compiler/mlir/lite/ir/dialect_registration.cc rename to tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops_structs.cc index fae20437811..83dd4e62b47 100644 --- a/tensorflow/compiler/mlir/lite/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/lhlo_ops_structs.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,5 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" - -// Static initialization for TensorFlow Lite op registration. -static mlir::DialectRegistration tfl_ops; +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.cc.inc" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops_structs.h" diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt index bb9f98d32d3..354913264bb 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt @@ -25,6 +25,10 @@ set(LLVM_TARGET_DEFINITIONS legalize_to_standard_patterns.td) mlir_tablegen(generated_legalize_to_standard.inc -gen-rewriters) add_public_tablegen_target(MLIRMhloLegalizeToStandardIncGen) +set(LLVM_TARGET_DEFINITIONS chlo_legalize_to_hlo_patterns.td) +mlir_tablegen(generated_chlo_legalize_to_hlo.inc -gen-rewriters) +add_public_tablegen_target(MLIRChloLegalizeToHloIncGen) + add_mlir_library(ChloPasses chlo_legalize_to_hlo.cc @@ -32,6 +36,7 @@ add_mlir_library(ChloPasses DEPENDS MLIRhlo_opsIncGen + MLIRChloLegalizeToHloIncGen LINK_COMPONENTS Core @@ -44,7 +49,7 @@ add_mlir_library(ChloPasses add_mlir_library(MhloPasses legalize_gather_to_torch_index_select.cc - legalize_tanh_to_approximation.cc + legalize_trigonometric_to_approximation.cc lower_complex.cc lower_complex_patterns.td lower_general_dot.cc @@ -93,6 +98,7 @@ add_mlir_library(MhloToLhloConversion add_mlir_library(MhloToStandard legalize_control_flow.cc legalize_to_standard.cc + mhlo_control_flow_to_scf.cc DEPENDS MLIRhlo_opsIncGen @@ -124,7 +130,6 @@ add_mlir_library(MhloLhloToLinalg ) add_mlir_library(LmhloPasses - lhlo_copy_removal.cc lhlo_fuse_linalg.cc lhlo_legalize_to_affine.cc lhlo_legalize_to_gpu.cc diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc index 
c2db4880632..42d6d70b524 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" @@ -31,6 +33,39 @@ namespace mlir { namespace chlo { namespace { +struct ConvertConstantLikeOp : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult matchAndRewrite( + ConstantLikeOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto result_ty = op.getType().cast(); + + // Unranked uses are not supported. Consider `transform-unranked-hlo`. + if (!result_ty.hasRank()) return failure(); + + // Lower to MHLO constant if statically shaped. + if (result_ty.hasStaticShape()) { + rewriter.replaceOpWithNewOp( + op, DenseElementsAttr::get(result_ty, op.value())); + return success(); + } + + // Lower to broadcasted constant. + ConstantLikeOp::Adaptor transformed(operands); + auto loc = op.getLoc(); + Type extent_tensor_type = shape::getExtentTensorType(op.getContext()); + Value constant = rewriter.create(loc, op.value()); + Value uncasted_shape = rewriter.create( + loc, extent_tensor_type, transformed.operand()); + Type shape_ty = + RankedTensorType::get({result_ty.getRank()}, rewriter.getIndexType()); + Value shape = rewriter.create(loc, shape_ty, uncasted_shape); + rewriter.replaceOpWithNewOp( + op, result_ty, constant, shape, rewriter.getI64TensorAttr({})); + return success(); + } +}; + // Converts binary ops that statically are determined to not broadcast directly // to the corresponding mhlo non-broadcasting op. 
template @@ -248,7 +283,7 @@ struct ConvertUnrankedDynamicBroadcastBinaryOp auto if_op = rewriter.create( loc, result_type, IsScalarTensor(rewriter, op, lhs), true); OpBuilder if_lhs_scalar_builder = if_op.getThenBodyBuilder(); - Value reshaped_lhs = if_lhs_scalar_builder.create( + Value reshaped_lhs = if_lhs_scalar_builder.create( loc, RankedTensorType::get({}, lhs_type.getElementType()), lhs); Value if_lhs_scalar_result = if_lhs_scalar_builder.create( loc, ArrayRef{result_type}, ArrayRef{reshaped_lhs, rhs}, @@ -265,7 +300,7 @@ struct ConvertUnrankedDynamicBroadcastBinaryOp else_lhs_scalar_builder.create(loc, if_rhs_scalar_op.getResult(0)); OpBuilder if_rhs_scalar_builder = if_rhs_scalar_op.getThenBodyBuilder(); - Value reshaped_rhs = if_rhs_scalar_builder.create( + Value reshaped_rhs = if_rhs_scalar_builder.create( loc, RankedTensorType::get({}, lhs_type.getElementType()), rhs); Value if_rhs_scalar_result = if_rhs_scalar_builder.create( loc, ArrayRef{result_type}, ArrayRef{lhs, reshaped_rhs}, @@ -338,30 +373,37 @@ struct ConvertUnrankedDynamicBroadcastBinaryOp Value lhs_shape = if_builder.create(loc, lhs); Value rhs_shape = if_builder.create(loc, rhs); SmallVector ranked_shape(targeted_rank, 1); - auto extent_tensor_type = + auto unknown_rank_extent_tensor_type = RankedTensorType::get( + {RankedTensorType::kDynamicSize}, builder.getIndexType()); + auto known_rank_extent_tensor_type = RankedTensorType::get({targeted_rank}, builder.getIndexType()); auto reshaped_type = RankedTensorType::get( llvm::SmallVector(targeted_rank, RankedTensorType::kDynamicSize), lhs.getType().template dyn_cast().getElementType()); Value ranked_shape_val = if_builder.create( - loc, extent_tensor_type, - mlir::DenseIntElementsAttr::get(extent_tensor_type, ranked_shape)); - // TODO(tpopp): Return extent tensors when possible to signal that this is a - // guaranteed safe broadcast by construction. + loc, known_rank_extent_tensor_type, + mlir::DenseIntElementsAttr::get(known_rank_extent_tensor_type, + ranked_shape)); Value extended_lhs = if_builder.create( - loc, extent_tensor_type, lhs_shape, ranked_shape_val, nullptr); + loc, unknown_rank_extent_tensor_type, lhs_shape, ranked_shape_val, + nullptr); + Value extended_lhs_casted = if_builder.create( + loc, known_rank_extent_tensor_type, extended_lhs); Value extended_rhs = if_builder.create( - loc, extent_tensor_type, rhs_shape, ranked_shape_val, nullptr); + loc, unknown_rank_extent_tensor_type, rhs_shape, ranked_shape_val, + nullptr); + Value extended_rhs_casted = if_builder.create( + loc, known_rank_extent_tensor_type, extended_rhs); // 1. Reshape operands to the given rank (with the same number of elements) // 2. Compute the ranked-broadcasted ChloOp (which will assert that the ops // can be broadcasted and do the actual broadcasting) // 3. 
Type erase the output back to unranked Value reshaped_lhs = if_builder.create( - loc, reshaped_type, lhs, extended_lhs); + loc, reshaped_type, lhs, extended_lhs_casted); Value reshaped_rhs = if_builder.create( - loc, reshaped_type, rhs, extended_rhs); + loc, reshaped_type, rhs, extended_rhs_casted); Value result = if_builder.create( loc, ArrayRef{reshaped_type}, ArrayRef{reshaped_lhs, reshaped_rhs}, op.getAttrs()); @@ -469,10 +511,13 @@ struct HloCompareAdaptor { } }; +#include "generated_chlo_legalize_to_hlo.inc" } // namespace void PopulateLegalizeChloToHloPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { + populateWithGenerated(context, *patterns); + // Instantiate conversion templates for conforming binary elementwise ops // that do not have different dtypes between operands and results and do // not have special attributes that need to be preserved. @@ -502,6 +547,9 @@ void PopulateLegalizeChloToHloPatterns(MLIRContext *context, context, patterns); PopulateForBinaryOp( context, patterns); + + // Other patterns. + patterns->insert(context); } } // namespace chlo diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc index 50cd6df5c99..d2f415d91f9 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_pass.cc @@ -27,16 +27,21 @@ namespace mhlo { namespace { -struct TestChloLegalizeToHloPass - : public PassWrapper { +struct ChloLegalizeToHloPass + : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnFunction() override { ConversionTarget conversionTarget(getContext()); OwningRewritePatternList conversionPatterns; - conversionTarget.addIllegalDialect(); + // Consider the mhlo dialect legal for tests. conversionTarget.addLegalDialect(); - // The conversion uses helpers from the Standard dialect. + + // The conversion uses helpers from the standard dialect. conversionTarget.addLegalDialect(); conversionTarget.addLegalDialect(); conversionTarget.addLegalDialect(); @@ -52,8 +57,8 @@ struct TestChloLegalizeToHloPass } // namespace -std::unique_ptr createTestChloLegalizeToHloPass() { - return std::make_unique(); +std::unique_ptr createChloLegalizeToHloPass() { + return std::make_unique(); } } // namespace mhlo diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td new file mode 100644 index 00000000000..a48abb6190c --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/chlo_legalize_to_hlo_patterns.td @@ -0,0 +1,107 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This is the legalization pattern definition file for CHLO to MHLO. + +include "mlir/IR/OpBase.td" +include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.td" +include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.td" + +//===----------------------------------------------------------------------===// +// Unary op patterns. +//===----------------------------------------------------------------------===// + +// Expand acos to MHLO dialect as follows: +// acos(x) = 2 * atan2(sqrt(1 - x^2), (1 + x)) if x != -1 +// = pi if x == -1 +def : Pat<(HLOClient_AcosOp $input), + (HLO_SelectOp + (HLO_CompareOp + $input, + (HLO_ConstantLike<"-1"> $input), + HLO_COMPARISON_DIRECTION_NE + ), + (HLO_MulOp + (HLO_ConstantLike<"2"> $input), + (HLO_Atan2Op + (HLO_SqrtOp + (HLO_SubOp + (HLO_ConstantLike<"1"> $input), + (HLO_MulOp $input, $input) + ) + ), + (HLO_AddOp + (HLO_ConstantLike<"1"> $input), + $input + ) + ) + ), + (HLO_ConstantLike<"M_PI"> $input) + )>; + +// Express `atan` as +// atan(x) = atan2(x, 1) +def : Pat<(HLOClient_AtanOp $input), + (HLO_Atan2Op + $input, + (HLO_ConstantLike<"1"> $input) + )>; + +// Express `sinh` as +// sinh(x) = (e^x - e^-x) / 2 if |x| < 1 +// = e^(x + log(1/2)) - e^(-x + log(1/2)) otherwise. +def : Pat<(HLOClient_SinhOp $input), + (HLO_SelectOp + (HLO_CompareOp + (HLO_AbsOp $input), + (HLO_ConstantLike<"1"> $input), + HLO_COMPARISON_DIRECTION_LT + ), + (HLO_DivOp + (HLO_SubOp + (HLO_ExpOp $input), + (HLO_ExpOp + (HLO_NegOp $input) + ) + ), + (HLO_ConstantLike<"2"> $input) + ), + (HLO_SubOp + (HLO_ExpOp + (HLO_AddOp + $input, + (HLO_LogOp + (HLO_ConstantLike<"0.5"> $input) + ) + ) + ), + (HLO_ExpOp + (HLO_SubOp + (HLO_LogOp + (HLO_ConstantLike<"0.5"> $input) + ), + $input + ) + ) + ) + )>; + +// Express tan in MHLO dialect as +// tan(x) = sin(x) / cos(x). +def : Pat<(HLOClient_TanOp $input), + (HLO_DivOp + (HLO_SinOp $input), + (HLO_CosOp $input) + )>; diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc index a8c3ad17ebb..7b401d56e8c 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc @@ -20,6 +20,8 @@ limitations under the License. #include "mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h" #include "mlir-hlo/Dialect/mhlo/transforms/passes.h" #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/Shape/IR/Shape.h" +#include "mlir/Dialect/Shape/Transforms/Passes.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" @@ -32,7 +34,7 @@ limitations under the License. 
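For reference, the expansions encoded by the new CHLO-to-MHLO patterns above amount to the following identities (restating the comments in the .td file in math notation):

\[
\operatorname{acos}(x) =
\begin{cases}
2\,\operatorname{atan2}\bigl(\sqrt{1 - x^{2}},\; 1 + x\bigr), & x \neq -1,\\
\pi, & x = -1,
\end{cases}
\qquad
\operatorname{atan}(x) = \operatorname{atan2}(x,\, 1),
\]
\[
\sinh(x) =
\begin{cases}
\dfrac{e^{x} - e^{-x}}{2}, & |x| < 1,\\[4pt]
e^{\,x + \log\frac{1}{2}} - e^{\,-x + \log\frac{1}{2}}, & \text{otherwise},
\end{cases}
\qquad
\tan(x) = \dfrac{\sin(x)}{\cos(x)}.
\]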
#include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/BufferPlacement.h" +#include "mlir/Transforms/Bufferize.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { @@ -40,12 +42,12 @@ namespace mhlo { namespace { template -using BaseOpConversion = BufferAssignmentOpConversionPattern; +using BaseOpConversion = BufferizeOpConversionPattern; Value InsertDynamicAllocAndDealloc(Location loc, Value result, Value shape_operand, ConversionPatternRewriter* rewriter) { - auto result_type = result.getType().dyn_cast(); + auto result_type = result.getType().dyn_cast(); if (!result_type) { result.getDefiningOp()->emitOpError() << "tensor to buffer conversion expects ranked results"; @@ -53,17 +55,13 @@ Value InsertDynamicAllocAndDealloc(Location loc, Value result, auto memref_type = MemRefType::get(result_type.getShape(), result_type.getElementType()); - Operation* op = result.getDefiningOp(); - // Extract the required element out of the vector. SmallVector dynamic_operands; for (auto shape_element : llvm::enumerate(result_type.getShape())) { if (shape_element.value() != ShapedType::kDynamicSize) continue; - Value index = rewriter->create( - loc, rewriter->getIntegerAttr(rewriter->getIndexType(), - shape_element.index())); - Value alloc_operand = rewriter->create(loc, shape_operand, - ValueRange{index}); + Value index = rewriter->create(loc, shape_element.index()); + Value alloc_operand = + rewriter->create(loc, shape_operand, index); if (!alloc_operand.getType().isIndex()) { alloc_operand = rewriter->create(loc, alloc_operand, rewriter->getIndexType()); @@ -71,16 +69,12 @@ Value InsertDynamicAllocAndDealloc(Location loc, Value result, dynamic_operands.push_back(alloc_operand); } - // Insert in front of op to ensure sizes are available. 
- OpBuilder allocBuilder(op); - auto alloc = allocBuilder.create(loc, memref_type, dynamic_operands); - return alloc; + return rewriter->create(loc, memref_type, dynamic_operands); } Value InsertAlloc(Location loc, OpResult result, - BufferAssignmentPlacer* bufferAssignment, ConversionPatternRewriter* rewriter) { - auto result_type = result.getType().dyn_cast(); + auto result_type = result.getType().dyn_cast(); if (!result_type || !result_type.hasStaticShape()) { result.getDefiningOp()->emitOpError() << "tensor to buffer conversion expects statically shaped results"; @@ -88,8 +82,7 @@ Value InsertAlloc(Location loc, OpResult result, auto memref_type = MemRefType::get(result_type.getShape(), result_type.getElementType()); OpBuilder::InsertionGuard guard(*rewriter); - rewriter->restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result)); + rewriter->setInsertionPoint(result.getDefiningOp()); auto alloc = rewriter->create(loc, memref_type); return alloc; } @@ -111,8 +104,52 @@ class HloToLhloOpConverter : public BaseOpConversion { return failure(); } if (resultType.hasStaticShape()) { - buffer_args.push_back(InsertAlloc(op->getLoc(), result.value(), - this->bufferAssignment, &rewriter)); + buffer_args.push_back( + InsertAlloc(op->getLoc(), result.value(), &rewriter)); + } else { + auto shape_type_op = dyn_cast(op); + if (!shape_type_op) return failure(); + + SmallVector results_shape; + auto status = + shape_type_op.reifyReturnTypeShapes(rewriter, results_shape); + if (failed(status)) return failure(); + buffer_args.push_back(InsertDynamicAllocAndDealloc( + op->getLoc(), result.value(), results_shape.front(), &rewriter)); + } + } + rewriter.create>(op->getLoc(), llvm::None, + buffer_args, op->getAttrs()); + rewriter.replaceOp( + op, llvm::makeArrayRef(buffer_args).drop_front(operands.size())); + return success(); + } +}; + +// This specialization exists so that LMHLO's Dot can be given a specific set of +// dimension numbers, when lowering from MHLO's Dot, which does not have +// dimension numbers (it uses DotGeneral for this generalized notion of dot +// products). When these two dialects are in sync with respect to the +// Dot/DotGeneral issue, this specialization should be deleted. +template <> +class HloToLhloOpConverter : public BaseOpConversion { + public: + using BaseOpConversion::BaseOpConversion; + LogicalResult matchAndRewrite( + mhlo::DotOp hloOp, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + Operation* op = hloOp.getOperation(); + const auto& original_results = op->getResults(); + SmallVector buffer_args(operands.begin(), operands.end()); + for (auto result : llvm::enumerate(original_results)) { + RankedTensorType resultType = + result.value().getType().dyn_cast(); + if (!resultType) { + return failure(); + } + if (resultType.hasStaticShape()) { + buffer_args.push_back( + InsertAlloc(op->getLoc(), result.value(), &rewriter)); } else { SmallVector results_shape; auto shape_type_op = dyn_cast(op); @@ -124,8 +161,20 @@ class HloToLhloOpConverter : public BaseOpConversion { op->getLoc(), result.value(), results_shape.front(), &rewriter)); } } - rewriter.create>(op->getLoc(), llvm::None, - buffer_args, op->getAttrs()); + + // TODO(silvasean): Move this helper to MLIR core. 
+ auto make_elements_attr = [&rewriter](ArrayRef integers) { + auto type = RankedTensorType::get({static_cast(integers.size())}, + rewriter.getIntegerType(64)); + return DenseIntElementsAttr::get(type, integers); + }; + auto dotOp = rewriter.create(op->getLoc(), llvm::None, + buffer_args, op->getAttrs()); + // MHLO's Dot uses rank-2 operands, of the form ([N, M], [M, O]) -> [N, O]. + auto dimension_numbers = mhlo::DotDimensionNumbers::get( + make_elements_attr({}), make_elements_attr({}), make_elements_attr({1}), + make_elements_attr({0}), rewriter.getContext()); + dotOp.dot_dimension_numbersAttr(dimension_numbers); rewriter.replaceOp(op, ArrayRef(buffer_args).slice(operands.size())); return success(); } @@ -241,6 +290,43 @@ struct HloToLhloDynamicReshapeConverter } }; +struct HloToLhloDotGeneralOpConverter + : public BaseOpConversion { + using BaseOpConversion::BaseOpConversion; + LogicalResult matchAndRewrite( + mhlo::DotGeneralOp dotGeneralOp, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + Operation* op = dotGeneralOp.getOperation(); + + if (op->getResults().empty()) return failure(); + OpResult result = op->getResults()[0]; + RankedTensorType resultType = result.getType().dyn_cast(); + if (!resultType) return failure(); + + // The third buffer argument will be filled with what used to be the return + // type of the DotGeneral. + if (operands.size() != 2) return failure(); + std::array bufferArgs = {operands[0], operands[1], {}}; + + if (resultType.hasStaticShape()) { + bufferArgs[2] = InsertAlloc(op->getLoc(), result, &rewriter); + } else { + SmallVector results_shape; + auto shape_type_op = dyn_cast(op); + if (failed(shape_type_op.reifyReturnTypeShapes(rewriter, results_shape))) + return failure(); + + bufferArgs[2] = InsertDynamicAllocAndDealloc( + op->getLoc(), result, results_shape.front(), &rewriter); + } + + rewriter.create(op->getLoc(), llvm::None, bufferArgs, + op->getAttrs()); + rewriter.replaceOp(op, bufferArgs[2]); + return success(); + } +}; + struct HloToLhloReduceOpConverter : public BaseOpConversion { public: using BaseOpConversion::BaseOpConversion; @@ -259,8 +345,7 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { const auto& original_results = op.getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : original_results) { - buffer_args.push_back( - InsertAlloc(loc, result, this->bufferAssignment, &rewriter)); + buffer_args.push_back(InsertAlloc(loc, result, &rewriter)); } auto new_op = rewriter.create(loc, llvm::None, buffer_args, op.getAttrs()); @@ -290,11 +375,36 @@ struct HloToLhloReduceOpConverter : public BaseOpConversion { } }; -// Legalize mhlo.return to a lmhlo.copy and lmhlo.terminator. This functionality -// is provided by mlir buffer assignment, so use the pattern from there. -// TODO(DFKI): Move this out of detail. -using HloToLhloReturnOpConverter = detail::BufferAssignmentReturnOpConverter< - mhlo::ReturnOp, lmhlo::TerminatorOp, lmhlo::CopyOp, false>; +// Legalize mhlo.return to a lmhlo.copy and lmhlo.terminator. 
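A note on the two dot lowerings above: mhlo.dot is the plain rank-2 product with implicit dimension semantics, so the specialization attaches explicit dimension numbers when emitting lmhlo.dot (empty batching dimensions, lhs contracting dimension 1, rhs contracting dimension 0). In math terms this is the ordinary matrix product

\[
([N, M],\, [M, O]) \to [N, O], \qquad C_{i,j} = \sum_{k=0}^{M-1} A_{i,k}\, B_{k,j},
\]

which is also the only case the affine lowering of lmhlo.dot accepts later in this change.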
+struct HloToLhloReturnOpConverter : public BaseOpConversion { + public: + using BaseOpConversion::BaseOpConversion; + + LogicalResult matchAndRewrite( + mhlo::ReturnOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + auto loc = op.getLoc(); + auto& entry_block = op.getParentRegion()->front(); + auto num_arguments = entry_block.getNumArguments(); + if (operands.size() > num_arguments) { + return op.emitError( + "The number of operands that need Copy operations is more " + "than the number of target function arguments."); + } + + // The index of the first output block argument. + auto dest_arg_idx = num_arguments - operands.size(); + + // Create a lmhlo.copy for each operand of mhlo.return. + for (Value operand : operands) { + rewriter.create(loc, operand, + entry_block.getArgument(dest_arg_idx)); + ++dest_arg_idx; + } + rewriter.replaceOpWithNewOp(op); + return success(); + } +}; class HloToLhloTensorLoadOpConverter : public BaseOpConversion { @@ -388,6 +498,10 @@ class HloToLhloTensorStoreOpConverter struct HloLegalizeToLhlo : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: HloLegalizeToLhlo() = default; HloLegalizeToLhlo(const HloLegalizeToLhlo& o) { @@ -410,7 +524,7 @@ struct HloLegalizeToLhlo target.addLegalOp(); target.addIllegalDialect(); - BufferAssignmentTypeConverter converter; + BufferizeTypeConverter converter; auto isMemRefType = [](Type type) { return type.isa(); }; target.addDynamicallyLegalOp([&](FuncOp op) { auto inputs = op.getType().getInputs(); @@ -427,29 +541,25 @@ struct HloLegalizeToLhlo return std::all_of(op.operand_type_begin(), op.operand_type_end(), isMemRefType); }); - - auto module = getOperation(); - WalkResult result = module.walk([&](FuncOp func) -> WalkResult { - BufferAssignmentPlacer bufferAssignment(func); - OwningRewritePatternList patterns; - populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, - &converter, &patterns); - if (results_escape_function) { - populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, lmhlo::CopyOp, - /*allowMemrefFunctionResults=*/true>(&context, &bufferAssignment, - &converter, &patterns); - } else { - populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, lmhlo::CopyOp, - /*allowMemrefFunctionResults=*/false>(&context, &bufferAssignment, - &converter, &patterns); - } - return applyPartialConversion(func, target, patterns); + target.addDynamicallyLegalOp([&](shape::AssumingOp op) { + return std::all_of(op.result_type_begin(), op.result_type_end(), + isMemRefType); }); - if (result.wasInterrupted()) { + + auto kind = results_escape_function + ? 
BufferizeTypeConverter::KeepAsFunctionResult + : BufferizeTypeConverter::AppendToArgumentsList; + converter.setResultConversionKind( + kind); + converter.setResultConversionKind(kind); + + populateHLOToLHLOConversionPattern(&context, &converter, &patterns); + populateWithBufferizeOpConversionPatterns( + &context, converter, patterns); + populateShapeTypeConversionPatterns(&context, converter, patterns); + if (failed(applyPartialConversion(getOperation(), target, patterns))) signalPassFailure(); - } } private: @@ -461,16 +571,18 @@ struct HloLegalizeToLhlo }; } // namespace -void populateHLOToLHLOConversionPattern( - MLIRContext* context, BufferAssignmentPlacer* bufferAssignment, - TypeConverter* converter, OwningRewritePatternList* patterns) { +void populateHLOToLHLOConversionPattern(MLIRContext* context, + BufferizeTypeConverter* converter, + OwningRewritePatternList* patterns) { // clang-format off patterns->insert< + HloToLhloDotGeneralOpConverter, HloToLhloDynamicBroadcastInDimOpConverter, HloToLhloDynamicReshapeConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -480,31 +592,38 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloReduceOpConverter, HloToLhloReturnOpConverter, HloToLhloTensorLoadOpConverter, HloToLhloTensorStoreOpConverter - >(context, bufferAssignment, converter); + >(context, *converter); // clang-format on } diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc index b6e23a6b131..adf2a398a00 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_control_flow.cc @@ -32,8 +32,6 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" #include "mlir/Support/LogicalResult.h" -using mlir::PassRegistration; - namespace mlir { namespace mhlo { namespace { diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc deleted file mode 100644 index 57c494f536b..00000000000 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_tanh_to_approximation.cc +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements logic for lowering the tanh standard ops to an -// approximation. - -#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -namespace mhlo { -namespace { - -/// Emits the fast tanh approximation that is also used by XLA. -Value EmitTanhApproximation(Value input, Location loc, - PatternRewriter &rewriter) { - // For small values of x, we can approximate tanh(x)=x. For extremely small - // values of x (|x| < 1e-37), the other approximation would evaluate - // tanh(x) = 0. - constexpr float kCanUseApprox = 0.0004; - Value abs_value = rewriter.create(loc, input); - Value can_use_approx = - rewriter.create(loc, rewriter.getF32FloatAttr(kCanUseApprox)); - Value return_input = rewriter.create(loc, CmpFPredicate::OLT, - abs_value, can_use_approx); - // Clamp the input to [-c, c]. - Value max_clamp = rewriter.create( - loc, rewriter.getF32FloatAttr(7.90531110763549805f)); - Value smaller_than_max = - rewriter.create(loc, CmpFPredicate::ULE, input, max_clamp); - Value clamped_half = - rewriter.create(loc, smaller_than_max, input, max_clamp); - Value min_clamp = rewriter.create( - loc, rewriter.getF32FloatAttr(-7.90531110763549805f)); - Value larger_than_min = - rewriter.create(loc, CmpFPredicate::UGE, clamped_half, min_clamp); - Value input_clamped = - rewriter.create(loc, larger_than_min, clamped_half, min_clamp); - - static constexpr std::array numerator_coeffs{ - -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, - 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, - 4.89352455891786e-03f}; - - static constexpr std::array denominator_coeffs{ - 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, - 4.89352518554385e-03f}; - - Value input_squared = - rewriter.create(loc, input_clamped, input_clamped); - Value numerator = rewriter.create( - loc, rewriter.getF32FloatAttr(numerator_coeffs[0])); - for (int i = 1; i < numerator_coeffs.size(); i++) { - numerator = rewriter.create( - loc, rewriter.create(loc, input_squared, numerator), - rewriter.create( - loc, rewriter.getF32FloatAttr(numerator_coeffs[i]))); - } - - numerator = rewriter.create(loc, input_clamped, numerator); - - Value denominator = rewriter.create( - loc, rewriter.getF32FloatAttr(denominator_coeffs[0])); - for (int i = 1; i < denominator_coeffs.size(); i++) { - denominator = rewriter.create( - loc, rewriter.create(loc, input_squared, denominator), - rewriter.create( - loc, rewriter.getF32FloatAttr(denominator_coeffs[i]))); - } - - Value approx = rewriter.create(loc, numerator, denominator); - - return rewriter.create(loc, return_input, input, approx); -} - -class ApproximateTanhLowering : public OpRewritePattern { - public: - explicit ApproximateTanhLowering(MLIRContext *ctx) - : OpRewritePattern(ctx, 100) {} - - LogicalResult matchAndRewrite(TanhOp tanhOp, - 
PatternRewriter &rewriter) const override { - Type operand_type = tanhOp.getType(); - - if (operand_type.isF64()) { - // Similar to XLA, do not rewrite f64 as precision might matter. - return failure(); - } - - Location loc = tanhOp.getLoc(); - Value input = tanhOp.operand(); - if (operand_type.isF16()) { - input = rewriter.create(loc, input, rewriter.getF32Type()); - } - - // If we still do not have f32, fail. - if (!input.getType().isF32()) { - return failure(); - } - - Value result = EmitTanhApproximation(input, loc, rewriter); - - // Truncate back if needed. - if (operand_type.isF16()) { - result = rewriter.create(loc, result, rewriter.getF16Type()); - } - - rewriter.replaceOp(tanhOp, {result}); - return success(); - } -}; - -struct LegalizeTanhToApproximationPass - : public PassWrapper { - /// Perform the lowering of standard dialect operations to approximations. - void runOnFunction() override { - OwningRewritePatternList patterns; - PopulateTanhToApproximationPatterns(&getContext(), &patterns); - applyPatternsAndFoldGreedily(getFunction(), patterns); - } -}; - -} // anonymous namespace - -std::unique_ptr> -createLegalizeTanhToApproximationPass() { - return std::make_unique(); -} - -void PopulateTanhToApproximationPatterns(mlir::MLIRContext *context, - OwningRewritePatternList *patterns) { - patterns->insert(context); -} - -} // namespace mhlo -} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc index 033021c36ac..b64d66200cf 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "llvm/ADT/STLExtras.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" @@ -32,8 +33,10 @@ limitations under the License. 
#include "mlir/IR/Location.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +#include "mlir/IR/OperationSupport.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -75,69 +78,69 @@ class PointwiseToLinalgConverter : public OpConversionPattern { OpTy op, ArrayRef args, ConversionPatternRewriter& rewriter) const final { auto loc = op.getLoc(); - auto argType = - op.getOperation()->getOperand(0).getType().template cast(); - if (!argType.hasRank()) { - emitError(loc, "lhlo to linalg conversion expects ranked args"); - return failure(); - } - auto elemTy = argType.getElementType(); - if (!elemTy.isSignlessIntOrFloat() && !elemTy.template isa()) { - return failure(); - } + ShapedType t0 = args[0].getType().template dyn_cast(); + if (!t0) return failure(); + + unsigned nloops = t0.getRank(); + auto fail = [&](ShapedType t) { + return !t || !t.hasRank() || t.getRank() != nloops || + !(t.getElementType().isSignlessIntOrFloat() || + t.getElementType().isa()); + }; + if (llvm::any_of(args, + [&](Value v) { + return fail(v.getType().dyn_cast()); + }) || + llvm::any_of(op.getOperation()->getResultTypes(), + [&](Type t) { return fail(t.dyn_cast()); })) + return emitError(loc, + "lhlo to linalg conversion expects ranked args of " + "signless int, float or complex element type with ") + << nloops << " parallel iterators: " << *(op.getOperation()); // Construct the indexing maps needed for linalg.generic ops. - SmallVector indexing_maps; SmallVector bodyArgTypes, bodyResultTypes, opResultTypes; // This doesnt account for implicit broadcast, but the working assumption - // here is that are broadcasts have been made explicit. - unsigned nloops = argType.getRank(); + // in HLO/LHLO is that are broadcasts are made explicit. if (isLHLO && !nloops) return failure(); - int operandCount = (isLHLO ? args.size() - 1 : args.size()); - auto verifyArgOrResultType = [&](Value val) -> ShapedType { - auto shapedType = val.getType().dyn_cast(); - if (!shapedType || - (!shapedType.isa() && - !shapedType.isa()) || - shapedType.getRank() != nloops) - return nullptr; - indexing_maps.emplace_back( - nloops ? rewriter.getMultiDimIdentityMap(nloops) - : AffineMap::get(nloops, 0, rewriter.getContext())); - return shapedType; - }; - for (const auto& arg : llvm::enumerate(args)) { - auto shapedType = verifyArgOrResultType(arg.value()); - if (!shapedType) return failure(); - auto& result_or_body_arg = - arg.index() < operandCount ? bodyArgTypes : bodyResultTypes; - result_or_body_arg.emplace_back(shapedType.getElementType()); - } + int numInputs = (isLHLO ? args.size() - 1 : args.size()); + + ValueRange inputs(args.take_front(numInputs)); + for (Value in : inputs) + bodyArgTypes.emplace_back(getElementTypeOrSelf(in.getType())); + + ValueRange outputBuffers(args.take_back(args.size() - numInputs)); + for (Value out : outputBuffers) + bodyResultTypes.emplace_back(getElementTypeOrSelf(out.getType())); + if (!isLHLO) { // HLO operations have return as tensor types. 
assert(bodyResultTypes.empty() && "When lowering HLO ops result can't be part of arguments"); Value result = op.getOperation()->getResult(0); - auto shapedType = verifyArgOrResultType(result); - if (!shapedType) return failure(); - bodyResultTypes.push_back(shapedType.getElementType()); - opResultTypes.push_back(shapedType); + bodyResultTypes.push_back(getElementTypeOrSelf(result)); + opResultTypes.push_back(result.getType()); } - int64_t args_count = bodyArgTypes.size(); - int64_t results_count = bodyResultTypes.size(); + AffineMap commonIndexingMap = + nloops ? rewriter.getMultiDimIdentityMap(nloops) + : AffineMap::get(nloops, 0, rewriter.getContext()); + SmallVector indexing_maps(args.size() + (isLHLO ? 0 : 1), + commonIndexingMap); + auto linalgOp = rewriter.create( - loc, opResultTypes, args, args_count, results_count, indexing_maps, + loc, opResultTypes, inputs, outputBuffers, + /*initTensors=*/ValueRange{}, indexing_maps, GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { // TODO(ravishankarm) : For now use the method in lmhlo namespace. // That method needs to be moved out of there. Value opResult = lmhlo::HloOpToStdScalarOp::map( op, bodyResultTypes, - llvm::to_vector<2>(args.take_front(args_count)), &rewriter); + llvm::to_vector<2>(args.take_front(inputs.size())), &rewriter); nestedBuilder.create(loc, opResult); }); rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); @@ -189,7 +192,7 @@ struct ConvToLinalgConverter : public OpConversionPattern { lmhlo::ConvOp op, ArrayRef args, ConversionPatternRewriter& rewriter) const final { // Check validity of dimension information. - if (const lmhlo::ConvDimensionNumbers& dimensionNumbers = + if (const mhlo::ConvDimensionNumbers& dimensionNumbers = op.dimension_numbers()) { const int inputSpatialRank = llvm::size(dimensionNumbers.input_spatial_dimensions()); @@ -299,12 +302,15 @@ class DataMovementOpConverter : public OpConversionPattern { auto nloops = resultType.getRank(); auto loc = op.getLoc(); auto linalgOp = rewriter.create( - loc, isLHLO ? ArrayRef{} : resultType, args, /*argsIn=*/1, - /*argsOut=*/1, indexing_maps, GetNParallelLoopsAttrs(nloops), + loc, + /*resultTensorTypes=*/isLHLO ? ArrayRef{} : resultType, + /*inputs=*/args.front(), + /*outputBuffers=*/isLHLO ? 
ValueRange{args.back()} : ValueRange{}, + /*initTensor=*/ValueRange{}, indexing_maps, + GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { nestedBuilder.create(loc, *args.begin()); }); - rewriter.replaceOp(op, linalgOp.getOperation()->getResults()); return success(); } @@ -420,8 +426,8 @@ class LhloBroadcastInDimConverter Value val = rewriter.create(loc, operand, llvm::makeArrayRef({zero})); rewriter.create( - loc, llvm::None, llvm::makeArrayRef(operand_adaptor.output()), - /*argsIn=*/0, /*argsOut=*/1, + loc, /*inputs=*/ValueRange{}, + /*outputBuffers=*/ValueRange{operand_adaptor.output()}, llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { @@ -432,9 +438,8 @@ class LhloBroadcastInDimConverter auto indexing_maps = getIndexingMaps(op, broadcast_dims, result_shape, operand_type, &rewriter); rewriter.create( - loc, llvm::None, - llvm::makeArrayRef({operand, operand_adaptor.output()}), - /*argsIn=*/1, /*argsOut=*/1, indexing_maps, + loc, /*inputs=*/ValueRange{operand}, + /*outputBuffers=*/ValueRange{operand_adaptor.output()}, indexing_maps, GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange args) { nestedBuilder.create(loc, *args.begin()); @@ -627,7 +632,8 @@ class ReshapeOpConverter : public OpConversionPattern { } currDstDim++; } - if (currSrcDim != srcShape.size()) isExpandingOrCollapsing = false; + if (currSrcDim != srcShape.size() || currDstDim != dstShape.size()) + isExpandingOrCollapsing = false; if (!isExpandingOrCollapsing) { auto getIdentityExprs = [&rewriter](int n) { @@ -696,15 +702,18 @@ class IotaConverter : public OpConversionPattern { unsigned nloops = resultShapedType.getRank(); auto linalgOp = rewriter.create( - iotaOp.getLoc(), isLHLO ? ArrayRef{} : resultShapedType, args, - 0, // args_in - 1, // args_out + iotaOp.getLoc(), + /*resultTensorTypes=*/ + isLHLO ? ArrayRef{} : ArrayRef{resultShapedType}, + /*inputs=*/ValueRange{}, + /*outputBuffers=*/isLHLO ? 
ValueRange{args} : ValueRange{}, + /*initTensors=*/ValueRange{}, llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)), GetNParallelLoopsAttrs(nloops), [&](OpBuilder& nestedBuilder, Location nestedLoc, ValueRange ivs, ValueRange args) { Value castOp = nestedBuilder.create( - nestedLoc, ivs[iotaOp.iota_dimension().getZExtValue()], + nestedLoc, ivs[iotaOp.iota_dimension()], nestedBuilder.getIntegerType( resultElementType.getIntOrFloatBitWidth())); if (resultElementType.template isa()) { @@ -716,7 +725,7 @@ class IotaConverter : public OpConversionPattern { if (isLHLO) rewriter.replaceOp(iotaOp, llvm::None); else - rewriter.replaceOp(iotaOp, linalgOp.output_tensors()); + rewriter.replaceOp(iotaOp, linalgOp.result_tensors()); return success(); } }; @@ -813,6 +822,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -822,12 +832,14 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -837,10 +849,12 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, ReshapeOpConverter, ReverseConverter, ScalarPointwiseToStandardConverter, - SliceConverter + SliceConverter, + TransposeConverter >(context); // clang-format on } @@ -859,13 +873,15 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, // %0 = addf %arg4, %arg5 : f32 // "linalg.yield"(%0) : (f32) -> () // }) { -// args_in = 2, -// args_out = 1, // indexing_maps = [#map0, #map0, #map0], // iterator_types = ["parallel", "parallel"], // } : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () struct LhloLegalizeToLinalgPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -882,6 +898,10 @@ struct LhloLegalizeToLinalgPass struct HloLegalizeToLinalgPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); @@ -913,6 +933,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -921,12 +942,14 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, 
PointwiseToLinalgConverter, @@ -935,6 +958,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, ReshapeOpConverter, ReverseConverter, TransposeConverter>(context); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc index cc574e008d5..84255c2810e 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_standard.cc @@ -117,7 +117,7 @@ class ConvertIotaOp : public OpRewritePattern { PatternRewriter &rewriter) const override { auto output_type = op.getType().cast(); auto output_size = output_type.getNumElements(); - auto dimension = op.iota_dimension().getSExtValue(); + auto dimension = op.iota_dimension(); auto max_dim_size = output_type.getDimSize(dimension); auto element_type = output_type.getElementType(); @@ -178,6 +178,10 @@ class ConvertIotaOp : public OpRewritePattern { namespace { struct LegalizeToStandardPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + /// Perform the lowering to Standard dialect. void runOnFunction() override; }; @@ -189,7 +193,7 @@ std::unique_ptr> createLegalizeToStdPass() { void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns, mlir::MLIRContext *ctx) { - mlir::populateWithGenerated(ctx, patterns); + mlir::populateWithGenerated(ctx, *patterns); patterns->insert(ctx); } diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_trigonometric_to_approximation.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_trigonometric_to_approximation.cc new file mode 100644 index 00000000000..10030866d0f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_trigonometric_to_approximation.cc @@ -0,0 +1,284 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file implements the lowering for trigonometric standard ops to +// approximations. 
+ +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +namespace mlir { +namespace mhlo { +namespace { + +template +class ApproximateOnExtendedF32Lowering : public OpRewritePattern { + public: + explicit ApproximateOnExtendedF32Lowering(MLIRContext *ctx) + : OpRewritePattern(ctx, /*benefit=*/100) {} + + virtual Value emitApproximation(ValueRange, Location, + PatternRewriter &) const = 0; + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + auto raw_args = op.getOperation()->getOperands(); + + // Supports only f16 and f32 for now. + if (!op.getType().isF16() && !op.getType().isF32()) return failure(); + + // Extend operands to f32 if needed and possible. + SmallVector f32_args; + f32_args.reserve(raw_args.size()); + for (Value arg : raw_args) { + // Similar to XLA, do not rewrite f64 as precision might matter. + Type arg_ty = arg.getType(); + if (arg_ty.isF64()) return failure(); + + if (arg_ty.isF16()) + arg = rewriter.create(loc, arg, rewriter.getF32Type()); + + // If we still do not have f32, fail. + if (!arg.getType().isF32()) return failure(); + + f32_args.push_back(arg); + } + + Value result = emitApproximation(f32_args, loc, rewriter); + assert(result.getType().isF32() && "Expect f32 intermediate result."); + + // Truncate back if needed. + if (op.getType().isF16()) + result = rewriter.create(loc, result, rewriter.getF16Type()); + + rewriter.replaceOp(op, {result}); + return success(); + } +}; + +class ApproximateTanhLowering + : public ApproximateOnExtendedF32Lowering { + public: + explicit ApproximateTanhLowering(MLIRContext *ctx) + : ApproximateOnExtendedF32Lowering(ctx) {} + + // Emits the fast tanh approximation that is also used by XLA. + Value emitApproximation(ValueRange args, Location loc, + PatternRewriter &rewriter) const override { + // For small values of x, we can approximate tanh(x) = x. For extremely + // small values of x (|x| < 1e-37), the other approximation would evaluate + // tanh(x) = 0. + Value input = args.front(); + assert(input.getType().isF32()); + constexpr float kCanUseApprox = 0.0004; + Value abs_value = rewriter.create(loc, input); + Value can_use_approx = rewriter.create( + loc, rewriter.getF32FloatAttr(kCanUseApprox)); + Value return_input = rewriter.create(loc, CmpFPredicate::OLT, + abs_value, can_use_approx); + // Clamp the input to [-c, c]. 
+ Value max_clamp = rewriter.create( + loc, rewriter.getF32FloatAttr(7.90531110763549805f)); + Value smaller_than_max = + rewriter.create(loc, CmpFPredicate::ULE, input, max_clamp); + Value clamped_half = + rewriter.create(loc, smaller_than_max, input, max_clamp); + Value min_clamp = rewriter.create( + loc, rewriter.getF32FloatAttr(-7.90531110763549805f)); + Value larger_than_min = rewriter.create(loc, CmpFPredicate::UGE, + clamped_half, min_clamp); + Value input_clamped = rewriter.create(loc, larger_than_min, + clamped_half, min_clamp); + + static constexpr std::array numerator_coeffs{ + -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f, + 5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f, + 4.89352455891786e-03f}; + + static constexpr std::array denominator_coeffs{ + 1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f, + 4.89352518554385e-03f}; + + Value input_squared = + rewriter.create(loc, input_clamped, input_clamped); + Value numerator = rewriter.create( + loc, rewriter.getF32FloatAttr(numerator_coeffs[0])); + for (int i = 1; i < numerator_coeffs.size(); i++) { + numerator = rewriter.create( + loc, rewriter.create(loc, input_squared, numerator), + rewriter.create( + loc, rewriter.getF32FloatAttr(numerator_coeffs[i]))); + } + + numerator = rewriter.create(loc, input_clamped, numerator); + + Value denominator = rewriter.create( + loc, rewriter.getF32FloatAttr(denominator_coeffs[0])); + for (int i = 1; i < denominator_coeffs.size(); i++) { + denominator = rewriter.create( + loc, rewriter.create(loc, input_squared, denominator), + rewriter.create( + loc, rewriter.getF32FloatAttr(denominator_coeffs[i]))); + } + + Value approx = rewriter.create(loc, numerator, denominator); + + return rewriter.create(loc, return_input, input, approx); + } +}; + +class ApproximateAtan2Lowering + : public ApproximateOnExtendedF32Lowering { + public: + explicit ApproximateAtan2Lowering(MLIRContext *ctx) + : ApproximateOnExtendedF32Lowering(ctx) {} + + // Reduces atan2 to atan in the same way XLA does it. + Value emitApproximation(ValueRange args, Location loc, + PatternRewriter &rewriter) const override { + Value y = args[0]; + Value x = args[1]; + assert(x.getType().isF32() && y.getType().isF32() && + "expect f32 arguments"); + Value ax = rewriter.create(loc, x); + Value ay = rewriter.create(loc, y); + Value le_ax_ay = rewriter.create(loc, CmpFPredicate::OLE, ax, ay); + Value min_ax_ay = rewriter.create(loc, le_ax_ay, ax, ay); + Value max_ax_ay = rewriter.create(loc, le_ax_ay, ay, ax); + Value zero_to_one = rewriter.create(loc, min_ax_ay, max_ax_ay); + Value a = emitAtanCoreApproximation(zero_to_one, loc, rewriter); + + Value pi_over_2 = + rewriter.create(loc, rewriter.getF32FloatAttr(1.57079637f)); + a = rewriter.create( + loc, le_ax_ay, rewriter.create(loc, pi_over_2, a), a); + + Value zero = rewriter.create(loc, rewriter.getF32FloatAttr(0)); + Value lt_x_0 = rewriter.create(loc, CmpFPredicate::OLT, x, zero); + Value pi = + rewriter.create(loc, rewriter.getF32FloatAttr(3.14159274f)); + a = rewriter.create(loc, lt_x_0, + rewriter.create(loc, pi, a), a); + + Value t = rewriter.create(loc, lt_x_0, pi, zero); + Value eq_y_0 = rewriter.create(loc, CmpFPredicate::OEQ, y, zero); + a = rewriter.create(loc, eq_y_0, t, a); + + // Propagate nan. 
+ Value is_nan = rewriter.create(loc, CmpFPredicate::UNO, y, x); + Value nan = rewriter.create( + loc, rewriter.getF32FloatAttr(std::numeric_limits::quiet_NaN())); + a = rewriter.create(loc, is_nan, nan, a); + + // x and y are +- inf. + Value three_pi_over_4 = + rewriter.create(loc, rewriter.getF32FloatAttr(2.3561945f)); + Value pi_over_4 = rewriter.create( + loc, rewriter.getF32FloatAttr(0.785398185f)); + t = rewriter.create(loc, lt_x_0, three_pi_over_4, + pi_over_4); + Value inf = rewriter.create( + loc, rewriter.getF32FloatAttr(std::numeric_limits::infinity())); + Value eq_x_inf = rewriter.create(loc, CmpFPredicate::OEQ, x, inf); + Value eq_y_inf = rewriter.create(loc, CmpFPredicate::OEQ, y, inf); + Value all_inf = rewriter.create(loc, eq_x_inf, eq_y_inf); + a = rewriter.create(loc, all_inf, t, a); + + return rewriter.create(loc, a, y); + } + + private: + // The core atan reduction derives from the heuristic described in + // https://arxiv.org/abs/1508.03211 and has a < 0.95 ulp error in the [-1, 1] + // range (though that assumed FMA was available, and it is not here). This is + // the same approximation that is also used by XLA. + Value emitAtanCoreApproximation(Value x, Location loc, + PatternRewriter &rewriter) const { + auto constant = [&](float c) { + return rewriter.create(loc, rewriter.getF32FloatAttr(c)); + }; + + // Computes ab + c. + auto mul_add = [&](Value a, Value b, Value c) { + Value prod = rewriter.create(loc, a, b); + return rewriter.create(loc, prod, c); + }; + + Value s = rewriter.create(loc, x, x); + Value r = constant(0.0027856871f); + r = mul_add(r, s, constant(-0.0158660002f)); + r = mul_add(r, s, constant(0.042472221f)); + r = mul_add(r, s, constant(-0.0749753043f)); + r = mul_add(r, s, constant(0.106448799f)); + r = mul_add(r, s, constant(-0.142070308f)); + r = mul_add(r, s, constant(0.199934542f)); + r = mul_add(r, s, constant(-0.333331466f)); + r = rewriter.create(loc, r, s); + return mul_add(r, x, x); + } +}; + +class ApproximateAtanLowering + : public ApproximateOnExtendedF32Lowering { + public: + explicit ApproximateAtanLowering(MLIRContext *ctx) + : ApproximateOnExtendedF32Lowering(ctx) {} + + // Reduce atan(x) to atan2(x, 1) to subsequently rely on an atan approximation + // for the argument range [-1, 1]. + Value emitApproximation(ValueRange args, Location loc, + PatternRewriter &rewriter) const override { + Value x = args.front(); + assert(x.getType().isF32()); + Value one = rewriter.create(loc, rewriter.getF32FloatAttr(1)); + return rewriter.create(loc, x, one); + } +}; + +struct LegalizeTrigonometricToApproximationPass + : public PassWrapper { + /// Perform the lowering of standard dialect operations to approximations. 
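A rough summary of the approximations defined above (thresholds and coefficient arrays as in the code): tanh is a rational approximation in the clamped argument, atan is reduced to atan2, and atan2 is reduced to a core atan on [0, 1] plus quadrant and special-case fix-ups:

\[
\tanh(x) \approx
\begin{cases}
x, & |x| < 4\cdot 10^{-4},\\[2pt]
\dfrac{\tilde{x}\, P(\tilde{x}^{2})}{Q(\tilde{x}^{2})}, & \text{otherwise},
\end{cases}
\qquad \tilde{x} = \operatorname{clamp}\bigl(x,\, -7.9053111\ldots,\, 7.9053111\ldots\bigr),
\]

with \(P\) of degree 6 and \(Q\) of degree 3 in \(\tilde{x}^{2}\), evaluated in Horner form with the coefficients listed above, and

\[
\operatorname{atan}(x) = \operatorname{atan2}(x,\, 1), \qquad
a = \operatorname{atan}_{\mathrm{core}}\!\Bigl(\tfrac{\min(|x|,\,|y|)}{\max(|x|,\,|y|)}\Bigr),
\]

followed by \(a \mapsto \pi/2 - a\) when \(|x| \le |y|\), \(a \mapsto \pi - a\) when \(x < 0\), special cases for \(y = 0\), NaN, and \(\pm\infty\), and finally copying the sign of \(y\) onto the result.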
+ void runOnFunction() override { + OwningRewritePatternList patterns; + PopulateTrigonometricToApproximationPatterns(&getContext(), &patterns); + applyPatternsAndFoldGreedily(getFunction(), patterns); + } +}; + +} // anonymous namespace + +std::unique_ptr> +createLegalizeTrigonometricToApproximationPass() { + return std::make_unique(); +} + +void PopulateTrigonometricToApproximationPatterns( + mlir::MLIRContext *context, OwningRewritePatternList *patterns) { + // clang-format off + patterns->insert< + ApproximateAtanLowering, + ApproximateAtan2Lowering, + ApproximateTanhLowering>(context); + // clang-format on +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc deleted file mode 100644 index 7a4418466b5..00000000000 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_copy_removal.cc +++ /dev/null @@ -1,102 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements a pass to remove redundant LHLO copy operations. - -#include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" -#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Operation.h" -#include "mlir/Pass/Pass.h" - -namespace mlir { -namespace lmhlo { -namespace { - -// Removes LHLO copy operations that copy from allocated buffers to block -// arguments. All uses of each buffer are replaced with the corresponding block -// argument and the buffer is freed. Note that this pass only works in regions -// with a single block. -struct LhloCopyRemovalPass - : mlir::PassWrapper> { - void runOnOperation() override { - llvm::SmallVector eraseList; - auto operation = getOperation(); - operation->walk([&](mlir::lmhlo::CopyOp copyOp) { - // If this region contains more than one block, then ignore this copy - // operation. - if (copyOp.getParentRegion()->getBlocks().size() > 1) { - return; - } - - mlir::Value fromOperand = copyOp.operand(); - mlir::Value toOperand = copyOp.output(); - - // If the fromOperand value is a block argument or the toOperand - // value is not a block argument, then ignore this copy operation. - if (!fromOperand.getDefiningOp() || toOperand.getDefiningOp()) { - return; - } - - // The copy operation removal is illegal if there is at least a single use - // of toOperand value that lies between the first use of fromOperand value - // and the copy operation. 
- auto fromOperandUsers = fromOperand.getUsers(); - auto firstUser = *fromOperandUsers.begin(); - for (auto op : fromOperandUsers) { - if (op->isBeforeInBlock(firstUser)) firstUser = op; - } - for (auto op : toOperand.getUsers()) { - if (op->isBeforeInBlock(copyOp) && firstUser->isBeforeInBlock(op)) { - return; - } - } - - // TODO(DFKI): Use live variable analysis to solve aliasing issues among - // block arguments. - - // Remove the associated alloc operation. - auto allocOp = fromOperand.getDefiningOp(); - eraseList.push_back(allocOp); - - // Iterate over all uses of the fromOperand to find the associated - // deallocOp (if any). - for (auto op : fromOperandUsers) { - if (isa(op)) { - eraseList.push_back(op); - break; - } - } - - // Replace all uses of the fromOperand with the toOperand. This rewires - // all references pointing to the original alloc operation to the new - // target operation in order to safely remove the copy op. - fromOperand.replaceAllUsesWith(toOperand); - copyOp.erase(); - }); - for (auto op : eraseList) { - op->erase(); - } - }; -}; - -} // namespace - -std::unique_ptr createLhloCopyRemovalPass() { - return std::make_unique(); -} - -} // namespace lmhlo -} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc index 1467f015dc9..8f50ad0667f 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_fuse_linalg.cc @@ -19,9 +19,12 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h" #include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Interfaces/ViewLikeInterface.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/FoldUtils.h" @@ -33,6 +36,10 @@ using linalg::LinalgOp; class LhloFuseLinalgPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: LhloFuseLinalgPass() = default; LhloFuseLinalgPass(const LhloFuseLinalgPass&) {} @@ -67,6 +74,24 @@ class LhloFuseLinalgPass result_buffers.insert(operand); } } + // Resolve aliasing operations (like casts) on the result to identify + // results. This only handles escaping results. + // TODO(herhut): Use BufferizeAliasAnalysis for this. 
+ llvm::SmallVector worklist(result_buffers.begin(), + result_buffers.end()); + while (!worklist.empty()) { + Value result = worklist.pop_back_val(); + auto definingOp = result.getDefiningOp(); + if (!definingOp) { + continue; + } + if (auto viewLike = dyn_cast(definingOp)) { + auto alias = viewLike.getViewSource(); + if (result_buffers.insert(alias).second) { + worklist.push_back(alias); + } + } + } MLIRContext* ctx = func.getContext(); OpBuilder b(func); OperationFolder folder(ctx); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc index 07891327775..2041d22c62b 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc @@ -59,6 +59,20 @@ struct DotOpConverter : public OpRewritePattern { return failure(); } + // We don't currently support batching dimensions, or multiple contraction + // dimensions. + mhlo::DotDimensionNumbers dot_dimension_numbers = + op.dot_dimension_numbers(); + if (dot_dimension_numbers.lhs_batching_dimensions().size() > 0 || + dot_dimension_numbers.rhs_batching_dimensions().size() > 0) + return failure(); + if (dot_dimension_numbers.lhs_contracting_dimensions().size() != 1 || + *dot_dimension_numbers.lhs_contracting_dimensions().begin() != 1 || + dot_dimension_numbers.rhs_contracting_dimensions().size() != 1 || + *dot_dimension_numbers.rhs_contracting_dimensions().begin() != 0) { + return failure(); + } + LogicalResult map_status = success(); auto body_builder = [&](OpBuilder& builder, Location loc, ValueRange ivs) { SmallVector lhs_indices{ivs[0], ivs[2]}, @@ -139,6 +153,9 @@ void populateLHLOToAffineConversionPattern(MLIRContext* context, struct LhloLegalizeToAffinePass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } void runOnFunction() override { OwningRewritePatternList patterns; auto func = getFunction(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc index cffb58b37de..fbade8f7387 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_gpu.cc @@ -20,8 +20,10 @@ limitations under the License. 
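The dimension-number guard added to DotOpConverter above means the affine lowering now only fires on a plain matrix-matrix contraction. A hedged sketch of the accepted form (attribute spelling and shapes are illustrative, not taken from the diff):

// No batching dimensions; lhs contracts dimension 1 against rhs dimension 0.
"lmhlo.dot"(%lhs, %rhs, %out) {
    dot_dimension_numbers = {
      lhs_batching_dimensions = dense<> : tensor<0xi64>,
      rhs_batching_dimensions = dense<> : tensor<0xi64>,
      lhs_contracting_dimensions = dense<1> : tensor<1xi64>,
      rhs_contracting_dimensions = dense<0> : tensor<1xi64>}
  } : (memref<4x8xf32>, memref<8x16xf32>, memref<4x16xf32>) -> ()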
#include "llvm/ADT/ArrayRef.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Attributes.h" @@ -169,6 +171,11 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { struct LhloLegalizeToGpuPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc index 42b71543543..57ea947c473 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm.cc @@ -45,7 +45,7 @@ struct StaticMemRefCastOpConverter return failure(); // Create descriptor. auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); - Type llvmTargetElementTy = desc.getElementType(); + Type llvmTargetElementTy = desc.getElementPtrType(); // Set allocated ptr. Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); allocated = @@ -96,7 +96,7 @@ struct DynamicMemRefCastOpConverter return failure(); // Create descriptor. auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); - Type llvmTargetElementTy = desc.getElementType(); + Type llvmTargetElementTy = desc.getElementPtrType(); // Set allocated ptr. 
Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); allocated = diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc index 8493a1feb5d..3d49027bb50 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_llvm_pass.cc @@ -29,6 +29,10 @@ namespace { class TestLhloToLLVMPass : public ::mlir::PassWrapper> { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: void runOnOperation() override { ModuleOp m = getOperation(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc index 19f47d08c0d..d9a2d993496 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_parallel_loops.cc @@ -691,6 +691,10 @@ class SelectAndScatterOpConverter struct LhloLegalizeToParallelLoopsPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { auto func = getFunction(); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc index 9f7c946577d..491f1c01cf7 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_complex.cc @@ -37,7 +37,6 @@ limitations under the License. using mlir::FunctionPass; using mlir::OwningRewritePatternList; -using mlir::PassRegistration; using mlir::PassWrapper; namespace { @@ -60,7 +59,7 @@ namespace { void PopulateComplexLoweringPatterns(MLIRContext* context, OwningRewritePatternList* patterns) { - populateWithGenerated(context, patterns); + populateWithGenerated(context, *patterns); } } // end namespace mhlo } // end namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc index 2bbd4691f95..ada30a289a4 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lower_general_dot.cc @@ -38,7 +38,6 @@ using mlir::LogicalResult; using mlir::MLIRContext; using mlir::OpRewritePattern; using mlir::OwningRewritePatternList; -using mlir::PassRegistration; using mlir::PassWrapper; using mlir::PatternRewriter; using mlir::RankedTensorType; @@ -155,9 +154,16 @@ struct GeneralDotConvert : public OpRewritePattern { dot_numbers.rhs_contracting_dimensions(), /*outer_dims_first=*/false, &rewriter); + // Accept only static shaped types. + auto lhs_shape_type = lhs.getType().dyn_cast_or_null(); + auto rhs_shape_type = rhs.getType().dyn_cast_or_null(); + if (!lhs_shape_type || !rhs_shape_type) return failure(); + if (!lhs_shape_type.hasStaticShape() || !rhs_shape_type.hasStaticShape()) + return failure(); + // Dot resulting shape. 
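The static-shape check added above (the hunk continues below) restricts GeneralDotConvert to operands whose post-transpose/reshape types are ranked and fully static; roughly:

// Accepted:  tensor<4x8xf32> x tensor<8x16xf32> -> tensor<4x16xf32>
// Rejected (the pattern returns failure()): any unranked or dynamically
// shaped operand, e.g. tensor<?x8xf32> or tensor<*xf32>.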
- auto lhs_shape = lhs.getType().cast().getShape(); - auto rhs_shape = rhs.getType().cast().getShape(); + auto lhs_shape = lhs_shape_type.getShape(); + auto rhs_shape = rhs_shape_type.getShape(); auto new_dot_type = RankedTensorType::get({lhs_shape[0], rhs_shape[1]}, dot_element_type); diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc new file mode 100644 index 00000000000..dba3cab6956 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/mhlo_control_flow_to_scf.cc @@ -0,0 +1,199 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/Casting.h" +#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/passes.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/StandardTypes.h" +#include "mlir/IR/Value.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" + +#define DEBUG_TYPE "mhlo-control-flow-to-scf" + +namespace mlir { +namespace mhlo { + +namespace { + +/// Convert MHLO While to SCF. +void MatchAndRewrite(WhileOp whileOp); + +/// Pass that converts MHLO control flow to SCF. +class ControlFlowToScfPass + : public mlir::PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override { + getFunction().walk([&](WhileOp whileOp) { MatchAndRewrite(whileOp); }); + } +}; + +// TODO(jpienaar): Look into reformulating as a pattern. +void MatchAndRewrite(WhileOp whileOp) { + // Handle pattern: + // x = start + // step = ... + // limit = ... + // while (x < limit) { ... x += step; } + + // Only handling multi value while loops at the moment. + auto tupleOp = whileOp.getOperand().getDefiningOp(); + if (!tupleOp) return; + auto bodyReturn = whileOp.body() + .front() + .getTerminator() + ->getOperand(0) + .getDefiningOp(); + // Note: due to the shape restrictions on While, if the operand to While is a + // tuple, then so is the return type of the body. But the verifier isn't + // checking that at the moment, so just bail out here if this doesn't hold. + if (!bodyReturn) return; + + Value result = whileOp.cond().front().getTerminator()->getOperand(0); + // TODO(jpienaar): Expand to handle more than simple case with LT compare and + // constant step. + auto cmp = result.getDefiningOp(); + if (!cmp || cmp.comparison_direction() != "LT") return; + + const int kConstant = -1; + auto getValueAndIndex = [&](Value val) -> std::pair { + if (matchPattern(val, m_Constant())) return {val, kConstant}; + // If it is defined by a tuple, then the tuple has to have been fed in and + // the external value is captured. 
+ if (auto gte = val.getDefiningOp()) { + if (!gte.getOperand().isa()) return {nullptr, 0}; + int index = gte.index(); + return {tupleOp.getOperand(index), index}; + } + return {nullptr, 0}; + }; + + using ValueIndex = std::pair; + ValueIndex loopIndVar = getValueAndIndex(cmp.lhs()); + ValueIndex max = getValueAndIndex(cmp.rhs()); + if (!loopIndVar.first || !max.first) return; + auto add = + bodyReturn.getOperand(loopIndVar.second).getDefiningOp(); + if (!add) return; + ValueIndex step = getValueAndIndex(add.rhs()); + if (step.second != kConstant || !step.first) return; + + // Only handle case where tuple isn't propagated as is for now. + // TODO(jpienaar): Remove this when a tuple is also created inside the loop + // to propagate. + for (auto* use : whileOp.body().front().getArgument(0).getUsers()) + if (!isa(use)) return; + + LLVM_DEBUG(llvm::dbgs() << "Found for (" << whileOp.getLoc() << "):\n"; + llvm::dbgs() << " loopIndVar = " << loopIndVar.second << " max = " + << max.second << " step = " << step.second << "\n"; + llvm::dbgs() << " loopIndVar = " << loopIndVar.first << " max = " + << max.first << " step = " << step.first << "\n";); + OpBuilder b(whileOp); + // Inputs to new for loop. + llvm::SmallVector input; + input.reserve(tupleOp.getNumOperands()); + for (auto r : tupleOp.getOperands().take_front(loopIndVar.second)) + input.push_back(r); + for (auto r : tupleOp.getOperands().drop_front(loopIndVar.second + 1)) + input.push_back(r); + + auto tensorIndexType = RankedTensorType::get({}, b.getIndexType()); + auto getAsIndex = [&](Value val) { + auto loc = whileOp.getLoc(); + return b.create( + loc, b.create(loc, tensorIndexType, val), ValueRange()); + }; + + // SCF for uses index type, so converted these. + auto forloopIndVar = getAsIndex(loopIndVar.first); + auto forMax = getAsIndex(max.first); + auto forStep = getAsIndex(step.first); + auto forOp = b.create(whileOp.getLoc(), forloopIndVar, + forMax, forStep, input); + // Transfer the body without the block arguments. + forOp.getLoopBody().front().getOperations().splice( + forOp.getLoopBody().front().getOperations().end(), + whileOp.body().front().getOperations()); + + b.setInsertionPointToStart(&forOp.getLoopBody().front()); + auto loopIndVarElType = + loopIndVar.first.getType().cast().getElementType(); + Value indVar = b.create( + whileOp.getLoc(), RankedTensorType::get({}, loopIndVarElType), + b.create(whileOp.getLoc(), loopIndVarElType, + forOp.getInductionVar())); + // Update all block argument users to the SCF For args. + for (auto* use : + llvm::make_early_inc_range(whileOp.body().getArgument(0).getUsers())) { + // TODO(jpienaar): Expand here too when we allow using the tuple in the + // loop. + auto gte = cast(use); + // If the loop induction var, then refer to the loop induction variable as + // this operand is not updated. + if (gte.index() == loopIndVar.second) { + use->getResult(0).replaceAllUsesWith(indVar); + use->erase(); + continue; + } + int index = gte.index(); + // If after the loop induction variable, then decrement as we don't include + // the loop induction variable in the for iter operands. + if (index > loopIndVar.second) --index; + use->getResult(0).replaceAllUsesWith(forOp.getIterOperands()[index]); + use->erase(); + } + + // Create new yield op without induction var update. 
+ SmallVector newYieldOps; + newYieldOps.reserve(bodyReturn.getNumOperands() - 1); + for (auto r : bodyReturn.getOperands().take_front(loopIndVar.second)) + newYieldOps.push_back(r); + for (auto r : bodyReturn.getOperands().drop_front(loopIndVar.second + 1)) + newYieldOps.push_back(r); + // Delete return & tuple op. + forOp.getLoopBody().front().back().erase(); + forOp.getLoopBody().front().back().erase(); + b.setInsertionPointToEnd(&forOp.getLoopBody().front()); + b.create(whileOp.getLoc(), newYieldOps); + + // Recombine output tuple with max value of induction variable. + llvm::SmallVector loopOut; + loopOut.reserve(forOp.getNumResults() + 1); + for (auto r : forOp.getResults().take_front(loopIndVar.second)) + loopOut.push_back(r); + loopOut.push_back(max.first); + for (auto r : forOp.getResults().drop_front(loopIndVar.second)) + loopOut.push_back(r); + b.setInsertionPoint(whileOp); + auto newRes = b.create(whileOp.getLoc(), loopOut); + whileOp.replaceAllUsesWith(newRes.getOperation()); + whileOp.erase(); +} + +} // anonymous namespace + +std::unique_ptr> createControlFlowToScfPass() { + return std::make_unique(); +} + +} // namespace mhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc index 32a846e79ef..febd4423bf2 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/optimize_mhlo_pass.cc @@ -24,7 +24,6 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" using mlir::FunctionPass; -using mlir::PassRegistration; using mlir::PassWrapper; namespace { diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc index 8d677f45c19..d863d825bcb 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/sink_constants_to_control_flow.cc @@ -16,6 +16,7 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/Support/Casting.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/transforms/PassDetail.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Operation.h" #include "mlir/Pass/Pass.h" @@ -39,7 +40,8 @@ namespace { // those within internally. Note that doing so is the only option in case of // values defined outside that are BlockArguments of any of the parent region. class SinkConstantsToControlFlowPass - : public mlir::PassWrapper { + : public SinkConstantsToControlFlowPassBase< + SinkConstantsToControlFlowPass> { void runOnFunction() override { getFunction().walk([](Operation* op) { if (auto while_op = llvm::dyn_cast(op)) { diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc index 7c985ea7535..7c01fa22372 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/transform_unranked_hlo.cc @@ -14,6 +14,7 @@ limitations under the License. 
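In schematic form, the loop shape matched by the mhlo_control_flow_to_scf.cc pass added above, and the loop it produces, look roughly as follows (a hedged sketch with region bodies elided and illustrative names):

// Matched input: an mhlo.while over a tuple whose condition compares tuple
// element 0 against a limit with "LT" and whose body adds a constant step
// to that element.
//   %init  = "mhlo.tuple"(%start, %accum) : (...) -> tuple<...>
//   %while = "mhlo.while"(%init) ({ /* cond: get_tuple_element 0 < %limit */ },
//                                 { /* body: element 0 += %step */ }) : ...
// Produced output: an scf.for over index values; start, limit and step are
// converted through index_cast and extract_element, the remaining tuple
// elements become iter_args, and get_tuple_element uses are rewired.
//   %res = scf.for %iv = %lb to %ub step %st iter_args(%acc = %accum) {
//     ...
//     scf.yield %new_acc
//   }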
==============================================================================*/ +#include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h" #include "mlir/Dialect/Shape/IR/Shape.h" @@ -27,7 +28,6 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" namespace mlir { -namespace mhlo { namespace { // TODO(herhut): Generate these out of op definitions. @@ -46,106 +46,81 @@ namespace { sep fn(ShiftLeftOp) sep fn(ShiftRightArithmeticOp) \ sep fn(ShiftRightLogicalOp) sep fn(SubOp) +// TODO(herhut): Generate these out of op definitions. +#define MAP_CHLO_OPERATION_CWISE_UNARY(fn, sep) \ + fn(AcosOp) sep fn(AtanOp) sep fn(SinhOp) sep fn(TanOp) + template inline void AddLegalOpOnRankedTensor(ConversionTarget *target) { target->addDynamicallyLegalOp([](OpTy op) { - return llvm::all_of((op.getOperation())->getOperandTypes(), + return llvm::all_of(op.getOperation()->getOperandTypes(), [&](Type t) { return t.isa(); }); }); } -/// Unary element-wise operations on unranked tensors can be applied to the -/// flattened tensor with the same effect. -/// This pattern rewrites every such operation to +/// Element-wise operations on unranked tensors can be applied to the flattened +/// tensor operands with the same effect. This pattern rewrites every such +/// operation to /// (i) flatten the input tensor, -/// (ii) apply the unary operation, and +/// (ii) apply the operation, and /// (iii) restore the original shape. template -struct UnaryElementwiseOpConversion : public OpRewritePattern { - explicit UnaryElementwiseOpConversion(MLIRContext *context) +struct ElementwiseOpConversion : public OpRewritePattern { + explicit ElementwiseOpConversion(MLIRContext *context) : OpRewritePattern(context) {} LogicalResult matchAndRewrite(OpTy op, PatternRewriter &rewriter) const override { - // Don't apply conversion to ops with statically shaped operands. - Value operand = op.getOperand(); - auto operandTy = operand.getType().dyn_cast(); - if (operandTy.hasRank()) return failure(); - - // Generate IR to flatten the operand. - auto loc = op.getLoc(); - Type extentTensorTy = shape::getExtentTensorType(rewriter.getContext()); - Value shape = - rewriter.create(loc, extentTensorTy, operand); - Type indexTy = rewriter.getIndexType(); - Value numElements = - rewriter.create(loc, indexTy, shape); - Value flatShape = rewriter.create(loc, numElements); - auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, - operandTy.getElementType()); - Value flatOperand = rewriter.create( - loc, flatTensorTy, operand, flatShape); - - // Generate IR for the actual operation. - Value flatResult = rewriter.create(loc, flatTensorTy, flatOperand); - - // Generate IR to restore the original shape. - rewriter.replaceOpWithNewOp(op, operandTy, - flatResult, shape); - - return success(); - } -}; - -/// Binary element-wise operation on unranked tensors can be applied to the -/// flattened operand tensors with the same effect. -/// This pattern rewrites every such operation to -/// (i) flatten the operand tensors, -/// (ii) apply the binary operation, and -// (iii) restore the original shape. -template -struct BinaryElementwiseOpConversion : public OpRewritePattern { - explicit BinaryElementwiseOpConversion(MLIRContext *context) - : OpRewritePattern(context) {} - - LogicalResult matchAndRewrite(OpTy op, - PatternRewriter &rewriter) const override { - // Don't apply conversion unless both operands are unranked. 
- if (op.lhs().getType().template isa() || - op.rhs().getType().template isa()) { + // Don't apply conversion unless all operands are unranked. + if (!llvm::all_of(op.getOperation()->getOperands(), [&](Value operand) { + return operand.getType().isa(); + })) { return failure(); } - // Flatten operands. + // Get operands' shape. auto loc = op.getLoc(); Type extentTensorTy = shape::getExtentTensorType(rewriter.getContext()); - Value shapeLhs = - rewriter.create(loc, extentTensorTy, op.lhs()); - Value shapeRhs = - rewriter.create(loc, extentTensorTy, op.rhs()); - Value shape = rewriter.create(loc, extentTensorTy, - ValueRange{shapeLhs, shapeRhs}); + SmallVector operandShapes; + for (Value operand : op.getOperation()->getOperands()) { + Value shape = + rewriter.create(loc, extentTensorTy, operand); + operandShapes.push_back(shape); + } + Value shape = + operandShapes.size() == 1 + ? operandShapes.front() + : rewriter.create(loc, extentTensorTy, operandShapes); + + // Derive flat shape. Type indexTy = rewriter.getIndexType(); Value numElements = rewriter.create(loc, indexTy, shape); Value flatShape = rewriter.create(loc, numElements); - TensorType lhsTy = op.lhs().getType().template cast(); - Type flatLhsTy = RankedTensorType::get({ShapedType::kDynamicSize}, - lhsTy.getElementType()); - Value flatLhs = - rewriter.create(loc, flatLhsTy, op.lhs(), flatShape); - TensorType rhsTy = op.rhs().getType().template cast(); - Type flatRhsTy = RankedTensorType::get({ShapedType::kDynamicSize}, - rhsTy.getElementType()); - Value flatRhs = - rewriter.create(loc, flatRhsTy, op.rhs(), flatShape); - // Apply actual operation to flattened operands. - Value flatResult = rewriter.create(loc, flatLhs, flatRhs); + // Flatten operands. + SmallVector flatOperands; + for (Value operand : op.getOperation()->getOperands()) { + Type operandElementTy = + operand.getType().template cast().getElementType(); + Type flatTy = + RankedTensorType::get({ShapedType::kDynamicSize}, operandElementTy); + Value flat = rewriter.create(loc, flatTy, operand, + flatShape); + flatOperands.push_back(flat); + } + + // Apply operation to flattened operands. + Type resultElementTy = + op.getType().template cast().getElementType(); + Type flatResultTy = + RankedTensorType::get({ShapedType::kDynamicSize}, resultElementTy); + Value flatResult = + rewriter.create(loc, flatResultTy, flatOperands, op.getAttrs()); // Restore original shape. - rewriter.replaceOpWithNewOp(op, op.getType(), flatResult, - shape); + rewriter.replaceOpWithNewOp(op, op.getType(), + flatResult, shape); return success(); } @@ -153,17 +128,26 @@ struct BinaryElementwiseOpConversion : public OpRewritePattern { struct TransformUnrankedHloPass : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnFunction() override { // Setup conversion target. 
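As an illustration of the generalized ElementwiseOpConversion above, an unranked unary CHLO op is flattened, applied, and reshaped back, roughly as in this sketch (op name and element type are assumptions):

// (i) Flatten the unranked operand to a rank-1 tensor.
%shape = shape.shape_of %arg : tensor<*xf32> -> tensor<?xindex>
%n = shape.num_elements %shape : tensor<?xindex> -> index
%flat_shape = tensor_from_elements %n : tensor<1xindex>
%flat = "mhlo.dynamic_reshape"(%arg, %flat_shape)
    : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
// (ii) Apply the element-wise op on the flattened operand.
%flat_res = "chlo.tan"(%flat) : (tensor<?xf32>) -> tensor<?xf32>
// (iii) Restore the original shape.
%res = "mhlo.dynamic_reshape"(%flat_res, %shape)
    : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>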
MLIRContext &ctx = getContext(); ConversionTarget target(ctx); - target.addLegalDialect(); target.addLegalOp(); -#define ADD_LEGAL(op) AddLegalOpOnRankedTensor(&target) - MAP_XLA_OPERATION_CWISE_UNARY(ADD_LEGAL, ;); - MAP_XLA_OPERATION_CWISE_BINARY(ADD_LEGAL, ;); -#undef ADD_LEGAL +#define ADD_LEGAL_MHLO(op) AddLegalOpOnRankedTensor(&target) +#define ADD_LEGAL_CHLO(op) AddLegalOpOnRankedTensor(&target) + MAP_XLA_OPERATION_CWISE_UNARY(ADD_LEGAL_MHLO, ;); + MAP_XLA_OPERATION_CWISE_BINARY(ADD_LEGAL_MHLO, ;); + MAP_CHLO_OPERATION_CWISE_UNARY(ADD_LEGAL_CHLO, ;); +#undef ADD_LEGAL_MHLO +#undef ADD_LEGAL_CHLO + AddLegalOpOnRankedTensor(&target); + AddLegalOpOnRankedTensor(&target); // Populate rewrite patterns. OwningRewritePatternList patterns; @@ -179,24 +163,26 @@ struct TransformUnrankedHloPass void PopulateTransformUnrankedHloPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - // TODO(frgossen): Populate all unary and binary operations. - // clang-format off -#define MAP_UNARY(op) UnaryElementwiseOpConversion -#define MAP_BINARY(op) BinaryElementwiseOpConversion +#define MAP_UNARY(op) ElementwiseOpConversion +#define MAP_BINARY(op) ElementwiseOpConversion +#define MAP_CHLO_UNARY(op) ElementwiseOpConversion #define COMMA , + // clang-format off patterns->insert< MAP_XLA_OPERATION_CWISE_UNARY(MAP_UNARY, COMMA), - MAP_XLA_OPERATION_CWISE_BINARY(MAP_BINARY, COMMA) - >(context); + MAP_XLA_OPERATION_CWISE_BINARY(MAP_BINARY, COMMA), + MAP_CHLO_OPERATION_CWISE_UNARY(MAP_CHLO_UNARY, COMMA), + ElementwiseOpConversion, + ElementwiseOpConversion>(context); + // clang-format on #undef MAP_UNARY #undef MAP_BINARY +#undef MAP_CHLO_UNARY #undef COMMA - // clang-format on } -std::unique_ptr<::mlir::Pass> createTransformUnrankedHloPass() { +std::unique_ptr createTransformUnrankedHloPass() { return std::make_unique(); } -} // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc index 1458e5f3d63..9d072488389 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/unfuse_batch_norm.cc @@ -122,7 +122,7 @@ class UnfuseBatchNormInferencePattern if (!fp_type) { return failure(); } - int64_t feature_dim = bn_op.feature_index().getSExtValue(); + int64_t feature_dim = bn_op.feature_index(); // Add epsilon to the variance and sqrt to get stddev: // stddev = sqrt(variance + epsilon) diff --git a/tensorflow/compiler/mlir/hlo/tests/BUILD b/tensorflow/compiler/mlir/hlo/tests/BUILD index 2c3150a217a..df74de64d7f 100644 --- a/tensorflow/compiler/mlir/hlo/tests/BUILD +++ b/tensorflow/compiler/mlir/hlo/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir index 0d20c3f517b..4effdc14ed6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir @@ -63,6 +63,24 @@ func @divide_fold_float() -> tensor<4xf64> { return %2 : tensor<4xf64> } +// CHECK-LABEL: remainder_fold_int +func @remainder_fold_int() -> tensor<4xi32> { + %0 = mhlo.constant dense<[5, 66, 5, 1]> : tensor<4xi32> + %1 = mhlo.constant dense<[3, 5, 1, 2]> : tensor<4xi32> + // CHECK: mhlo.constant 
dense<[2, 1, 0, 1]> + %2 = "mhlo.remainder"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> (tensor<4xi32>) + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: remainder_fold_float +func @remainder_fold_float() -> tensor<4xf32> { + %0 = mhlo.constant dense<[7.0, 66.5, 5.0, 3.1]> : tensor<4xf32> + %1 = mhlo.constant dense<[3.0, 5.0, 1.0, 2.6]> : tensor<4xf32> + // CHECK: mhlo.constant dense<[1.000000e+00, 1.500000e+00, 0.000000e+00, 5.000000e-01]> + %2 = "mhlo.remainder"(%0, %1) : (tensor<4xf32>, tensor<4xf32>) -> (tensor<4xf32>) + return %2 : tensor<4xf32> +} + // CHECK-LABEL: max_scalar_fold func @max_scalar_fold() -> tensor<4xi64> { %0 = mhlo.constant dense<7> : tensor<4xi64> @@ -301,6 +319,13 @@ func @slice_2D_fold_vertical() -> tensor<4x1xi64> { return %1 : tensor<4x1xi64> } +// CHECK-LABEL: slice_unknown_shape +func @slice_unknown_shape(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<*xf32>) -> tensor<*xf32> + %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + // CHECK-LABEL: slice_concat_fold_first func @slice_concat_fold_first(%arg0: tensor<1x5xf32>, %arg1: tensor<1x5xf32>) -> tensor<1x5xf32> { %0 = "mhlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<1x5xf32>, tensor<1x5xf32>) -> tensor<2x5xf32> @@ -576,6 +601,262 @@ func @dce_while_without_side_effect(%arg0: tensor) -> tensor { return %arg0 : tensor } +// CHECK-LABEL: fold_compare_same_eq +func @fold_compare_same_eq(%arg0: tensor) -> tensor { + // CHECK: %0 = mhlo.constant dense : tensor + %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: fold_compare_same_le +func @fold_compare_same_le(%arg0: tensor) -> tensor { + // CHECK: %0 = mhlo.constant dense : tensor + %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: fold_compare_same_ge +func @fold_compare_same_ge(%arg0: tensor) -> tensor { + // CHECK: %0 = mhlo.constant dense : tensor + %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor, tensor) -> tensor + return %0 : tensor +} +// CHECK-LABEL: fold_compare_same_ne +func @fold_compare_same_ne(%arg0: tensor) -> tensor { + // CHECK: %0 = mhlo.constant dense : tensor + %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: fold_compare_same_lt +func @fold_compare_same_lt(%arg0: tensor) -> tensor { + // CHECK: %0 = mhlo.constant dense : tensor + %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: fold_compare_same_gt +func @fold_compare_same_gt(%arg0: tensor) -> tensor { + // CHECK: %0 = mhlo.constant dense : tensor + %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: fold_compare_false_eq +func @fold_compare_false_eq() -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + return %2 : tensor +} +// 
CHECK-LABEL: fold_compare_true_eq +func @fold_compare_true_eq() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_eq_float +func @fold_compare_false_eq_float() -> tensor { + %0 = mhlo.constant dense<0.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_eq_float +func @fold_compare_true_eq_float() -> tensor { + %0 = mhlo.constant dense<1.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "EQ"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_ne +func @fold_compare_false_ne() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "NE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_ne +func @fold_compare_true_ne() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<0> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "NE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_ne_float +func @fold_compare_false_ne_float() -> tensor { + %0 = mhlo.constant dense<1.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "NE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_ne_float +func @fold_compare_true_ne_float() -> tensor { + %0 = mhlo.constant dense<0.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "NE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_lt +func @fold_compare_false_lt() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_lt +func @fold_compare_true_lt() -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_lt_float +func @fold_compare_false_lt_float() -> tensor { + %0 = mhlo.constant dense<1.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_lt_float +func @fold_compare_true_lt_float() -> tensor { + %0 = mhlo.constant dense<0.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LT"} : (tensor, 
tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_le +func @fold_compare_false_le() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<0> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_le +func @fold_compare_true_le() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_le_float +func @fold_compare_false_le_float() -> tensor { + %0 = mhlo.constant dense<1.> : tensor + %1 = mhlo.constant dense<0.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_le_float +func @fold_compare_true_le_float() -> tensor { + %0 = mhlo.constant dense<1.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_gt +func @fold_compare_false_gt() -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = mhlo.constant dense<0> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_gt +func @fold_compare_true_gt() -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = mhlo.constant dense<0> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_gt_float +func @fold_compare_false_gt_float() -> tensor { + %0 = mhlo.constant dense<0.> : tensor + %1 = mhlo.constant dense<0.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_gt_float +func @fold_compare_true_gt_float() -> tensor { + %0 = mhlo.constant dense<1.> : tensor + %1 = mhlo.constant dense<0.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_ge +func @fold_compare_false_ge() -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = mhlo.constant dense<1> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_ge +func @fold_compare_true_ge() -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = mhlo.constant dense<0> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_false_ge_float +func @fold_compare_false_ge_float() -> tensor { + %0 = mhlo.constant dense<0.> : tensor + %1 = mhlo.constant dense<1.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) 
{comparison_direction = "GE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: fold_compare_true_ge_float +func @fold_compare_true_ge_float() -> tensor { + %0 = mhlo.constant dense<0.> : tensor + %1 = mhlo.constant dense<0.> : tensor + // CHECK: %0 = mhlo.constant dense : tensor + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "GE"} : (tensor, tensor) -> tensor + return %2 : tensor +} + // CHECK-LABEL: unpack_repack_same_tuple // CHECK-SAME: ([[ARG0:%.*]]: tuple, !mhlo.token, tensor>) func @unpack_repack_same_tuple(%arg0: tuple, !mhlo.token, tensor>) -> tuple, !mhlo.token, tensor> { @@ -618,3 +899,533 @@ func @erase_dead_lhlo_constant_negative(%M : memref<4xf32>) -> memref<256x1024xf "lmhlo.constant"(%N) {value = dense<0.0> : tensor} : (memref<256x1024xf32>) -> () return %N : memref<256x1024xf32> } + +// CHECK-LABEL: func @fold_get_dimension_size +func @fold_get_dimension_size(%I : tensor<1x128x512xf32>) -> tensor { + %size = "mhlo.get_dimension_size"(%I) {dimension = 2 : i32} : (tensor<1x128x512xf32>) -> tensor + return %size : tensor + // CHECK-NEXT: %[[C:.*]] = mhlo.constant dense<512> : tensor + // CHECK-NEXT: return %[[C]] +} + +// CHECK-LABEL: func @fold_select_same +func @fold_select_same(%arg0 : tensor, %arg1 : tensor) -> tensor { + %1 = "mhlo.select"(%arg1, %arg0, %arg0) : (tensor, tensor, tensor) -> tensor + // CHECK: return %arg0 + return %1 : tensor +} + +// CHECK-LABEL: func @fold_select_first +func @fold_select_first(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = mhlo.constant dense<1> : tensor + %1 = "mhlo.select"(%0, %arg0, %arg1) : (tensor, tensor, tensor) -> tensor + // CHECK: return %arg0 + return %1 : tensor +} + +// CHECK-LABEL: func @fold_select_second +func @fold_select_second(%arg0 : tensor, %arg1 : tensor) -> tensor { + %0 = mhlo.constant dense<0> : tensor + %1 = "mhlo.select"(%0, %arg0, %arg1) : (tensor, tensor, tensor) -> tensor + // CHECK: return %arg1 + return %1 : tensor +} + +// CHECK-LABEL: func @fold_select_vector +func @fold_select_vector(%arg0 : tensor<4xf32>, %arg1 : tensor<4xf32>) -> tensor<4xf32> { + %0 = mhlo.constant dense<1> : tensor<4xi1> + %1 = "mhlo.select"(%0, %arg0, %arg1) : (tensor<4xi1>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK: return %arg0 + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: gather_to_slice +func @gather_to_slice(%arg0: tensor<5x6x7xf32>) -> tensor<3x6x5xf32> { + %0 = constant dense<[1, 2]> : tensor<2xi32> + %1 = "mhlo.gather"(%arg0, %0) { + dimension_numbers = {collapsed_slice_dims = dense<> : tensor<0xi64>, + index_vector_dim = 0 : i64, + offset_dims = dense<[0, 1, 2]> : tensor<3xi64>, + start_index_map = dense<[0, 2]> : tensor<2xi64>}, + indices_are_sorted = false, + slice_sizes = dense<[3, 6, 5]> : tensor<3xi64>} : (tensor<5x6x7xf32>, tensor<2xi32>) -> tensor<3x6x5xf32> + return %1 : tensor<3x6x5xf32> + // CHECK: %[[RET:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[4, 6, 7]> : tensor<3xi64>, start_indices = dense<[1, 0, 2]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<5x6x7xf32>) -> tensor<3x6x5xf32> + // CHECK: return %[[RET]] : tensor<3x6x5xf32> +} + +// CHECK-LABEL: gather_scalar_index_to_slice +func @gather_scalar_index_to_slice(%arg0: tensor<5x6x7xf32>) -> tensor<5x6x4xf32> { + %0 = constant dense<1> : tensor + %1 = "mhlo.gather"(%arg0, %0) { + dimension_numbers = {collapsed_slice_dims = dense<> : tensor<0xi64>, + index_vector_dim = 0 : i64, + offset_dims = dense<[0, 1, 2]> : tensor<3xi64>, + start_index_map = dense<[2]> : tensor<1xi64>}, + 
indices_are_sorted = false, + slice_sizes = dense<[5, 6, 4]> : tensor<3xi64>} : (tensor<5x6x7xf32>, tensor) -> tensor<5x6x4xf32> + return %1 : tensor<5x6x4xf32> + // CHECK: %[[RET:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[5, 6, 5]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<5x6x7xf32>) -> tensor<5x6x4xf32> + // CHECK: return %[[RET]] : tensor<5x6x4xf32> +} + +// CHECK-LABEL: gather_to_slice_reshape +func @gather_to_slice_reshape(%arg0: tensor<5x6x7xf32>) -> tensor<3x6xf32> { + %0 = constant dense<[1, 2]> : tensor<2xi32> + %1 = "mhlo.gather"(%arg0, %0) { + dimension_numbers = {collapsed_slice_dims = dense<[2]> : tensor<1xi64>, + index_vector_dim = 0 : i64, + offset_dims = dense<[0, 1, 2]> : tensor<3xi64>, + start_index_map = dense<[0, 2]> : tensor<2xi64>}, + indices_are_sorted = false, + slice_sizes = dense<[3, 6, 1]> : tensor<3xi64>} : (tensor<5x6x7xf32>, tensor<2xi32>) -> tensor<3x6xf32> + return %1 : tensor<3x6xf32> + // CHECK: %[[V0:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[4, 6, 3]> : tensor<3xi64>, start_indices = dense<[1, 0, 2]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<5x6x7xf32>) -> tensor<3x6x1xf32> + // CHECK: %[[V1:.*]] = "mhlo.reshape"(%[[V0]]) : (tensor<3x6x1xf32>) -> tensor<3x6xf32> + // CHECK: return %[[V1]] : tensor<3x6xf32> +} + +// CHECK-LABEL: func @fold_and_same +func @fold_and_same(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = "mhlo.and"(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_ones +func @fold_and_ones(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + %1 = "mhlo.and"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_zeros +func @fold_and_zeros(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.and"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_constant +func @fold_and_constant(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<7> : tensor<4xi32> + // CHECK: mhlo.and + %1 = "mhlo.and"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_and_constants +func @fold_and_constants() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, 3]> : tensor<4xi32> + %1 = mhlo.constant dense<[7, 3, 7, 2]> : tensor<4xi32> + %2 = "mhlo.and"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<[0, 1, 6, 2]> : tensor<4xi32> + // CHECK: return %0 + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_same +func @fold_or_same(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = "mhlo.or"(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_ones +func @fold_or_ones(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + %1 = "mhlo.or"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_zeros +func @fold_or_zeros(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.or"(%0, 
%arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_constant +func @fold_or_constant(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<7> : tensor<4xi32> + // CHECK: mhlo.or + %1 = "mhlo.or"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_zeros_right +func @fold_or_zeros_right(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.or"(%arg0, %0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_or_zeros_constants +func @fold_or_zeros_constants() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, 3]> : tensor<4xi32> + %1 = mhlo.constant dense<[7, 3, 7, 2]> : tensor<4xi32> + %2 = "mhlo.or"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<[7, 3, 7, 3]> : tensor<4xi32> + // CHECK: return %0 + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_same +func @fold_xor_same(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = "mhlo.xor"(%arg0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<0> : tensor<4xi32> + // CHECK: return %0 + return %0 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_ones_left +func @fold_xor_ones_left(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + // CHECK: mhlo.xor + %1 = "mhlo.xor"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_ones_right +func @fold_xor_ones_right(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<-1> : tensor<4xi32> + // CHECK: mhlo.xor + %1 = "mhlo.xor"(%arg0, %0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_zeros_left +func @fold_xor_zeros_left(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.xor"(%0, %arg0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_zeros_right +func @fold_xor_zeros_right(%arg0 : tensor<4xi32>) -> tensor<4xi32> { + %0 = mhlo.constant dense<0> : tensor<4xi32> + %1 = "mhlo.xor"(%arg0, %0) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: return %arg0 + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_xor_zeros_constants +func @fold_xor_zeros_constants() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, 3]> : tensor<4xi32> + %1 = mhlo.constant dense<[7, 3, 7, 2]> : tensor<4xi32> + %2 = "mhlo.xor"(%0, %1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + // CHECK: %0 = mhlo.constant dense<[7, 2, 1, 1]> : tensor<4xi32> + // CHECK: return %0 + return %2 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_negate_int +func @fold_negate_int() -> tensor<4xi32> { + %0 = mhlo.constant dense<[0, 1, 6, -3]> : tensor<4xi32> + // CHECK: mhlo.constant dense<[0, -1, -6, 3]> + %1 = "mhlo.negate"(%0) : (tensor<4xi32>) -> tensor<4xi32> + return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fold_negate_float +func @fold_negate_float() -> tensor<4xf32> { + %0 = mhlo.constant dense<[0., 1., 6., -3.]> : tensor<4xf32> + // CHECK: mhlo.constant dense<[-0.000000e+00, -1.000000e+00, -6.000000e+00, 3.000000e+00]> + %1 = "mhlo.negate"(%0) : (tensor<4xf32>) -> 
tensor<4xf32> + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @fold_sqrt_f32_constants +func @fold_sqrt_f32_constants() -> tensor<4xf32> { + %0 = mhlo.constant dense<1.0> : tensor<4xf32> + %1 = "mhlo.sqrt"(%0) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: mhlo.constant dense<1.000000e+00> : tensor<4xf32> + // CHECK-NOT: mhlo.sqrt + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: func @fold_sqrt_f64_constants +func @fold_sqrt_f64_constants() -> tensor<4xf64> { + %0 = mhlo.constant dense<[1.0, 4.0, 9.0, 16.0]> : tensor<4xf64> + %1 = "mhlo.sqrt"(%0) : (tensor<4xf64>) -> tensor<4xf64> + // CHECK: mhlo.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf64> + // CHECK-NOT: mhlo.sqrt + return %1 : tensor<4xf64> +} + +// CHECK-LABEL: func @not_fold_sqrt_neg_constants +func @not_fold_sqrt_neg_constants() -> tensor<4xf32> { + %0 = mhlo.constant dense<-1.0> : tensor<4xf32> + %1 = "mhlo.sqrt"(%0) : (tensor<4xf32>) -> tensor<4xf32> + // CHECK: mhlo.constant dense<-1.000000e+00> : tensor<4xf32> + // CHECK: mhlo.sqrt + return %1 : tensor<4xf32> +} + +// CHECK-LABEL: @tensor_flow_scatter_v1_update +func @tensor_flow_scatter_v1_update() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[0, 2]> : tensor<2xi32> + %2 = constant dense<[[10, 20, 30], [70, 80, 90]]> : tensor<2x3xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = dense<0> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2xi32>, tensor<2x3xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [10, 20, 30], [4, 5, 6], [70, 80, 90] + // CHECK-SAME: ]> : tensor<3x3xi32> +} + +// CHECK-LABEL: @tensor_flow_scatter_v2_update +func @tensor_flow_scatter_v2_update() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[0, 2]> : tensor<2xi32> + %2 = constant dense<[[10, 30], [40, 60], [70, 90]]> : tensor<3x2xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = dense<1> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<1> : tensor<1xi64>, + update_window_dims = dense<[0]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2xi32>, tensor<3x2xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [10, 2, 30], [40, 5, 60], [70, 8, 90] + // CHECK-SAME: ]> : tensor<3x3xi32> +} + +// CHECK-LABEL: @tensor_flow_scatter_add +func @tensor_flow_scatter_add() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[0, 2]> : tensor<2xi32> + %2 = constant dense<[[10, 20, 30], [70, 80, 90]]> : tensor<2x3xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + %4 = "mhlo.add"(%arg0, %arg1) : (tensor, tensor) -> (tensor) + "mhlo.return"(%4) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = 
dense<0> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2xi32>, tensor<2x3xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [11, 22, 33], [4, 5, 6], [77, 88, 99] + // CHECK-SAME: ]> : tensor<3x3xi32> +} + +// CHECK-LABEL: @tensor_flow_scatter_repeated +func @tensor_flow_scatter_repeated() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[1, 1]> : tensor<2xi32> + %2 = constant dense<[[10, 20, 30], [70, 80, 90]]> : tensor<2x3xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + %4 = "mhlo.add"(%arg0, %arg1) : (tensor, tensor) -> (tensor) + "mhlo.return"(%4) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = dense<0> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2xi32>, tensor<2x3xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [1, 2, 3], [84, 105, 126], [7, 8, 9] + // CHECK-SAME: ]> : tensor<3x3xi32> +} + +// CHECK-LABEL: @tensor_flow_scatter_multiple_batch +func @tensor_flow_scatter_multiple_batch() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[[0, 2], [2, 1]]> : tensor<2x2xi32> + %2 = constant dense<[[[10, 30], [40, 60], [70, 90]], [[5, 5], [5, 5], [5, 5]]]> : tensor<2x3x2xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + %4 = "mhlo.add"(%arg0, %arg1) : (tensor, tensor) -> (tensor) + "mhlo.return"(%4) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 2 : i64, + inserted_window_dims = dense<1> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<1> : tensor<1xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2x2xi32>, tensor<2x3x2xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [11, 7, 38], [44, 10, 71], [77, 13, 104] + // CHECK-SAME: ]> : tensor<3x3xi32> +} + +// CHECK-LABEL: @tensor_flow_scatter_nd +func @tensor_flow_scatter_nd() -> tensor<3x3x2xi32> { + %0 = constant dense<[[[-1, 1], [-2, 2], [-3, 3]], [[-4, 4], [-5, 5], [-6, 6]], [[-7, 7], [-8, 8], [-9, 9]]]> : tensor<3x3x2xi32> + %1 = constant dense<[[0, 0], [1, 0]]> : tensor<2x2xi32> + %2 = constant dense<[[-10, 10], [-40, 40]]> : tensor<2x2xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = dense<[0, 1]> : tensor<2xi64>, + scatter_dims_to_operand_dims = dense<[0, 1]> : tensor<2xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3x2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<3x3x2xi32> + return %3 : tensor<3x3x2xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [-10, 10], [-2, 2], [-3, 3] + // CHECK-SAME: [-40, 40], [-5, 5], [-6, 6] + // CHECK-SAME: [-7, 7], [-8, 8], [-9, 9] + // CHECK-SAME: ]> : tensor<3x3x2xi32> +} + +// 
CHECK-LABEL: @tensor_flow_scatter_nd_index_vector +func @tensor_flow_scatter_nd_index_vector() -> tensor<3x3x2xi32> { + %0 = constant dense<[[[-1, 1], [-2, 2], [-3, 3]], [[-4, 4], [-5, 5], [-6, 6]], [[-7, 7], [-8, 8], [-9, 9]]]> : tensor<3x3x2xi32> + %1 = constant dense<[[0, 0], [1, 0]]> : tensor<2x2xi32> + %2 = constant dense<[[-10, 10], [-20, 20]]> : tensor<2x2xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 0 : i64, + inserted_window_dims = dense<[0, 1]> : tensor<2xi64>, + scatter_dims_to_operand_dims = dense<[0, 1]> : tensor<2xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3x2xi32>, tensor<2x2xi32>, tensor<2x2xi32>) -> tensor<3x3x2xi32> + return %3 : tensor<3x3x2xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [-20, 20], [-10, 10], [-3, 3] + // CHECK-SAME: [-4, 4], [-5, 5], [-6, 6] + // CHECK-SAME: [-7, 7], [-8, 8], [-9, 9] + // CHECK-SAME: ]> : tensor<3x3x2xi32> +} + +// CHECK-LABEL: @scatter_batch_dus +func @scatter_batch_dus() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[[2, 1], [1, 1]]> : tensor<2x2xi32> + %2 = constant dense<[[[10]], [[20]]]> : tensor<2x1x1xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 0 : i64, + inserted_window_dims = dense<> : tensor<0xi64>, + scatter_dims_to_operand_dims = dense<[0, 1]> : tensor<2xi64>, + update_window_dims = dense<[1, 2]> : tensor<2xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2x2xi32>, tensor<2x1x1xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: mhlo.constant dense<[ + // CHECK-SAME: [1, 2, 3], [4, 20, 6], [7, 10, 9] + // CHECK-SAME: ]> : tensor<3x3xi32> +} + +// CHECK-LABEL: @scatter_no_update_window_dim +func @scatter_no_update_window_dim() -> tensor<3xi32> { + %0 = constant dense<[0, 1, 2]> : tensor<3xi32> + %1 = constant dense<[[[0], [1]], [[2], [1]]]> : tensor<2x2x1xi32> + %2 = constant dense<[[10, 20], [30, 40]]> : tensor<2x2xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + %4 = "mhlo.add"(%arg0, %arg1) : (tensor, tensor) -> (tensor) + "mhlo.return"(%4) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 2 : i64, + inserted_window_dims = dense<0> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>, + update_window_dims = dense<> : tensor<0xi64> + }, + unique_indices = false + } : (tensor<3xi32>, tensor<2x2x1xi32>, tensor<2x2xi32>) -> tensor<3xi32> + return %3 : tensor<3xi32> + // CHECK: mhlo.constant dense<[10, 61, 32]> : tensor<3xi32> +} + +// CHECK-LABEL: @scatter_negative_index +func @scatter_negative_index() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[0, -1]> : tensor<2xi32> + %2 = constant dense<[[10, 20, 30], [70, 80, 90]]> : tensor<2x3xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = dense<0> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>, + 
update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2xi32>, tensor<2x3xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: constant dense<[ + // CHECK-SAME: [1, 2, 3], [4, 5, 6], [7, 8, 9] + // CHECK-SAME: ]> : tensor<3x3xi32> + // CHECK: "mhlo.scatter" +} + +// CHECK-LABEL: @scatter_out_of_bound +func @scatter_out_of_bound() -> tensor<3x3xi32> { + %0 = constant dense<[[1, 2, 3], [4, 5, 6], [7, 8, 9]]> : tensor<3x3xi32> + %1 = constant dense<[1, 5]> : tensor<2xi32> + %2 = constant dense<[[10, 20, 30], [70, 80, 90]]> : tensor<2x3xi32> + %3 = "mhlo.scatter"(%0, %1, %2) ( { + ^bb0(%arg0: tensor, %arg1: tensor): + "mhlo.return"(%arg1) : (tensor) -> () + }) {indices_are_sorted = false, + scatter_dimension_numbers = { + index_vector_dim = 1 : i64, + inserted_window_dims = dense<0> : tensor<1xi64>, + scatter_dims_to_operand_dims = dense<0> : tensor<1xi64>, + update_window_dims = dense<[1]> : tensor<1xi64> + }, + unique_indices = false + } : (tensor<3x3xi32>, tensor<2xi32>, tensor<2x3xi32>) -> tensor<3x3xi32> + return %3 : tensor<3x3xi32> + // CHECK: constant dense<[ + // CHECK-SAME: [1, 2, 3], [4, 5, 6], [7, 8, 9] + // CHECK-SAME: ]> : tensor<3x3xi32> + // CHECK: "mhlo.scatter" +} + diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir index d226c92858a..0738459f8b6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_infer_shape_type_methods.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt -mhlo-test-infer-shaped-type-methods -allow-unregistered-dialect -split-input-file -verify-diagnostics %s -o - | FileCheck %s +// RUN: mlir-hlo-opt --mhlo-test-infer-shaped-type-methods --allow-unregistered-dialect --split-input-file %s | FileCheck %s // CHECK-LABEL: @broadcast_add // Note that all broadcast_ops are expanded from the same template, so diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir index 9670372a864..60ec26f48a1 100644 --- a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt -mhlo-test-chlo-legalize-to-hlo -cse -split-input-file -verify-diagnostics %s -o - | FileCheck %s +// RUN: mlir-hlo-opt -chlo-legalize-to-hlo -cse -split-input-file -verify-diagnostics %s -o - | FileCheck %s // Check the non-broadcast case for each registered op, then just check a // representative op for detailed broadcast semantics. @@ -253,7 +253,7 @@ func @addScalarUnranked(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<*xf3 // to a 1D tensor. // CHECK: %[[SHAPE_1:.*]] = shape.shape_of %[[ARG_1]] : tensor<*xf32> // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE_1]] : tensor -> index -// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> +// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK: %[[RESHAPED:.*]] = "mhlo.dynamic_reshape"(%[[ARG_1]], %[[SIZE_TENSOR]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // The assuming region is part of the second stage of lowering // with ranked broadcasting logic. @@ -288,7 +288,7 @@ func @addUnrankedScalar(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<*xf3 // to a 1D tensor. 
// CHECK: %[[SHAPE_0:.*]] = shape.shape_of %[[ARG_0]] : tensor<*xf32> // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE_0]] : tensor -> index -// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> +// CHECK: %[[SIZE_TENSOR:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK: %[[RESHAPED:.*]] = "mhlo.dynamic_reshape"(%[[ARG_0]], %[[SIZE_TENSOR]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // The assuming region is part of the second stage of lowering // with ranked broadcasting logic. @@ -325,7 +325,7 @@ func @addUnrankedUnranked( // CHECK: %[[LHS_IS_SCALAR:.*]] = cmpi "eq", %[[RANK_LHS]], %[[C0]] : index // Handle scalar LHS case // CHECK: %[[VAL_8:.*]] = scf.if %[[LHS_IS_SCALAR]] -> (tensor<*xf32>) { -// CHECK: %[[SCALAR_LHS:.*]] = "mhlo.reshape"(%[[LHS]]) : (tensor<*xf32>) -> tensor +// CHECK: %[[SCALAR_LHS:.*]] = tensor_cast %[[LHS]] : tensor<*xf32> to tensor // CHECK: %[[VAL_10:.*]] = chlo.broadcast_add %[[SCALAR_LHS]], %[[RHS]] : (tensor, tensor<*xf32>) -> tensor<*xf32> // CHECK: scf.yield %[[VAL_10]] : tensor<*xf32> // CHECK: } else { @@ -334,7 +334,7 @@ func @addUnrankedUnranked( // CHECK: %[[RHS_IS_SCALAR:.*]] = cmpi "eq", %[[RANK_RHS]], %[[C0]] : index // Handle scalar RHS case // CHECK: %[[VAL_14:.*]] = scf.if %[[RHS_IS_SCALAR]] -> (tensor<*xf32>) { -// CHECK: %[[SCALAR_RHS:.*]] = "mhlo.reshape"(%[[RHS]]) : (tensor<*xf32>) -> tensor +// CHECK: %[[SCALAR_RHS:.*]] = tensor_cast %[[RHS]] : tensor<*xf32> to tensor // CHECK: %[[VAL_16:.*]] = chlo.broadcast_add %[[LHS]], %[[SCALAR_RHS]] : (tensor<*xf32>, tensor) -> tensor<*xf32> // CHECK: scf.yield %[[VAL_16]] : tensor<*xf32> // CHECK: } else { @@ -353,10 +353,12 @@ func @addUnrankedUnranked( // Handle rank 2 specialization // CHECK: %[[VAL_26:.*]] = scf.if %[[GREATEST_RANK_IS_2]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_2:.*]] = shape.const_shape [1, 1] -// CHECK: %[[BROADCASTED_LHS_2:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor<2xindex> -// CHECK: %[[BROADCASTED_RHS_2:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor<2xindex> -// CHECK: %[[RESHAPED_LHS_2:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_2:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_2:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor +// CHECK: %[[CASTED_LHS_2:.*]] = tensor_cast %[[BROADCASTED_LHS_2]] : tensor to tensor<2xindex> +// CHECK: %[[BROADCASTED_RHS_2:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_2]] : tensor, tensor<2xindex> -> tensor +// CHECK: %[[CASTED_RHS_2:.*]] = tensor_cast %[[BROADCASTED_RHS_2]] : tensor to tensor<2xindex> +// CHECK: %[[RESHAPED_LHS_2:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_2:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_2]]) : (tensor<*xf32>, tensor<2xindex>) -> tensor // CHECK: %[[RESULT_RANK_2:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_2]], %[[RESHAPED_RHS_2]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_2:.*]] = tensor_cast %[[RESULT_RANK_2]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_2]] : tensor<*xf32> @@ -366,10 +368,12 @@ func @addUnrankedUnranked( // Handle rank 3 specialization // CHECK: %[[VAL_34:.*]] = scf.if 
%[[GREATEST_RANK_IS_3]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_3:.*]] = shape.const_shape [1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_3:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor<3xindex> -// CHECK: %[[BROADCASTED_RHS_3:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor<3xindex> -// CHECK: %[[RESHAPED_LHS_3:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_3:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_3:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor +// CHECK: %[[CASTED_LHS_3:.*]] = tensor_cast %[[BROADCASTED_LHS_3]] : tensor to tensor<3xindex> +// CHECK: %[[BROADCASTED_RHS_3:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_3]] : tensor, tensor<3xindex> -> tensor +// CHECK: %[[CASTED_RHS_3:.*]] = tensor_cast %[[BROADCASTED_RHS_3]] : tensor to tensor<3xindex> +// CHECK: %[[RESHAPED_LHS_3:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_3:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_3]]) : (tensor<*xf32>, tensor<3xindex>) -> tensor // CHECK: %[[RESULT_RANK_3:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_3]], %[[RESHAPED_RHS_3]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_3:.*]] = tensor_cast %[[RESULT_RANK_3]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_3]] : tensor<*xf32> @@ -379,10 +383,12 @@ func @addUnrankedUnranked( // Handle rank 4 specialization // CHECK: %[[VAL_42:.*]] = scf.if %[[GREATEST_RANK_IS_4]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_4:.*]] = shape.const_shape [1, 1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_4:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor<4xindex> -// CHECK: %[[BROADCASTED_RHS_4:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor<4xindex> -// CHECK: %[[RESHAPED_LHS_4:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_4:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_4:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor +// CHECK: %[[CASTED_LHS_4:.*]] = tensor_cast %[[BROADCASTED_LHS_4]] : tensor to tensor<4xindex> +// CHECK: %[[BROADCASTED_RHS_4:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_4]] : tensor, tensor<4xindex> -> tensor +// CHECK: %[[CASTED_RHS_4:.*]] = tensor_cast %[[BROADCASTED_RHS_4]] : tensor to tensor<4xindex> +// CHECK: %[[RESHAPED_LHS_4:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_4:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_4]]) : (tensor<*xf32>, tensor<4xindex>) -> tensor // CHECK: %[[RESULT_RANK_4:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_4]], %[[RESHAPED_RHS_4]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_4:.*]] = tensor_cast %[[RESULT_RANK_4]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_4]] : tensor<*xf32> @@ -392,10 +398,12 @@ func @addUnrankedUnranked( // Handle rank 5 specialization // CHECK: %[[VAL_50:.*]] = scf.if %[[GREATEST_RANK_IS_5]] -> (tensor<*xf32>) { // CHECK: 
%[[CONST_SHAPE_5:.*]] = shape.const_shape [1, 1, 1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_5:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor<5xindex> -// CHECK: %[[BROADCASTED_RHS_5:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor<5xindex> -// CHECK: %[[RESHAPED_LHS_5:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_5:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_5:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor +// CHECK: %[[CASTED_LHS_5:.*]] = tensor_cast %[[BROADCASTED_LHS_5]] : tensor to tensor<5xindex> +// CHECK: %[[BROADCASTED_RHS_5:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_5]] : tensor, tensor<5xindex> -> tensor +// CHECK: %[[CASTED_RHS_5:.*]] = tensor_cast %[[BROADCASTED_RHS_5]] : tensor to tensor<5xindex> +// CHECK: %[[RESHAPED_LHS_5:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_5:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_5]]) : (tensor<*xf32>, tensor<5xindex>) -> tensor // CHECK: %[[RESULT_RANK_5:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_5]], %[[RESHAPED_RHS_5]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_5:.*]] = tensor_cast %[[RESULT_RANK_5]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_5]] : tensor<*xf32> @@ -405,10 +413,12 @@ func @addUnrankedUnranked( // Handle rank 6 specialization // CHECK: %[[VAL_58:.*]] = scf.if %[[GREATEST_RANK_IS_6]] -> (tensor<*xf32>) { // CHECK: %[[CONST_SHAPE_6:.*]] = shape.const_shape [1, 1, 1, 1, 1, 1] -// CHECK: %[[BROADCASTED_LHS_6:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor<6xindex> -// CHECK: %[[BROADCASTED_RHS_6:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor<6xindex> -// CHECK: %[[RESHAPED_LHS_6:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[BROADCASTED_LHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor -// CHECK: %[[RESHAPED_RHS_6:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[BROADCASTED_RHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor +// CHECK: %[[BROADCASTED_LHS_6:.*]] = shape.broadcast %[[LHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor +// CHECK: %[[CASTED_LHS_6:.*]] = tensor_cast %[[BROADCASTED_LHS_6]] : tensor to tensor<6xindex> +// CHECK: %[[BROADCASTED_RHS_6:.*]] = shape.broadcast %[[RHS_SHAPE]], %[[CONST_SHAPE_6]] : tensor, tensor<6xindex> -> tensor +// CHECK: %[[CASTED_RHS_6:.*]] = tensor_cast %[[BROADCASTED_RHS_6]] : tensor to tensor<6xindex> +// CHECK: %[[RESHAPED_LHS_6:.*]] = "mhlo.dynamic_reshape"(%[[LHS]], %[[CASTED_LHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor +// CHECK: %[[RESHAPED_RHS_6:.*]] = "mhlo.dynamic_reshape"(%[[RHS]], %[[CASTED_RHS_6]]) : (tensor<*xf32>, tensor<6xindex>) -> tensor // CHECK: %[[RESULT_RANK_6:.*]] = chlo.broadcast_add %[[RESHAPED_LHS_6]], %[[RESHAPED_RHS_6]] : (tensor, tensor) -> tensor // CHECK: %[[RESULT_6:.*]] = tensor_cast %[[RESULT_RANK_6]] : tensor to tensor<*xf32> // CHECK: scf.yield %[[RESULT_6]] : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_mhlo.mlir b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_mhlo.mlir new file mode 100644 index 00000000000..2bec91203f9 --- /dev/null +++ 
b/tensorflow/compiler/mlir/hlo/tests/chlo_legalize_to_mhlo.mlir @@ -0,0 +1,26 @@ +// RUN: mlir-hlo-opt --chlo-legalize-to-hlo --split-input-file %s | FileCheck %s + +// Lower statically shaped `constant_like` to constant. +// CHECK-LABEL: @constant_like_static_shape +func @constant_like_static_shape(%arg : tensor<1x2xi64>) -> tensor<1x2xf32> { + // CHECK: %[[RESULT:.*]] = mhlo.constant dense<3.200000e+00> : tensor<1x2xf32> + // CHECK: return %[[RESULT]] + %result = "chlo.constant_like"(%arg) { value = 3.2 : f32 } + : (tensor<1x2xi64>) -> tensor<1x2xf32> + return %result : tensor<1x2xf32> +} + +// Lower dynamically shaped `constant_like` to broadcasted constant. +// CHECK-LABEL: constant_like_dynamic_shape +// CHECK-SAME: (%[[ARG:.*]]: tensor) +func @constant_like_dynamic_shape(%arg : tensor) -> tensor { + // CHECK: %[[CONSTANT:.*]] = mhlo.constant dense<3.200000e+00> : tensor + // CHECK: %[[UNCASTED_SHAPE:.*]] = shape.shape_of %[[ARG]] : tensor -> tensor + // CHECK: %[[SHAPE:.*]] = tensor_cast %[[UNCASTED_SHAPE]] : tensor to tensor<2xindex> + // CHECK: %[[BROADCASTED_CONSTANT:.*]] = "mhlo.dynamic_broadcast_in_dim"(%[[CONSTANT]], %[[SHAPE]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK: return %[[BROADCASTED_CONSTANT]] : tensor + %result = "chlo.constant_like"(%arg) { value = 3.2 : f32 } + : (tensor) -> tensor + return %result : tensor +} + diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir index 018711e33cb..f6fdc4439bb 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-lhlo.mlir @@ -170,7 +170,7 @@ func @dyn_broadcast(%operand: memref) { // BOTH-SAME: (%[[OPERAND:.*]]: memref) %tensor_operand = tensor_load %operand : memref %c1 = constant 1 : i64 - %shape = tensor_from_elements(%c1, %c1, %c1) : tensor<3xi64> + %shape = tensor_from_elements %c1, %c1, %c1 : tensor<3xi64> %tensor_result = "mhlo.dynamic_broadcast_in_dim"(%tensor_operand, %shape) { broadcast_dimensions = dense<[1, 2]> : tensor<2xi64> } : (tensor, tensor<3xi64>) -> tensor @@ -236,6 +236,21 @@ func @complex(%real: memref<2x2xf32>, // ----- +// BOTH-LABEL: func @complex_dyn +func @complex_dyn(%real: memref, + %imag: memref, + %result: memref>) { + %tensor_real = tensor_load %real : memref + %tensor_imag = tensor_load %imag : memref + %tensor_result = "mhlo.complex"(%tensor_real, %tensor_imag) + : (tensor, tensor) -> tensor> + // BOTH: "lmhlo.complex"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref> + return +} + +// ----- + // BOTH-LABEL: func @real func @real(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xcomplex> @@ -248,6 +263,18 @@ func @real(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { // ----- +// BOTH-LABEL: func @real_dyn +func @real_dyn(%operand: memref>, %result: memref) { + %tensor_operand = tensor_load %operand : memref> + %tensor_result = "mhlo.real"(%tensor_operand) + : (tensor>) -> tensor + // BOTH: "lmhlo.real"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref + return +} + +// ----- + // BOTH-LABEL: func @imag func @imag(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xcomplex> @@ -260,6 +287,18 @@ func @imag(%operand: memref<2x2xcomplex>, %result: memref<2x2xf32>) { // ----- +// BOTH-LABEL: func @imag_dyn +func 
@imag_dyn(%operand: memref>, %result: memref) { + %tensor_operand = tensor_load %operand : memref> + %tensor_result = "mhlo.imag"(%tensor_operand) + : (tensor>) -> tensor + // BOTH: "lmhlo.imag"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref + return +} + +// ----- + // BOTH-LABEL: func @iota func @iota(%result: memref<10xi32>) { %tensor_result = "mhlo.iota"() @@ -320,6 +359,18 @@ func @cos(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +// BOTH-LABEL: func @floor +func @floor(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "mhlo.floor"(%tensor_operand) + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // BOTH: "lmhlo.floor"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + // BOTH-LABEL: func @neg func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -332,6 +383,18 @@ func @neg(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +// BOTH-LABEL: func @not +func @not(%operand: memref<2x2xi32>, %result: memref<2x2xi32>) { + %tensor_operand = tensor_load %operand : memref<2x2xi32> + %tensor_result = "mhlo.not"(%tensor_operand) + : (tensor<2x2xi32>) -> tensor<2x2xi32> + // BOTH: "lmhlo.not"(%{{.*}}, %{{.*}}) + tensor_store %tensor_result, %result : memref<2x2xi32> + return +} + +// ----- + // BOTH-LABEL: func @rsqrt func @rsqrt(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { %tensor_operand = tensor_load %operand : memref<2x2xf32> @@ -404,7 +467,7 @@ func @add_dyn(%lhs: tensor, %rhs: tensor) { // BOTH: %[[C1:.*]] = constant 1 : index // BOTH: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref // BOTH: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // BOTH: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> + // BOTH: %[[SHAPE:.*]] = tensor_from_elements %[[IC0]], %[[IC1]] : tensor<2xi64> // BOTH: %[[C0_:.*]] = constant 0 : index // BOTH: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> // BOTH: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -429,7 +492,7 @@ func @tanh_dyn(%arg0: tensor) { // BOTH: %[[C1:.*]] = constant 1 : index // BOTH: %[[DIM1:.*]] = dim %arg0, %[[C1]] : memref // BOTH: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // BOTH: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> + // BOTH: %[[SHAPE:.*]] = tensor_from_elements %[[IC0]], %[[IC1]] : tensor<2xi64> // BOTH: %[[C0_:.*]] = constant 0 : index // BOTH: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0_]]] : tensor<2xi64> // BOTH: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -448,7 +511,13 @@ func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { // PRE-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) // ESC-SAME: (%[[ARG0:.*]]: [[TYPE:.*]]) -> [[TYPE]] // BOTH-NEXT: %[[ALLOC:.*]] = alloc -// BOTH: "lmhlo.dot"(%[[ARG0]], %[[ARG0]], %[[ALLOC]]) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () +// BOTH: "lmhlo.dot"(%[[ARG0]], %[[ARG0]], %[[ALLOC]]) { +// dot_dimension_numbers = { +// lhs_batching_dimensions = dense<> : tensor<0xi64>, +// lhs_contracting_dimensions = dense<1> : tensor<1xi64>, +// rhs_batching_dimensions = dense<> : tensor<0xi64>, +// rhs_contracting_dimensions = dense<0> : tensor<1xi64>}} +// : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () %dot = "mhlo.dot"(%arg0, %arg0) : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> // PRE: "lmhlo.copy"(%[[ALLOC]], 
%[[RESULT]]) @@ -510,3 +579,63 @@ func @reduce(%arg0: tensor<1x8xf32>, %arg1: tensor) -> tensor<1xf32> { : (tensor<1x8xf32>, tensor) -> tensor<1xf32> return %0 : tensor<1xf32> } + +// ----- + +// BOTH-LABEL: func @transpose +func @transpose(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { + %tensor_operand = tensor_load %operand : memref<2x2xf32> + %tensor_result = "mhlo.transpose"(%tensor_operand) {permutation = dense<[1, 0]> : tensor<2xi64>} + : (tensor<2x2xf32>) -> tensor<2x2xf32> + // BOTH: "lmhlo.transpose"(%{{.*}}, %{{.*}}) {permutation = dense<[1, 0]> : tensor<2xi64>} + // BOTH-NOT: tensor_store + tensor_store %tensor_result, %result : memref<2x2xf32> + return +} + +// ----- + +// BOTH-LABEL: func @custom_call +// BOTH-SAME:([[ARG0:%.*]]: memref<2x2xf32>, [[ARG1:%.*]]: memref<2x3xf32>, [[RESULT:%.*]]: memref<4x4xf16>) +func @custom_call(%arg0: memref<2x2xf32>, %arg1: memref<2x3xf32>, %result: memref<4x4xf16>) { + %arg0_tensor = tensor_load %arg0 : memref<2x2xf32> + %arg1_tensor = tensor_load %arg1 : memref<2x3xf32> + // BOTH: "lmhlo.custom_call"([[ARG0]], [[ARG1]], %{{.*}}) {backend_config = "", call_target_name = "foo", has_side_effect = false} + %result_tensor = "mhlo.custom_call"(%arg0_tensor, %arg1_tensor) + {backend_config = "", call_target_name = "foo", has_side_effect = false} + : (tensor<2x2xf32>, tensor<2x3xf32>) -> tensor<4x4xf16> + tensor_store %result_tensor, %result: memref<4x4xf16> + return +} + +// ---- + +// BOTH-LABEL: func @isfinite +func @isfinite(%arg0: memref<2x2xf32>, %result: memref<2x2xi1>) { + %arg0_tensor = tensor_load %arg0 : memref<2x2xf32> + // BOTH: "lmhlo.is_finite"(%{{.*}}, %{{.*}}) + %result_tensor = "mhlo.is_finite"(%arg0_tensor) : (tensor<2x2xf32>) -> tensor<2x2xi1> + tensor_store %result_tensor, %result: memref<2x2xi1> + return +} + +// ----- + +// Test that assuming ops propagate memref types. 
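+// The shape.assuming region below yields a tensor produced by two dynamic
+// broadcasts and an mhlo.maximum; after bufferization the region and its
+// assuming_yield are expected to carry memref types, and the maximum should
+// lower to lmhlo.maximum taking its result as an output buffer operand.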
+// BOTH-LABEL: func @shape_assuming_memref +func @shape_assuming_memref(%arg0: tensor) -> tensor { + %0 = mhlo.constant dense<0.000000e+00> : tensor + %1 = shape.const_witness true + // BOTH: shape.assuming %{{.*}} -> (memref) + %2 = shape.assuming %1 -> (tensor) { + %3 = shape.shape_of %arg0 : tensor -> tensor + %4 = tensor_cast %3 : tensor to tensor<1xindex> + %5 = "mhlo.dynamic_broadcast_in_dim"(%0, %4) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor + %6 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %4) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor, tensor<1xindex>) -> tensor + // BOTH: "lmhlo.maximum"(%6, %9, %20) : (memref, memref, memref) -> () + %7 = mhlo.maximum %5, %6 : tensor + // BOTH: shape.assuming_yield %{{.*}} : memref + shape.assuming_yield %7 : tensor + } + return %2 : tensor +} diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir index aecf612962a..91490b43f95 100644 --- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir @@ -152,6 +152,16 @@ func @float_ceil(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // ----- +// CHECK-LABEL: func @floor +func @floor(%input: tensor<2x2xf32>) -> tensor<2x2xf32> { + // CHECK: linalg.generic + // CHECK: floorf + %0 = "mhlo.floor"(%input) : (tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + // CHECK-LABEL: func @float_neg func @float_neg(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: linalg.generic @@ -242,6 +252,20 @@ func @copy(%input: tensor<2x4x8xf32>) -> tensor<2x4x8xf32> { // ----- +// CHECK-LABEL: func @is_finte +func @is_finte(%input: tensor<2x2xf32>) -> tensor<2x2xi1> { + %0 = "mhlo.is_finite"(%input) : (tensor<2x2xf32>) -> tensor<2x2xi1> + return %0 : tensor<2x2xi1> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32 +// CHECK-NEXT: %[[POS_INF:.+]] = constant 0x7F800000 : f32 +// CHECK-NEXT: %[[ABS_X:.+]] = absf %[[OPERAND_IN]] : f32 +// CHECK-NEXT: %[[RESULT:.+]] = cmpf "one", %[[ABS_X]], %[[POS_INF]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i1 + +// ----- + // CHECK-LABEL: func @select func @select(%pred: tensor<2x2xi1>, %lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { @@ -385,6 +409,28 @@ func @reshape_3D_4D(%arg0: tensor<1x49x16xf32>) -> tensor<1x784x1x1xf32> { // ----- +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape1_4D_4D +func @reshape1_4D_4D(%arg0: tensor<4x512x1x1xi32>) -> tensor<1x4x1x512xi32> { + %0 = "mhlo.reshape"(%arg0) : (tensor<4x512x1x1xi32>) -> tensor<1x4x1x512xi32> + return %0 : tensor<1x4x1x512xi32> +} +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape2_4D_4D +func @reshape2_4D_4D(%arg0: tensor<4x1x1x1024xi32>) -> tensor<4x1024x1x1xi32> { + %0 = "mhlo.reshape"(%arg0) : (tensor<4x1x1x1024xi32>) -> tensor<4x1024x1x1xi32> + return %0 : tensor<4x1024x1x1xi32> +} +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.tensor_reshape %{{.*}} [#[[MAP]]] + +// ----- + // CHECK-LABEL: func @minf func @minf(%lhs: tensor<2x2xf32>, %rhs: tensor<2x2xf32>) -> tensor<2x2xf32> { %0 = "mhlo.minimum"(%lhs, %rhs) diff --git 
a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-transform-unranked.mlir similarity index 74% rename from tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir rename to tensorflow/compiler/mlir/hlo/tests/hlo-transform-unranked.mlir index 01ef250efd0..ae61fc8477e 100644 --- a/tensorflow/compiler/mlir/hlo/tests/mhlo-transform-unranked.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/hlo-transform-unranked.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt -transform-unranked-hlo -split-input-file %s | FileCheck %s +// RUN: mlir-hlo-opt --transform-unranked-hlo --split-input-file %s | FileCheck %s // Check the validity of expected IR. // CHECK-LABEL: @sqr_transform_result @@ -7,7 +7,7 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { // Flatten operand shape. %shape = shape.shape_of %a : tensor<*xf32> -> tensor %num_elements = shape.num_elements %shape : tensor -> index - %flat_shape = tensor_from_elements(%num_elements) : tensor<1xindex> + %flat_shape = tensor_from_elements %num_elements : tensor<1xindex> %flat_a = "mhlo.dynamic_reshape"(%a, %flat_shape) : (tensor<*xf32>, tensor<1xindex>) -> tensor @@ -29,7 +29,7 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { func @sqrt(%a: tensor<*xf32>) -> tensor<*xf32> { // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> -> tensor // CHECK-NEXT: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] - // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> + // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK-NEXT: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK-NEXT: %[[FLAT_B:.*]] = "mhlo.sqrt"(%[[FLAT_A]]) : (tensor) -> tensor // CHECK-NEXT: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE]]) : (tensor, tensor) -> tensor<*xf32> @@ -71,7 +71,7 @@ func @add_unranked(%a : tensor<*xf32>, %b : tensor<*xf32>) -> tensor<*xf32> { // CHECK: %[[SHAPE_B:.*]] = shape.shape_of %[[B]] // CHECK: %[[SHAPE:.*]] = shape.any %[[SHAPE_A]], %[[SHAPE_B]] // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] - // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS]]) : tensor<1xindex> + // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> // CHECK: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK: %[[FLAT_B:.*]] = "mhlo.dynamic_reshape"(%[[B]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK: %[[FLAT_RESULT:.*]] = mhlo.add %[[FLAT_A]], %[[FLAT_B]] : tensor @@ -80,3 +80,19 @@ func @add_unranked(%a : tensor<*xf32>, %b : tensor<*xf32>) -> tensor<*xf32> { %result = mhlo.add %a, %b : tensor<*xf32> return %result : tensor<*xf32> } + +// ----- + +// CHECK-LABEL: @tan +// CHECK-SAME: (%[[A:.*]]: tensor<*xf32>) -> tensor<*xf32> +func @tan(%a : tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> -> tensor + // CHECK: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] + // CHECK: %[[FLAT_SHAPE:.*]] = tensor_from_elements %[[NUM_ELEMENTS]] : tensor<1xindex> + // CHECK: %[[FLAT_A:.*]] = "mhlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor + // CHECK: %[[FLAT_B:.*]] = chlo.tan %[[FLAT_A]] : tensor + // CHECK: %[[B:.*]] = "mhlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE]]) : (tensor, 
tensor) -> tensor<*xf32> + // CHECK: return %[[B]] : tensor<*xf32> + %result = chlo.tan %a : tensor<*xf32> + return %result : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir b/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir index abe4e872b73..404be85e05e 100644 --- a/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/legalize-to-std.mlir @@ -51,38 +51,38 @@ func @unary_ops_float(%arg0: tensor<4xf32>) -> tensor<4xf32> { return %0 : tensor<4xf32> } -// CHECK-LABEL: func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>) { -func @compare_int(%arg0: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { - // CHECK-NEXT: %0 = cmpi "eq", %arg0, %arg0 : tensor<4xi32> - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> - // CHECK-NEXT: %1 = cmpi "ne", %arg0, %arg0 : tensor<4xi32> - %1 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> - // CHECK-NEXT: %2 = cmpi "slt", %arg0, %arg0 : tensor<4xi32> - %2 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> - // CHECK-NEXT: %3 = cmpi "sle", %arg0, %arg0 : tensor<4xi32> - %3 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> - // CHECK-NEXT: %4 = cmpi "sgt", %arg0, %arg0 : tensor<4xi32> - %4 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> - // CHECK-NEXT: %5 = cmpi "sge", %arg0, %arg0 : tensor<4xi32> - %5 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> +// CHECK-LABEL: func @compare_int +func @compare_int(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { + // CHECK-NEXT: %0 = cmpi "eq", %arg0, %arg1 : tensor<4xi32> + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + // CHECK-NEXT: %1 = cmpi "ne", %arg0, %arg1 : tensor<4xi32> + %1 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + // CHECK-NEXT: %2 = cmpi "slt", %arg0, %arg1 : tensor<4xi32> + %2 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + // CHECK-NEXT: %3 = cmpi "sle", %arg0, %arg1 : tensor<4xi32> + %3 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + // CHECK-NEXT: %4 = cmpi "sgt", %arg0, %arg1 : tensor<4xi32> + %4 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> + // CHECK-NEXT: %5 = cmpi "sge", %arg0, %arg1 : tensor<4xi32> + %5 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GE"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi1> // CHECK-NEXT: return %0, %1, %2, %3, %4, %5 : tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1> return %0, %1, %2, %3, %4, %5 : tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1> } // CHECK-LABEL: func @compare_float -func @compare_float(%arg0: tensor<4xf32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { - // 
CHECK-NEXT: %0 = cmpf "oeq", %arg0, %arg0 : tensor<4xf32> - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> - // CHECK-NEXT: %1 = cmpf "une", %arg0, %arg0 : tensor<4xf32> - %1 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> - // CHECK-NEXT: %2 = cmpf "olt", %arg0, %arg0 : tensor<4xf32> - %2 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> - // CHECK-NEXT: %3 = cmpf "ole", %arg0, %arg0 : tensor<4xf32> - %3 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> - // CHECK-NEXT: %4 = cmpf "ogt", %arg0, %arg0 : tensor<4xf32> - %4 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> - // CHECK-NEXT: %5 = cmpf "oge", %arg0, %arg0 : tensor<4xf32> - %5 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> +func @compare_float(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> (tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>,tensor<4xi1>) { + // CHECK-NEXT: %0 = cmpf "oeq", %arg0, %arg1 : tensor<4xf32> + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + // CHECK-NEXT: %1 = cmpf "une", %arg0, %arg1 : tensor<4xf32> + %1 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + // CHECK-NEXT: %2 = cmpf "olt", %arg0, %arg1 : tensor<4xf32> + %2 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + // CHECK-NEXT: %3 = cmpf "ole", %arg0, %arg1 : tensor<4xf32> + %3 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + // CHECK-NEXT: %4 = cmpf "ogt", %arg0, %arg1 : tensor<4xf32> + %4 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> + // CHECK-NEXT: %5 = cmpf "oge", %arg0, %arg1 : tensor<4xf32> + %5 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GE"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xi1> return %0, %1, %2, %3, %4, %5: tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1>, tensor<4xi1> } diff --git a/tensorflow/compiler/mlir/hlo/tests/legalize-trigonometric-to-approximation.mlir b/tensorflow/compiler/mlir/hlo/tests/legalize-trigonometric-to-approximation.mlir new file mode 100644 index 00000000000..c25545ca2bd --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/legalize-trigonometric-to-approximation.mlir @@ -0,0 +1,380 @@ +// RUN: mlir-hlo-opt --mhlo-legalize-trigonometric-to-approximation --split-input-file %s | FileCheck %s + +func @tanh_f64(%arg0 : f64) -> f64 { + %res = tanh %arg0 : f64 + return %res : f64 +} + +// CHECK-LABEL: @tanh_f64 +// CHECK: tanh + +// ----- + +func @tanh_f32(%arg0 : f32) -> f32 { + %res = tanh %arg0 : f32 + return %res : f32 +} + +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py + +// CHECK-LABEL: func @tanh_f32 +// CHECK-SAME: (%[[VAL_0:.*]]: f32) -> f32 +// CHECK: %[[VAL_1:.*]] = constant 4.000000e-04 : f32 +// CHECK: %[[VAL_2:.*]] = constant 7.90531111 : f32 +// CHECK: %[[VAL_3:.*]] = constant -7.90531111 : f32 +// CHECK: %[[VAL_4:.*]] = constant -2.76076837E-16 : f32 +// CHECK: %[[VAL_5:.*]] = constant 2.00018794E-13 : f32 +// CHECK: %[[VAL_6:.*]] = constant 
-8.60467184E-11 : f32 +// CHECK: %[[VAL_7:.*]] = constant 5.12229725E-8 : f32 +// CHECK: %[[VAL_8:.*]] = constant 1.48572235E-5 : f32 +// CHECK: %[[VAL_9:.*]] = constant 6.37261954E-4 : f32 +// CHECK: %[[VAL_10:.*]] = constant 0.00489352457 : f32 +// CHECK: %[[VAL_11:.*]] = constant 1.19825836E-6 : f32 +// CHECK: %[[VAL_12:.*]] = constant 1.18534706E-4 : f32 +// CHECK: %[[VAL_13:.*]] = constant 0.00226843474 : f32 +// CHECK: %[[VAL_14:.*]] = constant 0.00489352504 : f32 +// CHECK: %[[VAL_15:.*]] = absf %[[VAL_0]] : f32 +// CHECK: %[[VAL_16:.*]] = cmpf "olt", %[[VAL_15]], %[[VAL_1]] : f32 +// CHECK: %[[VAL_17:.*]] = cmpf "ule", %[[VAL_0]], %[[VAL_2]] : f32 +// CHECK: %[[VAL_18:.*]] = select %[[VAL_17]], %[[VAL_0]], %[[VAL_2]] : f32 +// CHECK: %[[VAL_19:.*]] = cmpf "uge", %[[VAL_18]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_20:.*]] = select %[[VAL_19]], %[[VAL_18]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_21:.*]] = mulf %[[VAL_20]], %[[VAL_20]] : f32 +// CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_23:.*]] = addf %[[VAL_22]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_24:.*]] = mulf %[[VAL_21]], %[[VAL_23]] : f32 +// CHECK: %[[VAL_25:.*]] = addf %[[VAL_24]], %[[VAL_6]] : f32 +// CHECK: %[[VAL_26:.*]] = mulf %[[VAL_21]], %[[VAL_25]] : f32 +// CHECK: %[[VAL_27:.*]] = addf %[[VAL_26]], %[[VAL_7]] : f32 +// CHECK: %[[VAL_28:.*]] = mulf %[[VAL_21]], %[[VAL_27]] : f32 +// CHECK: %[[VAL_29:.*]] = addf %[[VAL_28]], %[[VAL_8]] : f32 +// CHECK: %[[VAL_30:.*]] = mulf %[[VAL_21]], %[[VAL_29]] : f32 +// CHECK: %[[VAL_31:.*]] = addf %[[VAL_30]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_32:.*]] = mulf %[[VAL_21]], %[[VAL_31]] : f32 +// CHECK: %[[VAL_33:.*]] = addf %[[VAL_32]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_34:.*]] = mulf %[[VAL_20]], %[[VAL_33]] : f32 +// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_21]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_36:.*]] = addf %[[VAL_35]], %[[VAL_12]] : f32 +// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_21]], %[[VAL_36]] : f32 +// CHECK: %[[VAL_38:.*]] = addf %[[VAL_37]], %[[VAL_13]] : f32 +// CHECK: %[[VAL_39:.*]] = mulf %[[VAL_21]], %[[VAL_38]] : f32 +// CHECK: %[[VAL_40:.*]] = addf %[[VAL_39]], %[[VAL_14]] : f32 +// CHECK: %[[VAL_41:.*]] = divf %[[VAL_34]], %[[VAL_40]] : f32 +// CHECK: %[[VAL_42:.*]] = select %[[VAL_16]], %[[VAL_0]], %[[VAL_41]] : f32 +// CHECK: return %[[VAL_42]] : f32 + +// ----- + +func @tanh_f16(%arg0 : f16) -> f16 { + %res = tanh %arg0 : f16 + return %res : f16 +} + +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py + +// CHECK-LABEL: func @tanh_f16 +// CHECK-SAME: (%[[VAL_0:.*]]: f16) -> f16 +// CHECK: %[[VAL_1:.*]] = constant 4.000000e-04 : f32 +// CHECK: %[[VAL_2:.*]] = constant 7.90531111 : f32 +// CHECK: %[[VAL_3:.*]] = constant -7.90531111 : f32 +// CHECK: %[[VAL_4:.*]] = constant -2.76076837E-16 : f32 +// CHECK: %[[VAL_5:.*]] = constant 2.00018794E-13 : f32 +// CHECK: %[[VAL_6:.*]] = constant -8.60467184E-11 : f32 +// CHECK: %[[VAL_7:.*]] = constant 5.12229725E-8 : f32 +// CHECK: %[[VAL_8:.*]] = constant 1.48572235E-5 : f32 +// CHECK: %[[VAL_9:.*]] = constant 6.37261954E-4 : f32 +// CHECK: %[[VAL_10:.*]] = constant 0.00489352457 : f32 +// CHECK: %[[VAL_11:.*]] = constant 1.19825836E-6 : f32 +// CHECK: %[[VAL_12:.*]] = constant 1.18534706E-4 : f32 +// CHECK: %[[VAL_13:.*]] = constant 0.00226843474 : f32 +// CHECK: %[[VAL_14:.*]] = constant 0.00489352504 : f32 +// CHECK: %[[VAL_15:.*]] = fpext %[[VAL_0]] : f16 to f32 +// CHECK: %[[VAL_16:.*]] = absf %[[VAL_15]] : f32 +// CHECK: %[[VAL_17:.*]] = cmpf "olt", %[[VAL_16]], 
%[[VAL_1]] : f32 +// CHECK: %[[VAL_18:.*]] = cmpf "ule", %[[VAL_15]], %[[VAL_2]] : f32 +// CHECK: %[[VAL_19:.*]] = select %[[VAL_18]], %[[VAL_15]], %[[VAL_2]] : f32 +// CHECK: %[[VAL_20:.*]] = cmpf "uge", %[[VAL_19]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_21:.*]] = select %[[VAL_20]], %[[VAL_19]], %[[VAL_3]] : f32 +// CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_21]] : f32 +// CHECK: %[[VAL_23:.*]] = mulf %[[VAL_22]], %[[VAL_4]] : f32 +// CHECK: %[[VAL_24:.*]] = addf %[[VAL_23]], %[[VAL_5]] : f32 +// CHECK: %[[VAL_25:.*]] = mulf %[[VAL_22]], %[[VAL_24]] : f32 +// CHECK: %[[VAL_26:.*]] = addf %[[VAL_25]], %[[VAL_6]] : f32 +// CHECK: %[[VAL_27:.*]] = mulf %[[VAL_22]], %[[VAL_26]] : f32 +// CHECK: %[[VAL_28:.*]] = addf %[[VAL_27]], %[[VAL_7]] : f32 +// CHECK: %[[VAL_29:.*]] = mulf %[[VAL_22]], %[[VAL_28]] : f32 +// CHECK: %[[VAL_30:.*]] = addf %[[VAL_29]], %[[VAL_8]] : f32 +// CHECK: %[[VAL_31:.*]] = mulf %[[VAL_22]], %[[VAL_30]] : f32 +// CHECK: %[[VAL_32:.*]] = addf %[[VAL_31]], %[[VAL_9]] : f32 +// CHECK: %[[VAL_33:.*]] = mulf %[[VAL_22]], %[[VAL_32]] : f32 +// CHECK: %[[VAL_34:.*]] = addf %[[VAL_33]], %[[VAL_10]] : f32 +// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_21]], %[[VAL_34]] : f32 +// CHECK: %[[VAL_36:.*]] = mulf %[[VAL_22]], %[[VAL_11]] : f32 +// CHECK: %[[VAL_37:.*]] = addf %[[VAL_36]], %[[VAL_12]] : f32 +// CHECK: %[[VAL_38:.*]] = mulf %[[VAL_22]], %[[VAL_37]] : f32 +// CHECK: %[[VAL_39:.*]] = addf %[[VAL_38]], %[[VAL_13]] : f32 +// CHECK: %[[VAL_40:.*]] = mulf %[[VAL_22]], %[[VAL_39]] : f32 +// CHECK: %[[VAL_41:.*]] = addf %[[VAL_40]], %[[VAL_14]] : f32 +// CHECK: %[[VAL_42:.*]] = divf %[[VAL_35]], %[[VAL_41]] : f32 +// CHECK: %[[VAL_43:.*]] = select %[[VAL_17]], %[[VAL_15]], %[[VAL_42]] : f32 +// CHECK: %[[VAL_44:.*]] = fptrunc %[[VAL_43]] : f32 to f16 +// CHECK: return %[[VAL_44]] : f16 + +// ----- + +// CHECK-LABEL: @atan2_f64 +func @atan2_f64(%arg0 : f64, %arg1 : f64) -> f64 { + // CHECK: atan2 + %res = atan2 %arg0, %arg1 : f64 + return %res : f64 +} + +// ----- + +// CHECK-LABEL: func @atan2_f32 +// CHECK-SAME: (%[[ARG0:.*]]: f32, %[[ARG1:.*]]: f32) -> f32 +func @atan2_f32(%arg0 : f32, %arg1 : f32) -> f32 { + // CHECK: %[[CST:.*]] = constant 0.0027856871 : f32 + // CHECK: %[[CST_0:.*]] = constant -1.586600e-02 : f32 + // CHECK: %[[CST_1:.*]] = constant 0.042472221 : f32 + // CHECK: %[[CST_2:.*]] = constant -0.0749753043 : f32 + // CHECK: %[[CST_3:.*]] = constant 0.106448799 : f32 + // CHECK: %[[CST_4:.*]] = constant -0.142070308 : f32 + // CHECK: %[[CST_5:.*]] = constant 0.199934542 : f32 + // CHECK: %[[CST_6:.*]] = constant -0.333331466 : f32 + // CHECK: %[[CST_7:.*]] = constant 1.57079637 : f32 + // CHECK: %[[CST_8:.*]] = constant 0.000000e+00 : f32 + // CHECK: %[[CST_9:.*]] = constant 3.14159274 : f32 + // CHECK: %[[CST_10:.*]] = constant 0x7FC00000 : f32 + // CHECK: %[[CST_11:.*]] = constant 2.3561945 : f32 + // CHECK: %[[CST_12:.*]] = constant 0.785398185 : f32 + // CHECK: %[[CST_13:.*]] = constant 0x7F800000 : f32 + // CHECK: %[[VAL_0:.*]] = absf %[[ARG1]] : f32 + // CHECK: %[[VAL_1:.*]] = absf %[[ARG0]] : f32 + // CHECK: %[[VAL_2:.*]] = cmpf "ole", %[[VAL_0]], %[[VAL_1]] : f32 + // CHECK: %[[VAL_3:.*]] = select %[[VAL_2]], %[[VAL_0]], %[[VAL_1]] : f32 + // CHECK: %[[VAL_4:.*]] = select %[[VAL_2]], %[[VAL_1]], %[[VAL_0]] : f32 + // CHECK: %[[VAL_5:.*]] = divf %[[VAL_3]], %[[VAL_4]] : f32 + // CHECK: %[[VAL_6:.*]] = mulf %[[VAL_5]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_7:.*]] = mulf %[[CST]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_8:.*]] = addf %[[VAL_7]], %[[CST_0]] : f32 + 
// CHECK: %[[VAL_9:.*]] = mulf %[[VAL_8]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_10:.*]] = addf %[[VAL_9]], %[[CST_1]] : f32 + // CHECK: %[[VAL_11:.*]] = mulf %[[VAL_10]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_12:.*]] = addf %[[VAL_11]], %[[CST_2]] : f32 + // CHECK: %[[VAL_13:.*]] = mulf %[[VAL_12]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_14:.*]] = addf %[[VAL_13]], %[[CST_3]] : f32 + // CHECK: %[[VAL_15:.*]] = mulf %[[VAL_14]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_16:.*]] = addf %[[VAL_15]], %[[CST_4]] : f32 + // CHECK: %[[VAL_17:.*]] = mulf %[[VAL_16]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_18:.*]] = addf %[[VAL_17]], %[[CST_5]] : f32 + // CHECK: %[[VAL_19:.*]] = mulf %[[VAL_18]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_20:.*]] = addf %[[VAL_19]], %[[CST_6]] : f32 + // CHECK: %[[VAL_21:.*]] = mulf %[[VAL_20]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_23:.*]] = addf %[[VAL_22]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_24:.*]] = subf %[[CST_7]], %[[VAL_23]] : f32 + // CHECK: %[[VAL_25:.*]] = select %[[VAL_2]], %[[VAL_24]], %[[VAL_23]] : f32 + // CHECK: %[[VAL_26:.*]] = cmpf "olt", %[[ARG1]], %[[CST_8]] : f32 + // CHECK: %[[VAL_27:.*]] = subf %[[CST_9]], %[[VAL_25]] : f32 + // CHECK: %[[VAL_28:.*]] = select %[[VAL_26]], %[[VAL_27]], %[[VAL_25]] : f32 + // CHECK: %[[VAL_29:.*]] = select %[[VAL_26]], %[[CST_9]], %[[CST_8]] : f32 + // CHECK: %[[VAL_30:.*]] = cmpf "oeq", %[[ARG0]], %[[CST_8]] : f32 + // CHECK: %[[VAL_31:.*]] = select %[[VAL_30]], %[[VAL_29]], %[[VAL_28]] : f32 + // CHECK: %[[VAL_32:.*]] = cmpf "uno", %[[ARG0]], %[[ARG1]] : f32 + // CHECK: %[[VAL_35:.*]] = select %[[VAL_32]], %[[CST_10]], %[[VAL_31]] : f32 + // CHECK: %[[VAL_36:.*]] = select %[[VAL_26]], %[[CST_11]], %[[CST_12]] : f32 + // CHECK: %[[VAL_37:.*]] = cmpf "oeq", %[[ARG1]], %[[CST_13]] : f32 + // CHECK: %[[VAL_38:.*]] = cmpf "oeq", %[[ARG0]], %[[CST_13]] : f32 + // CHECK: %[[VAL_39:.*]] = and %[[VAL_37]], %[[VAL_38]] : i1 + // CHECK: %[[VAL_40:.*]] = select %[[VAL_39]], %[[VAL_36]], %[[VAL_35]] : f32 + // CHECK: %[[VAL_41:.*]] = copysign %[[VAL_40]], %[[ARG0]] : f32 + // CHECK: return %[[VAL_41]] : f32 + %res = atan2 %arg0, %arg1 : f32 + return %res : f32 +} + +// ----- + +// CHECK-LABEL: @atan2_f16 +// CHECK-SAME: (%[[ARG0:.*]]: f16, %[[ARG1:.*]]: f16) -> f16 +func @atan2_f16(%arg0 : f16, %arg1 : f16) -> f16 { + // CHECK: %[[CST:.*]] = constant 0.0027856871 : f32 + // CHECK: %[[CST_0:.*]] = constant -1.586600e-02 : f32 + // CHECK: %[[CST_1:.*]] = constant 0.042472221 : f32 + // CHECK: %[[CST_2:.*]] = constant -0.0749753043 : f32 + // CHECK: %[[CST_3:.*]] = constant 0.106448799 : f32 + // CHECK: %[[CST_4:.*]] = constant -0.142070308 : f32 + // CHECK: %[[CST_5:.*]] = constant 0.199934542 : f32 + // CHECK: %[[CST_6:.*]] = constant -0.333331466 : f32 + // CHECK: %[[CST_7:.*]] = constant 1.57079637 : f32 + // CHECK: %[[CST_8:.*]] = constant 0.000000e+00 : f32 + // CHECK: %[[CST_9:.*]] = constant 3.14159274 : f32 + // CHECK: %[[CST_10:.*]] = constant 0x7FC00000 : f32 + // CHECK: %[[CST_11:.*]] = constant 2.3561945 : f32 + // CHECK: %[[CST_12:.*]] = constant 0.785398185 : f32 + // CHECK: %[[CST_13:.*]] = constant 0x7F800000 : f32 + // CHECK: %[[VAL_0:.*]] = fpext %[[ARG0]] : f16 to f32 + // CHECK: %[[VAL_1:.*]] = fpext %[[ARG1]] : f16 to f32 + // CHECK: %[[VAL_2:.*]] = absf %[[VAL_1]] : f32 + // CHECK: %[[VAL_3:.*]] = absf %[[VAL_0]] : f32 + // CHECK: %[[VAL_4:.*]] = cmpf "ole", %[[VAL_2]], %[[VAL_3]] : f32 + // CHECK: %[[VAL_5:.*]] = select %[[VAL_4]], %[[VAL_2]], %[[VAL_3]] : f32 
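+  // The select above picks min(|x|, |y|) and the one below picks max(|x|, |y|),
+  // so the divf that follows forms t = min/max in [0, 1]. The CHECK lines after
+  // that trace atan(t) ~= t + t^3 * q(t^2), with the first eight constants above
+  // as the Horner coefficients of q, followed by the pi/2 and pi quadrant
+  // corrections and the final copysign with the original y operand.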
+ // CHECK: %[[VAL_6:.*]] = select %[[VAL_4]], %[[VAL_3]], %[[VAL_2]] : f32 + // CHECK: %[[VAL_7:.*]] = divf %[[VAL_5]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_8:.*]] = mulf %[[VAL_7]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_9:.*]] = mulf %[[CST]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_10:.*]] = addf %[[VAL_9]], %[[CST_0]] : f32 + // CHECK: %[[VAL_11:.*]] = mulf %[[VAL_10]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_12:.*]] = addf %[[VAL_11]], %[[CST_1]] : f32 + // CHECK: %[[VAL_13:.*]] = mulf %[[VAL_12]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_14:.*]] = addf %[[VAL_13]], %[[CST_2]] : f32 + // CHECK: %[[VAL_15:.*]] = mulf %[[VAL_14]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_16:.*]] = addf %[[VAL_15]], %[[CST_3]] : f32 + // CHECK: %[[VAL_17:.*]] = mulf %[[VAL_16]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_18:.*]] = addf %[[VAL_17]], %[[CST_4]] : f32 + // CHECK: %[[VAL_19:.*]] = mulf %[[VAL_18]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_20:.*]] = addf %[[VAL_19]], %[[CST_5]] : f32 + // CHECK: %[[VAL_21:.*]] = mulf %[[VAL_20]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_22:.*]] = addf %[[VAL_21]], %[[CST_6]] : f32 + // CHECK: %[[VAL_23:.*]] = mulf %[[VAL_22]], %[[VAL_8]] : f32 + // CHECK: %[[VAL_24:.*]] = mulf %[[VAL_23]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_25:.*]] = addf %[[VAL_24]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_26:.*]] = subf %[[CST_7]], %[[VAL_25]] : f32 + // CHECK: %[[VAL_27:.*]] = select %[[VAL_4]], %[[VAL_26]], %[[VAL_25]] : f32 + // CHECK: %[[VAL_28:.*]] = cmpf "olt", %[[VAL_1]], %[[CST_8]] : f32 + // CHECK: %[[VAL_29:.*]] = subf %[[CST_9]], %[[VAL_27]] : f32 + // CHECK: %[[VAL_30:.*]] = select %[[VAL_28]], %[[VAL_29]], %[[VAL_27]] : f32 + // CHECK: %[[VAL_31:.*]] = select %[[VAL_28]], %[[CST_9]], %[[CST_8]] : f32 + // CHECK: %[[VAL_32:.*]] = cmpf "oeq", %[[VAL_0]], %[[CST_8]] : f32 + // CHECK: %[[VAL_33:.*]] = select %[[VAL_32]], %[[VAL_31]], %[[VAL_30]] : f32 + // CHECK: %[[VAL_34:.*]] = cmpf "uno", %[[VAL_0]], %[[VAL_1]] : f32 + // CHECK: %[[VAL_37:.*]] = select %[[VAL_34]], %[[CST_10]], %[[VAL_33]] : f32 + // CHECK: %[[VAL_38:.*]] = select %[[VAL_28]], %[[CST_11]], %[[CST_12]] : f32 + // CHECK: %[[VAL_39:.*]] = cmpf "oeq", %[[VAL_1]], %[[CST_13]] : f32 + // CHECK: %[[VAL_40:.*]] = cmpf "oeq", %[[VAL_0]], %[[CST_13]] : f32 + // CHECK: %[[VAL_41:.*]] = and %[[VAL_39]], %[[VAL_40]] : i1 + // CHECK: %[[VAL_42:.*]] = select %[[VAL_41]], %[[VAL_38]], %[[VAL_37]] : f32 + // CHECK: %[[VAL_43:.*]] = copysign %[[VAL_42]], %[[VAL_0]] : f32 + // CHECK: %[[VAL_44:.*]] = fptrunc %[[VAL_43]] : f32 to f16 + // CHECK: return %[[VAL_44]] : f16 + %res = atan2 %arg0, %arg1 : f16 + return %res : f16 +} + +// ----- + +// CHECK-LABEL: @atan_f64 +func @atan_f64(%arg : f64) -> f64 { + // CHECK: atan + %res = atan %arg : f64 + return %res : f64 +} + +// ----- + +// CHECK-LABEL: func @atan_f32 +// CHECK-SAME: (%[[ARG:.*]]: f32) -> f32 +func @atan_f32(%arg : f32) -> f32 { + // CHECK: %[[CST:.*]] = constant 1.000000e+00 : f32 + // CHECK: %[[CST_0:.*]] = constant 0.0027856871 : f32 + // CHECK: %[[CST_1:.*]] = constant -1.586600e-02 : f32 + // CHECK: %[[CST_2:.*]] = constant 0.042472221 : f32 + // CHECK: %[[CST_3:.*]] = constant -0.0749753043 : f32 + // CHECK: %[[CST_4:.*]] = constant 0.106448799 : f32 + // CHECK: %[[CST_5:.*]] = constant -0.142070308 : f32 + // CHECK: %[[CST_6:.*]] = constant 0.199934542 : f32 + // CHECK: %[[CST_7:.*]] = constant -0.333331466 : f32 + // CHECK: %[[CST_8:.*]] = constant 1.57079637 : f32 + // CHECK: %[[CST_9:.*]] = constant 0.000000e+00 : f32 + // CHECK: %[[CST_10:.*]] = constant 0x7FC00000 : f32 + // CHECK: 
%[[VAL_0:.*]] = absf %[[CST]] : f32 + // CHECK: %[[VAL_1:.*]] = absf %arg0 : f32 + // CHECK: %[[VAL_2:.*]] = cmpf "ole", %[[VAL_0]], %[[VAL_1]] : f32 + // CHECK: %[[VAL_3:.*]] = select %[[VAL_2]], %[[VAL_0]], %[[VAL_1]] : f32 + // CHECK: %[[VAL_4:.*]] = select %[[VAL_2]], %[[VAL_1]], %[[VAL_0]] : f32 + // CHECK: %[[VAL_5:.*]] = divf %[[VAL_3]], %[[VAL_4]] : f32 + // CHECK: %[[VAL_6:.*]] = mulf %[[VAL_5]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_7:.*]] = mulf %[[CST_0]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_8:.*]] = addf %[[VAL_7]], %[[CST_1]] : f32 + // CHECK: %[[VAL_9:.*]] = mulf %[[VAL_8]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_10:.*]] = addf %[[VAL_9]], %[[CST_2]] : f32 + // CHECK: %[[VAL_11:.*]] = mulf %[[VAL_10]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_12:.*]] = addf %[[VAL_11]], %[[CST_3]] : f32 + // CHECK: %[[VAL_13:.*]] = mulf %[[VAL_12]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_14:.*]] = addf %[[VAL_13]], %[[CST_4]] : f32 + // CHECK: %[[VAL_15:.*]] = mulf %[[VAL_14]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_16:.*]] = addf %[[VAL_15]], %[[CST_5]] : f32 + // CHECK: %[[VAL_17:.*]] = mulf %[[VAL_16]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_18:.*]] = addf %[[VAL_17]], %[[CST_6]] : f32 + // CHECK: %[[VAL_19:.*]] = mulf %[[VAL_18]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_20:.*]] = addf %[[VAL_19]], %[[CST_7]] : f32 + // CHECK: %[[VAL_21:.*]] = mulf %[[VAL_20]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_23:.*]] = addf %[[VAL_22]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_24:.*]] = subf %[[CST_8]], %[[VAL_23]] : f32 + // CHECK: %[[VAL_25:.*]] = select %[[VAL_2]], %[[VAL_24]], %[[VAL_23]] : f32 + // CHECK: %[[VAL_26:.*]] = cmpf "oeq", %arg0, %[[CST_9]] : f32 + // CHECK: %[[VAL_27:.*]] = select %[[VAL_26]], %[[CST_9]], %[[VAL_25]] : f32 + // CHECK: %[[VAL_28:.*]] = cmpf "uno", %arg0, %[[CST]] : f32 + // CHECK: %[[VAL_29:.*]] = select %[[VAL_28]], %[[CST_10]], %[[VAL_27]] : f32 + // CHECK: %[[VAL_30:.*]] = copysign %[[VAL_29]], %arg0 : f32 + // CHECK: return %[[VAL_30]] : f32 + %res = atan %arg : f32 + return %res : f32 +} + +// ----- + +// CHECK-LABEL: @atan_f16 +// CHECK-SAME: (%[[ARG:.*]]: f16) -> f16 +func @atan_f16(%arg : f16) -> f16 { + // CHECK: %[[CST:.*]] = constant 1.000000e+00 : f32 + // CHECK: %[[CST_0:.*]] = constant 0.0027856871 : f32 + // CHECK: %[[CST_1:.*]] = constant -1.586600e-02 : f32 + // CHECK: %[[CST_2:.*]] = constant 0.042472221 : f32 + // CHECK: %[[CST_3:.*]] = constant -0.0749753043 : f32 + // CHECK: %[[CST_4:.*]] = constant 0.106448799 : f32 + // CHECK: %[[CST_5:.*]] = constant -0.142070308 : f32 + // CHECK: %[[CST_6:.*]] = constant 0.199934542 : f32 + // CHECK: %[[CST_7:.*]] = constant -0.333331466 : f32 + // CHECK: %[[CST_8:.*]] = constant 1.57079637 : f32 + // CHECK: %[[CST_9:.*]] = constant 0.000000e+00 : f32 + // CHECK: %[[CST_10:.*]] = constant 0x7FC00000 : f32 + // CHECK: %[[VAL_0:.*]] = fpext %arg0 : f16 to f32 + // CHECK: %[[VAL_1:.*]] = absf %[[CST]] : f32 + // CHECK: %[[VAL_2:.*]] = absf %[[VAL_0]] : f32 + // CHECK: %[[VAL_3:.*]] = cmpf "ole", %[[VAL_1]], %[[VAL_2]] : f32 + // CHECK: %[[VAL_4:.*]] = select %[[VAL_3]], %[[VAL_1]], %[[VAL_2]] : f32 + // CHECK: %[[VAL_5:.*]] = select %[[VAL_3]], %[[VAL_2]], %[[VAL_1]] : f32 + // CHECK: %[[VAL_6:.*]] = divf %[[VAL_4]], %[[VAL_5]] : f32 + // CHECK: %[[VAL_7:.*]] = mulf %[[VAL_6]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_8:.*]] = mulf %[[CST_0]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_9:.*]] = addf %[[VAL_8]], %[[CST_1]] : f32 + // CHECK: %[[VAL_10:.*]] = mulf %[[VAL_9]], %[[VAL_7]] : f32 
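+  // Horner evaluation of the same odd atan approximation as above: the
+  // accumulator is repeatedly multiplied by t^2 (%[[VAL_7]]) and the next
+  // coefficient is added, before the final multiply by t (%[[VAL_6]]) and the
+  // pi/2 fold-over applied when 1 <= |x|.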
+ // CHECK: %[[VAL_11:.*]] = addf %[[VAL_10]], %[[CST_2]] : f32 + // CHECK: %[[VAL_12:.*]] = mulf %[[VAL_11]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_13:.*]] = addf %[[VAL_12]], %[[CST_3]] : f32 + // CHECK: %[[VAL_14:.*]] = mulf %[[VAL_13]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_15:.*]] = addf %[[VAL_14]], %[[CST_4]] : f32 + // CHECK: %[[VAL_16:.*]] = mulf %[[VAL_15]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_17:.*]] = addf %[[VAL_16]], %[[CST_5]] : f32 + // CHECK: %[[VAL_18:.*]] = mulf %[[VAL_17]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_19:.*]] = addf %[[VAL_18]], %[[CST_6]] : f32 + // CHECK: %[[VAL_20:.*]] = mulf %[[VAL_19]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_21:.*]] = addf %[[VAL_20]], %[[CST_7]] : f32 + // CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_7]] : f32 + // CHECK: %[[VAL_23:.*]] = mulf %[[VAL_22]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_24:.*]] = addf %[[VAL_23]], %[[VAL_6]] : f32 + // CHECK: %[[VAL_25:.*]] = subf %[[CST_8]], %[[VAL_24]] : f32 + // CHECK: %[[VAL_26:.*]] = select %[[VAL_3]], %[[VAL_25]], %[[VAL_24]] : f32 + // CHECK: %[[VAL_27:.*]] = cmpf "oeq", %[[VAL_0]], %[[CST_9]] : f32 + // CHECK: %[[VAL_28:.*]] = select %[[VAL_27]], %[[CST_9]], %[[VAL_26]] : f32 + // CHECK: %[[VAL_29:.*]] = cmpf "uno", %[[VAL_0]], %[[CST]] : f32 + // CHECK: %[[VAL_30:.*]] = select %[[VAL_29]], %[[CST_10]], %[[VAL_28]] : f32 + // CHECK: %[[VAL_31:.*]] = copysign %[[VAL_30]], %[[VAL_0]] : f32 + // CHECK: %[[VAL_32:.*]] = fptrunc %[[VAL_31]] : f32 to f16 + // CHECK: return %[[VAL_32]] : f16 + %res = atan %arg : f16 + return %res : f16 +} diff --git a/tensorflow/compiler/mlir/hlo/tests/legalize_tanh_to_approximation.mlir b/tensorflow/compiler/mlir/hlo/tests/legalize_tanh_to_approximation.mlir deleted file mode 100644 index aa834d36ac4..00000000000 --- a/tensorflow/compiler/mlir/hlo/tests/legalize_tanh_to_approximation.mlir +++ /dev/null @@ -1,125 +0,0 @@ -// RUN: mlir-hlo-opt -mhlo-legalize-tanh-to-approximation -split-input-file %s | FileCheck %s - -func @tanh_f64(%arg0 : f64) -> f64 { - %res = tanh %arg0 : f64 - return %res : f64 -} - -// CHECK-LABEL: @tanh_f64 -// CHECK: tanh - -// ----- - -func @tanh_f32(%arg0 : f32) -> f32 { - %res = tanh %arg0 : f32 - return %res : f32 -} - -// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -// CHECK-LABEL: func @tanh_f32( -// CHECK-SAME: %[[VAL_0:.*]]: f32) -> f32 { -// CHECK: %[[VAL_1:.*]] = constant 4.000000e-04 : f32 -// CHECK: %[[VAL_2:.*]] = constant 7.90531111 : f32 -// CHECK: %[[VAL_3:.*]] = constant -7.90531111 : f32 -// CHECK: %[[VAL_4:.*]] = constant -2.76076837E-16 : f32 -// CHECK: %[[VAL_5:.*]] = constant 2.00018794E-13 : f32 -// CHECK: %[[VAL_6:.*]] = constant -8.60467184E-11 : f32 -// CHECK: %[[VAL_7:.*]] = constant 5.12229725E-8 : f32 -// CHECK: %[[VAL_8:.*]] = constant 1.48572235E-5 : f32 -// CHECK: %[[VAL_9:.*]] = constant 6.37261954E-4 : f32 -// CHECK: %[[VAL_10:.*]] = constant 0.00489352457 : f32 -// CHECK: %[[VAL_11:.*]] = constant 1.19825836E-6 : f32 -// CHECK: %[[VAL_12:.*]] = constant 1.18534706E-4 : f32 -// CHECK: %[[VAL_13:.*]] = constant 0.00226843474 : f32 -// CHECK: %[[VAL_14:.*]] = constant 0.00489352504 : f32 -// CHECK: %[[VAL_15:.*]] = absf %[[VAL_0]] : f32 -// CHECK: %[[VAL_16:.*]] = cmpf "olt", %[[VAL_15]], %[[VAL_1]] : f32 -// CHECK: %[[VAL_17:.*]] = cmpf "ule", %[[VAL_0]], %[[VAL_2]] : f32 -// CHECK: %[[VAL_18:.*]] = select %[[VAL_17]], %[[VAL_0]], %[[VAL_2]] : f32 -// CHECK: %[[VAL_19:.*]] = cmpf "uge", %[[VAL_18]], %[[VAL_3]] : f32 -// CHECK: %[[VAL_20:.*]] = select %[[VAL_19]], %[[VAL_18]], 
%[[VAL_3]] : f32 -// CHECK: %[[VAL_21:.*]] = mulf %[[VAL_20]], %[[VAL_20]] : f32 -// CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_4]] : f32 -// CHECK: %[[VAL_23:.*]] = addf %[[VAL_22]], %[[VAL_5]] : f32 -// CHECK: %[[VAL_24:.*]] = mulf %[[VAL_21]], %[[VAL_23]] : f32 -// CHECK: %[[VAL_25:.*]] = addf %[[VAL_24]], %[[VAL_6]] : f32 -// CHECK: %[[VAL_26:.*]] = mulf %[[VAL_21]], %[[VAL_25]] : f32 -// CHECK: %[[VAL_27:.*]] = addf %[[VAL_26]], %[[VAL_7]] : f32 -// CHECK: %[[VAL_28:.*]] = mulf %[[VAL_21]], %[[VAL_27]] : f32 -// CHECK: %[[VAL_29:.*]] = addf %[[VAL_28]], %[[VAL_8]] : f32 -// CHECK: %[[VAL_30:.*]] = mulf %[[VAL_21]], %[[VAL_29]] : f32 -// CHECK: %[[VAL_31:.*]] = addf %[[VAL_30]], %[[VAL_9]] : f32 -// CHECK: %[[VAL_32:.*]] = mulf %[[VAL_21]], %[[VAL_31]] : f32 -// CHECK: %[[VAL_33:.*]] = addf %[[VAL_32]], %[[VAL_10]] : f32 -// CHECK: %[[VAL_34:.*]] = mulf %[[VAL_20]], %[[VAL_33]] : f32 -// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_21]], %[[VAL_11]] : f32 -// CHECK: %[[VAL_36:.*]] = addf %[[VAL_35]], %[[VAL_12]] : f32 -// CHECK: %[[VAL_37:.*]] = mulf %[[VAL_21]], %[[VAL_36]] : f32 -// CHECK: %[[VAL_38:.*]] = addf %[[VAL_37]], %[[VAL_13]] : f32 -// CHECK: %[[VAL_39:.*]] = mulf %[[VAL_21]], %[[VAL_38]] : f32 -// CHECK: %[[VAL_40:.*]] = addf %[[VAL_39]], %[[VAL_14]] : f32 -// CHECK: %[[VAL_41:.*]] = divf %[[VAL_34]], %[[VAL_40]] : f32 -// CHECK: %[[VAL_42:.*]] = select %[[VAL_16]], %[[VAL_0]], %[[VAL_41]] : f32 -// CHECK: return %[[VAL_42]] : f32 -// CHECK: } - -// ----- - -func @tanh_f16(%arg0 : f16) -> f16 { - %res = tanh %arg0 : f16 - return %res : f16 -} - -// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -// CHECK-LABEL: func @tanh_f16( -// CHECK-SAME: %[[VAL_0:.*]]: f16) -> f16 { -// CHECK: %[[VAL_1:.*]] = constant 4.000000e-04 : f32 -// CHECK: %[[VAL_2:.*]] = constant 7.90531111 : f32 -// CHECK: %[[VAL_3:.*]] = constant -7.90531111 : f32 -// CHECK: %[[VAL_4:.*]] = constant -2.76076837E-16 : f32 -// CHECK: %[[VAL_5:.*]] = constant 2.00018794E-13 : f32 -// CHECK: %[[VAL_6:.*]] = constant -8.60467184E-11 : f32 -// CHECK: %[[VAL_7:.*]] = constant 5.12229725E-8 : f32 -// CHECK: %[[VAL_8:.*]] = constant 1.48572235E-5 : f32 -// CHECK: %[[VAL_9:.*]] = constant 6.37261954E-4 : f32 -// CHECK: %[[VAL_10:.*]] = constant 0.00489352457 : f32 -// CHECK: %[[VAL_11:.*]] = constant 1.19825836E-6 : f32 -// CHECK: %[[VAL_12:.*]] = constant 1.18534706E-4 : f32 -// CHECK: %[[VAL_13:.*]] = constant 0.00226843474 : f32 -// CHECK: %[[VAL_14:.*]] = constant 0.00489352504 : f32 -// CHECK: %[[VAL_15:.*]] = fpext %[[VAL_0]] : f16 to f32 -// CHECK: %[[VAL_16:.*]] = absf %[[VAL_15]] : f32 -// CHECK: %[[VAL_17:.*]] = cmpf "olt", %[[VAL_16]], %[[VAL_1]] : f32 -// CHECK: %[[VAL_18:.*]] = cmpf "ule", %[[VAL_15]], %[[VAL_2]] : f32 -// CHECK: %[[VAL_19:.*]] = select %[[VAL_18]], %[[VAL_15]], %[[VAL_2]] : f32 -// CHECK: %[[VAL_20:.*]] = cmpf "uge", %[[VAL_19]], %[[VAL_3]] : f32 -// CHECK: %[[VAL_21:.*]] = select %[[VAL_20]], %[[VAL_19]], %[[VAL_3]] : f32 -// CHECK: %[[VAL_22:.*]] = mulf %[[VAL_21]], %[[VAL_21]] : f32 -// CHECK: %[[VAL_23:.*]] = mulf %[[VAL_22]], %[[VAL_4]] : f32 -// CHECK: %[[VAL_24:.*]] = addf %[[VAL_23]], %[[VAL_5]] : f32 -// CHECK: %[[VAL_25:.*]] = mulf %[[VAL_22]], %[[VAL_24]] : f32 -// CHECK: %[[VAL_26:.*]] = addf %[[VAL_25]], %[[VAL_6]] : f32 -// CHECK: %[[VAL_27:.*]] = mulf %[[VAL_22]], %[[VAL_26]] : f32 -// CHECK: %[[VAL_28:.*]] = addf %[[VAL_27]], %[[VAL_7]] : f32 -// CHECK: %[[VAL_29:.*]] = mulf %[[VAL_22]], %[[VAL_28]] : f32 -// CHECK: %[[VAL_30:.*]] = addf 
%[[VAL_29]], %[[VAL_8]] : f32 -// CHECK: %[[VAL_31:.*]] = mulf %[[VAL_22]], %[[VAL_30]] : f32 -// CHECK: %[[VAL_32:.*]] = addf %[[VAL_31]], %[[VAL_9]] : f32 -// CHECK: %[[VAL_33:.*]] = mulf %[[VAL_22]], %[[VAL_32]] : f32 -// CHECK: %[[VAL_34:.*]] = addf %[[VAL_33]], %[[VAL_10]] : f32 -// CHECK: %[[VAL_35:.*]] = mulf %[[VAL_21]], %[[VAL_34]] : f32 -// CHECK: %[[VAL_36:.*]] = mulf %[[VAL_22]], %[[VAL_11]] : f32 -// CHECK: %[[VAL_37:.*]] = addf %[[VAL_36]], %[[VAL_12]] : f32 -// CHECK: %[[VAL_38:.*]] = mulf %[[VAL_22]], %[[VAL_37]] : f32 -// CHECK: %[[VAL_39:.*]] = addf %[[VAL_38]], %[[VAL_13]] : f32 -// CHECK: %[[VAL_40:.*]] = mulf %[[VAL_22]], %[[VAL_39]] : f32 -// CHECK: %[[VAL_41:.*]] = addf %[[VAL_40]], %[[VAL_14]] : f32 -// CHECK: %[[VAL_42:.*]] = divf %[[VAL_35]], %[[VAL_41]] : f32 -// CHECK: %[[VAL_43:.*]] = select %[[VAL_17]], %[[VAL_15]], %[[VAL_42]] : f32 -// CHECK: %[[VAL_44:.*]] = fptrunc %[[VAL_43]] : f32 to f16 -// CHECK: return %[[VAL_44]] : f16 -// CHECK: } - - diff --git a/tensorflow/compiler/mlir/hlo/tests/legalize_to_scf.mlir b/tensorflow/compiler/mlir/hlo/tests/legalize_to_scf.mlir new file mode 100644 index 00000000000..9c887a73a0f --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/legalize_to_scf.mlir @@ -0,0 +1,38 @@ +// RUN: mlir-hlo-opt --mhlo-control-flow-to-scf %s | FileCheck %s + +func @lt_loop(%arg0: tensor<4xf32>, %arg1: tensor, %arg2: tensor, %arg3: tensor<4xf32>, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor) -> (tuple, tensor, tensor>) { + %cst = constant dense<-1> : tensor + %cst_0 = constant dense<1> : tensor + %cst_1 = constant dense<0> : tensor + %cst_2 = constant dense<1000> : tensor + %0 = "mhlo.tuple"(%cst_1, %cst, %cst_2) : (tensor, tensor, tensor) -> tuple, tensor, tensor> + %1 = "mhlo.while"(%0) ( { + ^bb0(%arg9: tuple, tensor, tensor>): // no predecessors + %2 = "mhlo.get_tuple_element"(%arg9) {index = 0 : i32} : (tuple, tensor, tensor>) -> tensor + %3 = "mhlo.get_tuple_element"(%arg9) {index = 2 : i32} : (tuple, tensor, tensor>) -> tensor + %4 = "mhlo.compare"(%2, %3) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "mhlo.return"(%4) : (tensor) -> () + }, { + ^bb0(%arg9: tuple, tensor, tensor>): // no predecessors + %2 = "mhlo.get_tuple_element"(%arg9) {index = 0 : i32} : (tuple, tensor, tensor>) -> tensor + %3 = mhlo.add %2, %cst_0 : tensor + %4 = "mhlo.get_tuple_element"(%arg9) {index = 1 : i32} : (tuple, tensor, tensor>) -> tensor + %5 = "mhlo.get_tuple_element"(%arg9) {index = 2 : i32} : (tuple, tensor, tensor>) -> tensor + %6 = "mhlo.tuple"(%3, %4, %5) : (tensor, tensor, tensor) -> tuple, tensor, tensor> + "mhlo.return"(%6) : (tuple, tensor, tensor>) -> () + }) : (tuple, tensor, tensor>) -> tuple, tensor, tensor> + return %1 : tuple, tensor, tensor> +} + +// CHECK-LABEL: func @lt_loop( +// CHECK: %[[VAL_9:.*]] = constant dense<-1> : tensor +// CHECK: %[[VAL_10:.*]] = constant dense<1> : tensor +// CHECK: %[[VAL_11:.*]] = constant dense<0> : tensor +// CHECK: %[[VAL_12:.*]] = constant dense<1000> : tensor +// CHECK: %[[VAL_14:.*]] = index_cast %[[VAL_11]] : tensor to tensor +// CHECK: %[[VAL_15:.*]] = extract_element %[[VAL_14]][] : tensor +// CHECK: %[[VAL_16:.*]] = index_cast %[[VAL_12]] : tensor to tensor +// CHECK: %[[VAL_17:.*]] = extract_element %[[VAL_16]][] : tensor +// CHECK: %[[VAL_18:.*]] = index_cast %[[VAL_10]] : tensor to tensor +// CHECK: %[[VAL_19:.*]] = extract_element %[[VAL_18]][] : tensor +// CHECK: scf.for %[[VAL_21:.*]] = %[[VAL_15]] to %[[VAL_17]] step %[[VAL_19]] 
iter_args(%[[VAL_22:.*]] = %[[VAL_9]], %[[VAL_23:.*]] = %[[VAL_12]]) diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir deleted file mode 100644 index 3271595900d..00000000000 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-copy-removal.mlir +++ /dev/null @@ -1,115 +0,0 @@ -// RUN: mlir-hlo-opt -lhlo-copy-removal %s -o - | FileCheck %s - -// CHECK-LABEL: func @remove_simple -func @remove_simple(%arg0: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - "lmhlo.copy"(%0, %arg0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @remove_without_dealloc -func @remove_without_dealloc(%arg0: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - "lmhlo.copy"(%0, %arg0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @replace_dependency -func @replace_dependency(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @keep_copies -func @keep_copies(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { - // CHECK-NEXT: "lmhlo.copy"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%arg0, %arg1) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.terminator"() : () -> () - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @must_not_be_removed -func @must_not_be_removed(%arg0: memref<2x2xf32>, - %arg1: memref<2x2xf32>, - %arg2: memref<2x2xf32>) { - // CHECK-NEXT: %[[ALLOC:.*]] = alloc() {temp = true} : memref<2x2xf32> - %0 = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %[[ALLOC]]) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.copy"(%[[ALLOC]], %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @must_be_removed_first -func @must_be_removed_first(%arg0: memref<2x2xf32>, - %arg1: memref<2x2xf32>, - %arg2: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func 
@must_be_removed_second -func @must_be_removed_second(%arg0: memref<2x2xf32>, - %arg1: memref<2x2xf32>, - %arg2: memref<2x2xf32>) { - %0 = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: "lmhlo.exponential"(%arg0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg0, %0) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - // CHECK-NEXT: "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - "lmhlo.exponential"(%arg1, %arg2) : (memref<2x2xf32>, memref<2x2xf32>) -> () - dealloc %0 : memref<2x2xf32> - "lmhlo.terminator"() : () -> () -} - -// ----- - -// CHECK-LABEL: func @reduce -func @reduce(%arg0: memref<1x8xf32>, %arg1: memref, %arg2: memref<1xf32>) { - %0 = alloc() : memref<1xf32> - "lmhlo.reduce"(%arg0, %arg1, %0) ( { - // CHECK: ^bb0(%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, - // CHECK-SAME: %[[ARG2:.*]]: memref) - ^bb0(%arg3: memref, %arg4: memref, %arg5: memref): - %1 = alloc() : memref - // CHECK: "lmhlo.add"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) - "lmhlo.add"(%arg3, %arg4, %1) - : (memref, memref, memref) -> () - // CHECK-NOT; lmhlo.copy - "lmhlo.copy"(%1, %arg5) : (memref, memref) -> () - "lmhlo.terminator"() : () -> () - }) {dimensions = dense<1> : tensor<1xi64>} - : (memref<1x8xf32>, memref, memref<1xf32>) -> () - "lmhlo.copy"(%0, %arg2) : (memref<1xf32>, memref<1xf32>) -> () - return -} diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-fuse-linalg.mlir index 6a674664a36..e51bdfec6f7 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-fuse-linalg.mlir @@ -3,20 +3,25 @@ // RUN: mlir-hlo-opt -lhlo-fuse-linalg=use-parallel-loops %s -split-input-file | FileCheck %s -check-prefix=PLOOP #map0 = affine_map<(d0, d1) -> (d0, d1)> -#pointwise_2d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]} +#pointwise_2d_trait = {indexing_maps = [#map0, #map0, #map0], + iterator_types = ["parallel", "parallel"]} func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, %summand_2: memref<6x6xf32>, %result: memref<6x6xf32>) { %temp_result = alloc() : memref<6x6xf32> - linalg.generic #pointwise_2d_trait %summand_1, %summand_2, %temp_result { + linalg.generic #pointwise_2d_trait + ins(%summand_1, %summand_2 : memref<6x6xf32>, memref<6x6xf32>) + outs(%temp_result : memref<6x6xf32>) { ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): %out = addf %summand_1_in, %summand_2_in : f32 linalg.yield %out : f32 - } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> - linalg.generic #pointwise_2d_trait %temp_result, %multiplier, %result { + } + linalg.generic #pointwise_2d_trait + ins(%temp_result, %multiplier : memref<6x6xf32>, memref<6x6xf32>) + outs(%result : memref<6x6xf32>) { ^bb0(%temp_result_in: f32, %multiplier_in: f32, %result_in: f32): %out = mulf %temp_result_in, %multiplier_in : f32 linalg.yield %out : f32 - } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> + } dealloc %temp_result : memref<6x6xf32> return } @@ -59,36 +64,37 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, %arg2: memref<100x10xf32>) { %0 = alloc() : memref<100x10xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, - indexing_maps = [affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } %arg1, %0 { - ^bb0(%arg3: f32, 
%arg4: f32): // no predecessors - linalg.yield %arg3 : f32 - }: memref<100xf32>, memref<100x10xf32> + indexing_maps = [affine_map<(d0, d1) -> (d0)>, + affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%arg1 : memref<100xf32>) + outs(%0 : memref<100x10xf32>) { + ^bb0(%arg3: f32, %arg4: f32): // no predecessors + linalg.yield %arg3 : f32 + } %1 = alloc() : memref<100x10xf32> linalg.generic { - args_in = 2 : i64, - args_out = 1 : i64, - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } %arg0, %0, %1 { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%arg0, %0 : memref<100x10xf32>, memref<100x10xf32>) + outs(%1 : memref<100x10xf32>) { ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): // no predecessors %2 = subf %arg3, %arg4 : f32 linalg.yield %2 : f32 - }: memref<100x10xf32>, memref<100x10xf32>, memref<100x10xf32> + } dealloc %0 : memref<100x10xf32> linalg.generic { - args_in = 1 : i64, - args_out = 1 : i64, - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], - iterator_types = ["parallel", "parallel"] - } %1, %arg2 { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%1 : memref<100x10xf32>) + outs(%arg2 : memref<100x10xf32>) { ^bb0(%arg3: f32, %arg4: f32): // no predecessors %2 = exp %arg3 : f32 linalg.yield %2 : f32 - }: memref<100x10xf32>, memref<100x10xf32> + } dealloc %1 : memref<100x10xf32> return } @@ -130,20 +136,26 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // ----- #map0 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#pointwise_4d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} +#pointwise_4d_trait = {indexing_maps = [#map0, #map0, #map0], + iterator_types = ["parallel", "parallel", "parallel", + "parallel"]} func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32>, %summand_2: memref<6x6x6x6xf32>, %result: memref<6x6x6x6xf32>) { %temp_result = alloc() : memref<6x6x6x6xf32> - linalg.generic #pointwise_4d_trait %summand_1, %summand_2, %temp_result { + linalg.generic #pointwise_4d_trait + ins(%summand_1, %summand_2 : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>) + outs(%temp_result : memref<6x6x6x6xf32>) { ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): %out = addf %summand_1_in, %summand_2_in : f32 linalg.yield %out : f32 - } : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>, memref<6x6x6x6xf32> - linalg.generic #pointwise_4d_trait %temp_result, %multiplier, %result { + } + linalg.generic #pointwise_4d_trait + ins(%temp_result, %multiplier : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>) + outs(%result : memref<6x6x6x6xf32>) { ^bb0(%temp_result_in: f32, %multiplier_in: f32, %result_in: f32): %out = mulf %temp_result_in, %multiplier_in : f32 linalg.yield %out : f32 - } : memref<6x6x6x6xf32>, memref<6x6x6x6xf32>, memref<6x6x6x6xf32> + } dealloc %temp_result : memref<6x6x6x6xf32> return } @@ -184,21 +196,26 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // ----- #map0 = affine_map<(d0, d1) -> (d0, d1)> -#pointwise_2d_trait = {args_in = 2, args_out = 1, indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", 
"parallel"]} +#pointwise_2d_trait = {indexing_maps = [#map0, #map0, #map0], + iterator_types = ["parallel", "parallel"]} func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, %summand_2: memref<6x6xf32>) -> memref<6x6xf32> { %temp_result = alloc() : memref<6x6xf32> - linalg.generic #pointwise_2d_trait %summand_1, %summand_2, %temp_result { + linalg.generic #pointwise_2d_trait + ins(%summand_1, %summand_2 : memref<6x6xf32>, memref<6x6xf32>) + outs(%temp_result : memref<6x6xf32>) { ^bb0(%summand_1_in: f32, %summand_2_in: f32, %temp_result_in: f32): %out = addf %summand_1_in, %summand_2_in : f32 linalg.yield %out : f32 - } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> + } %result = alloc() : memref<6x6xf32> - linalg.generic #pointwise_2d_trait %temp_result, %multiplier, %result { + linalg.generic #pointwise_2d_trait + ins(%temp_result, %multiplier : memref<6x6xf32>, memref<6x6xf32>) + outs(%result : memref<6x6xf32>) { ^bb0(%temp_result_in: f32, %multiplier_in: f32, %result_in: f32): %out = mulf %temp_result_in, %multiplier_in : f32 linalg.yield %out : f32 - } : memref<6x6xf32>, memref<6x6xf32>, memref<6x6xf32> + } dealloc %temp_result : memref<6x6xf32> return %result : memref<6x6xf32> } @@ -234,3 +251,51 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // PLOOP: addf // PLOOP: linalg.generic // PLOOP: mulf + +// ----- + +func @view_result(%arg0: memref, %arg1: memref, %arg2: index) + -> memref<*xf32> { + %c1 = constant 1 : index + %c0 = constant 0 : index + %1 = alloc(%arg2) : memref + linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, + affine_map<(d0) -> (d0)>], + iterator_types = ["parallel"]} + ins(%arg0 : memref) outs(%1 : memref) { + ^bb0(%arg3: f32, %arg4: f32): // no predecessors + %13 = absf %arg3 : f32 + linalg.yield %13 : f32 + } + %2 = lmhlo.reshape_memref_cast %1(%arg1) + : (memref, memref) -> memref<*xf32> + return %2 : memref<*xf32> +} + +// CHECK-LABEL: func @view_result +// CHECK: %[[C1:.*]] = constant 1 +// CHECK-NOT: linalg.generic +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for +// CHECK: linalg.generic +// CHECK: absf +// CHECK: reshape_memref_cast + +// TILED-LABEL: func @view_result +// TILED-DAG: %[[C2:.*]] = constant 2 +// TILED-NOT: linalg.generic +// TILED: scf.for {{.*}} step %[[C2]] +// TILED-NOT: scf.for +// TILED: linalg.generic +// TILED: absf +// TILED: reshape_memref_cast + + +// PLOOP-LABEL: func @view_result +// PLOOP-NOT: linalg.generic +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel +// PLOOP: linalg.generic +// PLOOP: absf +// PLOOP: reshape_memref_cast + diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-affine.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-affine.mlir index 87818045993..d020f7a083b 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-affine.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-affine.mlir @@ -158,7 +158,14 @@ func @float_dot_op(%lhs: memref<7x3xf32>, %rhs: // CHECK-NEXT: %[[ADD:.*]] = addf %[[MULT]], %[[RESULT]] : f32 // CHECK-NEXT: affine.store %[[ADD]], %{{.*}}[%[[I]], %[[J]]] : memref<7x4xf32> // CHECK: return - "lmhlo.dot"(%lhs, %rhs, %result) : + "lmhlo.dot"(%lhs, %rhs, %result) { + dot_dimension_numbers = { + lhs_batching_dimensions = dense<> : tensor<0xi64>, + rhs_batching_dimensions = dense<> : tensor<0xi64>, + lhs_contracting_dimensions = dense<1> : tensor<1xi64>, + rhs_contracting_dimensions = dense<0> : tensor<1xi64> + } + } : (memref<7x3xf32>, memref<3x4xf32>, memref<7x4xf32>) -> () 
return } @@ -175,7 +182,14 @@ func @int_dot_op(%lhs: memref<7x3xi32>, %rhs: // CHECK-NEXT: %[[ADD:.*]] = addi %[[MULT]], %[[RESULT]] : i32 // CHECK-NEXT: affine.store %[[ADD]], %{{.*}}[%[[I]], %[[J]]] : memref<7x4xi32> // CHECK: return - "lmhlo.dot"(%lhs, %rhs, %result) : + "lmhlo.dot"(%lhs, %rhs, %result) { + dot_dimension_numbers = { + lhs_batching_dimensions = dense<> : tensor<0xi64>, + rhs_batching_dimensions = dense<> : tensor<0xi64>, + lhs_contracting_dimensions = dense<1> : tensor<1xi64>, + rhs_contracting_dimensions = dense<0> : tensor<1xi64> + } + } : (memref<7x3xi32>, memref<3x4xi32>, memref<7x4xi32>) -> () return } diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir index f174b005a8d..47151089ccb 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir @@ -125,6 +125,20 @@ func @copy(%in: memref<2x4x8xf32>, %out: memref<2x4x8xf32>) { // ----- +// CHECK-LABEL: func @is_finte +func @is_finte(%input: memref<2x2xf32>, %result: memref<2x2xi1>) { + "lmhlo.is_finite"(%input, %result) : (memref<2x2xf32>, memref<2x2xi1>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[POS_INF:.+]] = constant 0x7F800000 : f32 +// CHECK-NEXT: %[[ABS_X:.+]] = absf %[[OPERAND_IN]] : f32 +// CHECK-NEXT: %[[RESULT:.+]] = cmpf "one", %[[ABS_X]], %[[POS_INF]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i1 + +// ----- + // CHECK-LABEL: func @float_cmp func @float_cmp(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xi1>) { @@ -263,7 +277,8 @@ func @static_broadcast_in_dim_expansion(%operand: memref<1x5xf32>, // CHECK: %[[RESHAPED_ARG:.*]] = linalg.reshape %{{.*}}#[[REASSOCIATION]]] // CHECK-SAME: memref<1x5xf32> into memref<5xf32> // CHECK: linalg.generic {{{.*}}indexing_maps = -// CHECK-SAME: [#[[OPERAND_MAP]], #[[RESULT_MAP]]]{{.*}} %[[RESHAPED_ARG]] +// CHECK-SAME: [#[[OPERAND_MAP]], #[[RESULT_MAP]]] +// CHECK-SAME: ins(%[[RESHAPED_ARG]] : // CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32, %[[RESULT:.*]]: f32): // CHECK-NEXT: linalg.yield %[[OPERAND]] : f32 @@ -496,6 +511,18 @@ func @sin(%input: memref<2x2xf32>, // ----- +// CHECK-LABEL: func @floor +func @floor(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { + "lmhlo.floor"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[RESULT:.*]] = floorf %[[OPERAND_IN]] : f32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + // CHECK-LABEL: func @negf func @negf(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { "lmhlo.negate"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () @@ -521,6 +548,19 @@ func @negi(%input: memref<2x2xi32>, %result: memref<2x2xi32>) { // ----- +// CHECK-LABEL: func @not +func @not(%input: memref<2x2xi64>, %result: memref<2x2xi64>) { + "lmhlo.not"(%input, %result) : (memref<2x2xi64>, memref<2x2xi64>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i64, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[N1:.*]] = constant -1 : i64 +// CHECK-NEXT: %[[RESULT:.*]] = xor %[[N1]], %[[OPERAND_IN]] : i64 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i64 + +// ----- + // CHECK-LABEL: func @rem func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -560,6 +600,37 @@ 
func @sign(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +// CHECK-LABEL: func @sign_bf16 +func @sign_bf16(%input: memref<2x2xbf16>, %result: memref<2x2xbf16>) { + "lmhlo.sign"(%input, %result) : (memref<2x2xbf16>, memref<2x2xbf16>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: bf16, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[CST:.*]] = constant 1.000000e+00 : bf16 +// CHECK-NEXT: %[[RESULT:.*]] = copysign %[[CST]], %[[OPERAND_IN]] : bf16 +// CHECK-NEXT: linalg.yield %[[RESULT]] : bf16 + +// ----- + +// CHECK-LABEL: func @sign_i16 +func @sign_i16(%input: memref<2x2xi16>, %result: memref<2x2xi16>) { + "lmhlo.sign"(%input, %result) : (memref<2x2xi16>, memref<2x2xi16>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i16, %[[RESULT_OUT:.*]]): +// CHECK-NEXT: %[[C0:.*]] = constant 0 : i16 +// CHECK-NEXT: %[[C15:.*]] = constant 15 : i16 +// CHECK-NEXT: %[[C1:.*]] = constant 1 : i16 +// CHECK-NEXT: %[[CMP:.*]] = cmpi "eq", %[[OPERAND_IN]], %[[C0]] : i16 +// CHECK-NEXT: %[[ASHR:.*]] = shift_right_signed %[[OPERAND_IN]], %[[C15]] : i16 +// CHECK-NEXT: %[[OR:.*]] = or %[[ASHR]], %[[C1]] : i16 +// CHECK-NEXT: %[[RESULT:.*]] = select %[[CMP]], %[[C0]], %[[OR]] : i16 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i16 + +// ----- + // CHECK-LABEL: func @sqrt func @sqrt(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { "lmhlo.sqrt"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () @@ -702,6 +773,32 @@ func @reshape_3D_4D(%arg0: memref<1x49x16xf32>, %arg1: memref<1x784x1x1xf32>) { // ----- +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape1_4D_4D +func @reshape1_4D_4D(%arg0: memref<4x512x1x1xi32>, + %arg1: memref<1x4x1x512xi32>) { + "lmhlo.reshape"(%arg0, %arg1) + : (memref<4x512x1x1xi32>, memref<1x4x1x512xi32>) -> () + return +} +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] + +// ----- + +// CHECK-DAG: #[[MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +// CHECK-LABEL: func @reshape2_4D_4D +func @reshape2_4D_4D(%arg0: memref<4x1x1x1024xi32>, + %arg1: memref<4x1024x1x1xi32>) { + "lmhlo.reshape"(%arg0, %arg1) + : (memref<4x1x1x1024xi32>, memref<4x1024x1x1xi32>) -> () + return +} +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] +// CHECK: linalg.reshape %{{.*}} [#[[MAP]]] + +// ----- + // CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> // CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @reverse @@ -736,3 +833,16 @@ func @conv(%input: memref<3x5x5x3xf32>, %filter: memref<2x2x3x4xf32>, %output: m "lmhlo.copy"(%0, %output) : (memref<3x5x5x4xf32>, memref<3x5x5x4xf32>) -> () "lmhlo.terminator"() : () -> () } + +// ----- + +// CHECK-DAG: #[[TRANSPOSE_INPUT_MAP:.*]] = affine_map<(d0, d1) -> (d1, d0)> +// CHECK-DAG: #[[TRANSPOSE_OUTPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @transpose +func @transpose(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) { + "lmhlo.transpose"(%arg0, %arg1) { + permutation = dense<[1, 0]> : tensor<2xi64> + } : (memref<2x2xf32>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[TRANSPOSE_INPUT_MAP]], #[[TRANSPOSE_OUTPUT_MAP]]] diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir new file mode 100644 index 00000000000..9e5ce67f39a --- /dev/null +++ 
b/tensorflow/compiler/mlir/hlo/tests/lhlo_gpu_ops.mlir @@ -0,0 +1,99 @@ +// RUN: mlir-hlo-opt %s -verify-diagnostics -split-input-file | mlir-hlo-opt | FileCheck %s + +// CHECK-LABEL: func @batch_norm_grad_memrefs +func @batch_norm_grad_memrefs(%arg0: memref<8x8x8x8xf32>, %arg1: memref<8xf32>, %arg2: memref<8xf32>, + %arg3: memref<8xf32>, %arg4: memref<8x8x8x8xf32>, + %grad_operand: memref<8x8x8x8xf32>, %grad_scale: memref<8xf32>, + %grad_offset: memref<8xf32>) -> () { + "lmhlo_gpu.batch_norm_grad"(%arg0, %arg1, %arg2, %arg3, %arg4, %grad_operand, %grad_scale, %grad_offset) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} + : (memref<8x8x8x8xf32>, memref<8xf32>, memref<8xf32>, memref<8xf32>, memref<8x8x8x8xf32>, + memref<8x8x8x8xf32>, memref<8xf32>, memref<8xf32>) -> () + return +} + +// CHECK-LABEL: func @batch_norm_inference_memrefs +func @batch_norm_inference_memrefs(%arg0: memref<8x8x8x8xf32>, %arg1: memref<8xf32>, %arg2: memref<8xf32>, + %arg3: memref<8xf32>, %arg4: memref<8xf32>, %arg_out: memref<8x8x8x8xf32>) -> () { + "lmhlo_gpu.batch_norm_inference"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg_out) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} + : (memref<8x8x8x8xf32>, memref<8xf32>, memref<8xf32>, memref<8xf32>, memref<8xf32>, memref<8x8x8x8xf32>) -> () + return +} + +// CHECK-LABEL: func @batch_norm_training_memrefs +func @batch_norm_training_memrefs(%arg0: memref<8x8x8x8xf32>, %arg1: memref<8xf32>, %arg2: memref<8xf32>, + %output: memref<8x8x8x8xf32>, %batch_mean: memref<8xf32>, + %batch_var: memref<8xf32>) -> () { + "lmhlo_gpu.batch_norm_training"(%arg0, %arg1, %arg2, %output, %batch_mean, %batch_var) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} + : (memref<8x8x8x8xf32>, memref<8xf32>, memref<8xf32>, memref<8x8x8x8xf32>, memref<8xf32>, memref<8xf32>) -> () + return +} + +// CHECK-LABEL: func @conv_forward +func @conv_forward(%input : memref<1x1x8x8xf16>, %filter: memref<1x1x2x2xf16>, %output: memref<1x1x7x7xf16>) { + %scratch = alloc() : memref<32xi8> + // This defines a 2D convolution over an 8x8 single-channel input using a 2x2 + // filter, with an output of 7x7xf16.
The 1x1x8x8 is (N, C, H, W) + "lmhlo_gpu.conv_forward"(%input, %filter, %output, %scratch) + { dimension_numbers = {input_batch_dimension = 0 : i64, + input_feature_dimension = 1 : i64, + input_spatial_dimensions = dense<[2,3]> : tensor<2xi64>, + kernel_input_feature_dimension = 0 : i64, + kernel_output_feature_dimension = 1 : i64, + kernel_spatial_dimensions = dense<[2,3]> : tensor<2xi64>, + output_batch_dimension = 0 : i64, + output_feature_dimension = 1 : i64, + output_spatial_dimensions = dense<[2,3]> : tensor<2xi64>}, + window_strides = dense<[1, 1]> : tensor<2xi64>, + padding = dense<[0,0]> : tensor<2xi64>, + lhs_dilation = dense<[1,1]> : tensor<2xi64>, + rhs_dilation = dense<[1,1]> : tensor<2xi64>, + feature_group_count = 1, + batch_group_count = 1, + result_scale = 1.0, + backend_config = {algorithm=0, tensor_ops_enabled = true } + } + : (memref<1x1x8x8xf16>, memref<1x1x2x2xf16>, memref<1x1x7x7xf16>, memref<32xi8>) -> () + return +} + +// ----- + +// CHECK-LABEL: func @gemm +func @gemm(%lhs: memref<5x4xf32>, %rhs: memref<4x5xf32>, %output:memref<5x5xf32>) { + "lmhlo_gpu.gemm"(%lhs, %rhs, %output) { dot_dimension_numbers = { + lhs_batching_dimensions = dense<[1,1]> : tensor<2xi64>, + rhs_batching_dimensions = dense<[1,1]> : tensor<2xi64>, + lhs_contracting_dimensions = dense<[1,1]> : tensor<2xi64>, + rhs_contracting_dimensions = dense<[1,1]> : tensor<2xi64>}, + alpha = 0.5, + batch_size = 1, + algorithm = 0} + : (memref<5x4xf32>, memref<4x5xf32>, memref<5x5xf32>) -> () + return +} + + +// CHECK-LABEL: func @gemm_bias +func @gemm_bias(%lhs: memref<5x4xf32>, %rhs: memref<4x5xf32>, + %bias: memref<5x5xf32>, %output:memref<5x5xf32>) { + "lmhlo_gpu.gemm_bias"(%lhs, %rhs, %bias, %output) { dot_dimension_numbers = { + lhs_batching_dimensions = dense<[1,1]> : tensor<2xi64>, + rhs_batching_dimensions = dense<[1,1]> : tensor<2xi64>, + lhs_contracting_dimensions = dense<[1,1]> : tensor<2xi64>, + rhs_contracting_dimensions = dense<[1,1]> : tensor<2xi64>}, + alpha = 0.5, + beta = 1.0, + batch_size = 1, + algorithm = 0} + : (memref<5x4xf32>, memref<4x5xf32>, memref<5x5xf32>, memref<5x5xf32>) -> () + return +} + +// CHECK-LABEL: func @cholesky +func @cholesky(%arg : memref<10x10xf32>, %out: memref<10x10xf32>) { + %scratch = alloc() : memref<32xi8> + %info = alloc() : memref<32xi32> + "lmhlo_gpu.cholesky"(%arg, %out, %scratch, %info) { is_upper = true } + : (memref<10x10xf32>, memref<10x10xf32>, memref<32xi8>, memref<32xi32>) -> () + return +} diff --git a/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir b/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir index a7bd21257a6..b9c91d61377 100644 --- a/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/lower-complex.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-hlo-opt %s -mhlo-test-chlo-legalize-to-hlo -mhlo-test-lower-complex | FileCheck %s +// RUN: mlir-hlo-opt %s -chlo-legalize-to-hlo -mhlo-test-lower-complex | FileCheck %s // CHECK-LABEL: @add func @add(%arg0 : tensor<2xf32>, %arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>, %arg3 : tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) { diff --git a/tensorflow/compiler/mlir/hlo/tests/mhlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/hlo/tests/mhlo_infer_shape_type_methods.mlir new file mode 100644 index 00000000000..d626f520824 --- /dev/null +++ b/tensorflow/compiler/mlir/hlo/tests/mhlo_infer_shape_type_methods.mlir @@ -0,0 +1,37 @@ +// RUN: mlir-hlo-opt --mhlo-test-infer-shaped-type-methods --allow-unregistered-dialect --split-input-file %s | 
FileCheck %s + +// ----- +// CHECK-LABEL: @select +// CHECK-SAME: (%[[PRED:.*]]: tensor<2x?xi1>, +func @select(%pred : tensor<2x?xi1>, %a : tensor<2x?xf32>, %b : tensor<2x?xf32>) + -> tensor<2xi64> { + // CHECK: %[[C2:.*]] = constant 2 : i64 + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: %[[DIM_AS_INDEX:.*]] = dim %[[PRED]], %[[C1]] : tensor<2x?xi1> + // CHECK: %[[DIM:.*]] = index_cast %[[DIM_AS_INDEX]] : index to i64 + // CHECK: %[[SHAPE:.*]] = tensor_from_elements %[[C2]], %[[DIM]] : tensor<2xi64> + // CHECK: return %[[SHAPE]] : tensor<2xi64> + %0 = "mhlo.select"(%pred, %a, %b) + : (tensor<2x?xi1>, tensor<2x?xf32>, tensor<2x?xf32>) -> tensor<2x?xf32> + %1 = "mhlo_test.reify_return_type_shapes"(%0) + : (tensor<2x?xf32>) -> tensor<2xi64> + return %1 : tensor<2xi64> +} + +// ----- +// CHECK-LABEL: @compare +// CHECK-SAME: (%[[A:.*]]: tensor<2x?xf32>, +func @compare(%a : tensor<2x?xf32>, %b : tensor<2x?xf32>) -> tensor<2xi64> { + // CHECK: %[[C2:.*]] = constant 2 : i64 + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: %[[DIM_AS_INDEX:.*]] = dim %[[A]], %[[C1]] : tensor<2x?xf32> + // CHECK: %[[DIM:.*]] = index_cast %[[DIM_AS_INDEX]] : index to i64 + // CHECK: %[[SHAPE:.*]] = tensor_from_elements %[[C2]], %[[DIM]] : tensor<2xi64> + // CHECK: return %[[SHAPE]] : tensor<2xi64> + %0 = "mhlo.compare"(%a, %b) { comparison_direction = "NE" } + : (tensor<2x?xf32>, tensor<2x?xf32>) -> tensor<2x?xi1> + %1 = "mhlo_test.reify_return_type_shapes"(%0) + : (tensor<2x?xi1>) -> tensor<2xi64> + return %1 : tensor<2xi64> +} + diff --git a/tensorflow/compiler/mlir/hlo/tests/ops.mlir b/tensorflow/compiler/mlir/hlo/tests/ops.mlir index a8f16c403ae..fb4ab62371f 100644 --- a/tensorflow/compiler/mlir/hlo/tests/ops.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/ops.mlir @@ -328,6 +328,14 @@ func @collective_permute_duplicate_sources(%arg0: tensor<128x32xf32>) -> tensor< // ----- +func @concat_0D(%arg0: tensor, %arg1: tensor) -> tensor<2xi32> { + // expected-error@+1 {{rank-0 values cannot be concatenated}} + %0 = "mhlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor, tensor) -> tensor<2xi32> + return %0 : tensor<2xi32> +} + +// ----- + // CHECK-LABEL: @concat_1D func @concat_1D(%arg0: tensor<1xi32>, %arg1: tensor<2xi32>) -> tensor<3xi32> { %0 = "mhlo.concatenate"(%arg0, %arg1) { dimension = 0 : i64 } : (tensor<1xi32>, tensor<2xi32>) -> tensor<3xi32> @@ -600,6 +608,14 @@ func @recv_non_token_second_result(%token: !mhlo.token) -> tuple // ----- +// CHECK-LABEL: func @replica_id +func @replica_id() -> tensor { + %0 = "mhlo.replica_id"() : () -> tensor + return %0 : tensor +} + +// ----- + func @rng_uniform_invalid_type(%mu: tensor>, %sigma: tensor) -> tensor<2x3x5xf32> { %shape = mhlo.constant dense<[2, 3, 5]> : tensor<3xi64> // expected-error@+1 {{but got 'tensor>'}} @@ -731,7 +747,7 @@ func @dynamic_slice_mismatch_element_types(%arg0: tensor<3x4xi32>, %arg1: tensor // ----- func @dynamic_slice_invalid_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { - // expected-error@+1 {{operand #1 must be a 0-dim integer tensor of 8/16/32/64-bit signless integer or 8/16/32/64-bit unsigned integer values, but got 'tensor<2xi64>'}} + // expected-error@+1 {{operand #1 must be 0D tensor of 8/16/32/64-bit signless integer or 8/16/32/64-bit unsigned integer values, but got 'tensor<2xi64>'}} %0 = "mhlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> return %0 : tensor<1x4xi32> } @@ -747,7 +763,7 @@ func 
@dynamic_update_slice(%input: tensor<3x4xi64>, %update: tensor<2xi64>, %sta // ----- func @dynamic_update_slice_invalid_start(%input: tensor<3x4xi64>, %update: tensor<2xi64>, %start: tensor<2xi64>) -> tensor<3x4xi64> { - // expected-error@+1 {{operand #2 must be a 0-dim integer tensor of 8/16/32/64-bit signless integer or 8/16/32/64-bit unsigned integer values, but got 'tensor<2xi64>'}} + // expected-error@+1 {{operand #2 must be 0D tensor of 8/16/32/64-bit signless integer or 8/16/32/64-bit unsigned integer values, but got 'tensor<2xi64>'}} %0 = "mhlo.dynamic-update-slice"(%input, %update, %start) : (tensor<3x4xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor<3x4xi64> return %0 : tensor<3x4xi64> } @@ -1002,34 +1018,34 @@ func @constant_invalid() -> () { func @sort(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi32>) { // CHECK: mhlo.sort - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } // ----- func @sort_no_operands() { - // expected-error @+1 {{op requires at least one input}} - %0 = "mhlo.sort"() ( { + // expected-error @+1 {{expected named operation to have atleast 1 result}} + %0:0 = "mhlo.sort"() ( { ^bb0(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): %7 = "mhlo.compare"(%arg1, %arg2) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : () -> tuple<> + }) {dimension = 1 : i64, is_stable = true} : () -> () return } // ----- func @sort_unknown_rank(%input0: tensor<*xf32>, %input1: tensor<16x16xi32>) { - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : (tensor<*xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<*xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } @@ -1037,23 +1053,23 @@ func @sort_unknown_rank(%input0: tensor<*xf32>, %input1: tensor<16x16xi32>) { func @sort_unknown_rank(%input0: tensor<*xf32>, %input1: tensor<16x16xi32>) { // expected-error @+1 {{comparator block argument #0 should be of type 'tensor' but got 'tensor'}} - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : (tensor<*xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<*xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } // ----- func @sort_different_dims(%input0: tensor<16x8xf32>, %input1: tensor<16x16xi32>) { - // expected-error @+1 {{op requires all inputs to have the same dimensions}} - %0 = "mhlo.sort"(%input0, %input1) ( { + // 
expected-error @+1 {{op requires the same shape for all operands and results}} + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : (tensor<16x8xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<16x8xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } @@ -1061,11 +1077,11 @@ func @sort_different_dims(%input0: tensor<16x8xf32>, %input1: tensor<16x16xi32>) func @sort_dim_out_of_range(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi32>) { // expected-error @+1 {{dimension attribute value must be in range [-2, 2), but found 10}} - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 10 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 10 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } @@ -1073,11 +1089,11 @@ func @sort_dim_out_of_range(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi3 func @sort_dim_out_of_range(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi32>) { // expected-error @+1 {{dimension attribute value must be in range [-2, 2), but found -3}} - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = -3 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = -3 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } @@ -1085,11 +1101,11 @@ func @sort_dim_out_of_range(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi3 func @sort_wrong_block_arg_count(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi32>) { // expected-error @+1 {{op comparator block should have 4 arguments}} - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } @@ -1097,11 +1113,11 @@ func @sort_wrong_block_arg_count(%input0: tensor<16x16xf32>, %input1: tensor<16x func @sort_wrong_block_arg_type(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi32>) { // expected-error @+1 {{op comparator block argument #3 should be of type 'tensor' but got 'tensor'}} - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 
: i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } @@ -1185,3 +1201,24 @@ func @incompatible_shapes(%arg0: tensor, %shape: tensor<2xindex>) -> tens %0 = "mhlo.dynamic_reshape"(%arg0, %shape) : (tensor, tensor<2xindex>) -> tensor return %0 : tensor } + +// ----- + +func @cbrt(%arg: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "mhlo.cbrt"(%arg) : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} + +// ----- + +func @bitcast(%arg: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "mhlo.bitcast"(%arg) : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} + +// ----- + +func @bitcast(%arg: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "mhlo.reduce_precision"(%arg) {exponent_bits=2 : i32, mantissa_bits=3 : i32} : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} diff --git a/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir index f903dbb7080..53ee94f8d1a 100644 --- a/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/unfuse_batch_norm.mlir @@ -109,7 +109,7 @@ func @batchNormInference_dynamic_shape( // CHECK-DAG: %[[C3:.*]] = constant 3 : index // CHECK-DAG: %[[EPS:.+]] = mhlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], %[[C0]] : tensor - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = tensor_from_elements(%[[DIM]]) : tensor<1xindex> + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = tensor_from_elements %[[DIM]] : tensor<1xindex> // CHECK-DAG: %[[EPS_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = mhlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "mhlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor @@ -117,7 +117,7 @@ func @batchNormInference_dynamic_shape( // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], %[[C1]] : tensor // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], %[[C2]] : tensor // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], %[[C3]] : tensor - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = tensor_from_elements(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : tensor<4xindex> + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = tensor_from_elements %[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]] : tensor<4xindex> // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[SCALE_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp index d0c0e3c51e1..ed96dd5ffd8 100644 --- a/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp +++ b/tensorflow/compiler/mlir/hlo/tools/mlir-hlo-opt/mlir-hlo-opt.cpp @@ -15,6 
+15,7 @@ limitations under the License. #include "mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "mlir-hlo/Dialect/mhlo/IR/lhlo_gpu_ops.h" #include "mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "mlir-hlo/Dialect/mhlo/transforms/register_passes.h" #include "mlir/InitAllDialects.h" @@ -31,6 +32,7 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); return failed( mlir::MlirOptMain(argc, argv, "MLIR HLO pass driver\n", registry)); diff --git a/tensorflow/compiler/mlir/init_mlir.cc b/tensorflow/compiler/mlir/init_mlir.cc index 54f8a57d8a6..fac9f51d8ba 100644 --- a/tensorflow/compiler/mlir/init_mlir.cc +++ b/tensorflow/compiler/mlir/init_mlir.cc @@ -20,6 +20,11 @@ limitations under the License. namespace tensorflow { InitMlir::InitMlir(int *argc, char ***argv) : init_llvm_(*argc, *argv) { + llvm::setBugReportMsg( + "TensorFlow crashed, please file a bug on " + "https://github.com/tensorflow/tensorflow/issues with the trace " + "below.\n"); + constexpr char kSeparator[] = "--"; // Find index of separator between two sets of flags. diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 2d3a58b5b9d..eff591895e1 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -1,3 +1,9 @@ +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_native_cc_binary") load( "//third_party/mlir:tblgen.bzl", @@ -37,6 +43,7 @@ filegroup( gentbl( name = "tensorflow_lite_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls", @@ -68,6 +75,7 @@ gentbl( gentbl( name = "tensorflow_lite_op_interfaces_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-interface-decls", @@ -87,6 +95,7 @@ gentbl( gentbl( name = "tensorflow_lite_prepare_tf_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -105,6 +114,7 @@ gentbl( gentbl( name = "tensorflow_lite_lower_static_tensor_list_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -122,6 +132,7 @@ gentbl( gentbl( name = "tensorflow_lite_legalize_tf_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -139,6 +150,7 @@ gentbl( gentbl( name = "tensorflow_lite_optimize_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -157,6 +169,7 @@ gentbl( gentbl( name = "tensorflow_lite_quantize_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -173,6 +186,7 @@ gentbl( gentbl( name = "tensorflow_lite_post_quantize_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -280,6 +294,28 @@ cc_library( ], ) +cc_library( + name = "nms_utils", + srcs = [ + "utils/nms_utils.cc", + ], + hdrs = [ + "utils/nms_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/core:framework", + "@flatbuffers", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + 
"@llvm-project//mlir:Support", + ], +) + cc_library( name = "tftext_utils", srcs = [ @@ -373,6 +409,7 @@ cc_library( deps = [ ":constant_utils", ":lstm_utils", + ":nms_utils", ":stateful_ops_utils", ":tensorflow_lite", ":tftext_utils", @@ -384,6 +421,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tensorflow:unroll_batch_matmul_pass", @@ -439,7 +477,6 @@ cc_library( "transforms/default_quant_params.cc", "transforms/generated_post_quantize.inc", "transforms/generated_quantize.inc", - "transforms/load_quantization_recipe.cc", "transforms/post_quantize.cc", "transforms/prepare_quantize.cc", "transforms/quantize.cc", @@ -498,6 +535,7 @@ filegroup( gentbl( name = "op_quant_spec_getters_inc", + compatible_with = get_compatible_with_cloud(), tbl_outs = [("", "utils/generated_op_quant_spec_getters.inc")], tblgen = "//tensorflow/compiler/mlir/lite/quantization:op_quant_spec_getters_gen", td_file = "ir/tfl_ops.td", @@ -509,19 +547,6 @@ gentbl( ], ) -# Library with tensorflow Lite dialect static initialization. -cc_library( - name = "tensorflow_lite_dialect_registration", - srcs = [ - "ir/dialect_registration.cc", - ], - deps = [ - ":tensorflow_lite", - "@llvm-project//mlir:IR", - ], - alwayslink = 1, -) - tf_native_cc_binary( name = "converter-gen", srcs = [ @@ -536,6 +561,7 @@ tf_native_cc_binary( gentbl( name = "converter_inc", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "--gen-operator-converters", @@ -628,12 +654,10 @@ cc_library( ":flatbuffer_tflite_operator_lib", ":stateful_ops_utils", ":tensorflow_lite", - ":tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_tensor", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:protos_all_cc", @@ -645,6 +669,7 @@ cc_library( "//tensorflow/lite/delegates/flex:allowlisted_flex_ops_lib", "//tensorflow/lite/kernels/internal:kernel_utils", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/schema:schema_utils", "//tensorflow/lite/tools/versioning", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", @@ -672,7 +697,6 @@ cc_library( ":convert_type", ":flatbuffer_tflite_operator_lib", ":tensorflow_lite", - ":tensorflow_lite_dialect_registration", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", @@ -682,6 +706,7 @@ cc_library( "//tensorflow/core/platform:status", "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/schema:schema_utils", "@com_google_absl//absl/base", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -867,7 +892,6 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", - 
"//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", diff --git a/tensorflow/compiler/mlir/lite/converter_gen.cc b/tensorflow/compiler/mlir/lite/converter_gen.cc index edead2037a3..44eba0d5e6f 100644 --- a/tensorflow/compiler/mlir/lite/converter_gen.cc +++ b/tensorflow/compiler/mlir/lite/converter_gen.cc @@ -513,7 +513,7 @@ static bool RuntimeVerifierWriterMain(raw_ostream &os, RecordKeeper &records) { continue; } if (trait.getDef().getValueAsString("trait") != - "OpTrait::TFLRuntimeOpTrait") { + "::mlir::OpTrait::TFLRuntimeOpTrait") { continue; } diff --git a/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD b/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD index 373c95f6bf5..3b80b871790 100644 --- a/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/estimators/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = [ "//visibility:public", diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 34200fb88b6..a98e83b7e1e 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -75,6 +75,7 @@ limitations under the License. #include "tensorflow/lite/delegates/flex/allowlisted_flex_ops.h" #include "tensorflow/lite/kernels/internal/kernel_utils.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/schema/schema_utils.h" #include "tensorflow/lite/string_util.h" #include "tensorflow/lite/tools/versioning/op_version.h" #include "tensorflow/lite/tools/versioning/runtime_version.h" @@ -325,6 +326,21 @@ static Optional GetTflitePoolParams(Operation* inst, namespace { +// Helper struct that wraps inputs/outputs of a single SignatureDef. +struct SignatureDefData { + // Note, we are using maps here to make order deterministic + // for easily testing only. + + // Inputs defined in the signature def mapped to tensor names. + std::map inputs; + // Outputs defined in the signature def mapped to tensor names. + std::map outputs; + // Method name exported by the signature def. + std::string method_name; + // SignatureDef key. + std::string signature_def_key; +}; + // Translates an MLIR module in TFLite dialect to TFLite FlatBuffer. class Translator { public: @@ -333,16 +349,19 @@ class Translator { // internal error. static Optional Translate( ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops, OpOrArgNameMapper* op_or_arg_name_mapper); + bool emit_custom_ops, const std::unordered_set& tags, + OpOrArgNameMapper* op_or_arg_name_mapper); private: enum class OpType : char { kTfliteBuiltin, kSelectTf, kCustomOp }; explicit Translator(ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + const std::unordered_set& saved_model_tags, OpOrArgNameMapper* op_or_arg_name_mapper) : module_(module), name_mapper_(*op_or_arg_name_mapper), - builder_(kInitialBufferSize) { + builder_(kInitialBufferSize), + saved_model_tags_(saved_model_tags) { // The first buffer must be empty according to the schema definition. 
empty_buffer_ = tflite::CreateBuffer(builder_); buffers_.push_back(empty_buffer_); @@ -449,6 +468,17 @@ class Translator { Optional>> CreateMetadataVector(); + // Builds and returns list of tfl.SignatureDef sections in the model. + Optional>> + CreateSignatureDefs(const std::vector& signature_defs); + + // Returns list of offsets for the passed 'items' in TensorMap structure + // inside the flatbuffer. + // 'items' is a map from tensor name in signatureDef to tensor name in + // the model. + std::vector> GetList( + const std::map& items); + // Uses the tf.entry_function attribute (if set) to initialize the op to name // mapping. void InitializeNamesFromAttribute(FuncOp fn, bool* has_input_attr); @@ -471,6 +501,8 @@ class Translator { BufferOffset empty_buffer_; std::vector> buffers_; + // Maps tensor name in the graph to the tensor index. + absl::flat_hash_map tensor_index_map_; // Maps op name to index of the corresponding OperatorCode in opcodes_ vector. absl::flat_hash_map opcode_index_map_; @@ -489,6 +521,9 @@ class Translator { // The failed ops during legalization. std::set failed_flex_ops_; std::set failed_custom_ops_; + + // Set of saved model tags, if any. + const std::unordered_set saved_model_tags_; }; std::string Translator::UniqueName(mlir::Value val) { @@ -1130,6 +1165,7 @@ Optional> Translator::BuildSubGraph( } tensor_index_map.insert({value, tensors.size()}); + tensor_index_map_[name] = tensors.size(); auto tensor_or = BuildTensor(value, name, buffers_.size()); if (!tensor_or) return false; tensors.push_back(*tensor_or); @@ -1285,6 +1321,149 @@ Translator::CreateMetadataVector() { return builder_.CreateVector(metadata); } +// Helper method that returns list of all strings in a StringAttr identified +// by 'attr_key' and values are separated by a comma. +llvm::SmallVector GetStringsFromAttrWithSeparator( + mlir::DictionaryAttr attr, const std::string& attr_key) { + llvm::SmallVector result; + if (auto str = attr.get(attr_key).dyn_cast_or_null()) { + str.getValue().split(result, ',', /*MaxSplit=*/-1, + /*KeepEmpty=*/false); + } + return result; +} + +// Helper method that return list of string for all the StringAttr in the +// Attribute identified by 'attr_name'. +std::vector GetStringsFromDictionaryAttr( + const llvm::SmallVector& dict_attrs, + const std::string& attr_name) { + std::vector result; + for (const auto& arg_attr : dict_attrs) { + auto attrs = arg_attr.getAttrs(); + for (const auto attr : attrs) { + if (attr.first.str() == attr_name) { + auto array_attr = attr.second.dyn_cast_or_null(); + if (!array_attr || array_attr.empty()) continue; + auto string_attr = array_attr[0].dyn_cast_or_null(); + if (!string_attr) continue; + result.push_back(string_attr.getValue().str()); + } + } + } + return result; +} + +std::vector BuildSignaturedef( + FuncOp main_op, const std::string& saved_model_tag) { + static const char kSignatureDefIndexPath[] = "tf_saved_model.index_path"; + static const char kEntryFunctionAttributes[] = "tf.entry_function"; + + // Fetch inputs and outputs from the signature. + llvm::SmallVector arg_attrs, res_attrs; + main_op.getAllArgAttrs(arg_attrs); + main_op.getAllResultAttrs(res_attrs); + std::vector sig_def_inputs = + GetStringsFromDictionaryAttr(arg_attrs, kSignatureDefIndexPath); + std::vector sig_def_outputs = + GetStringsFromDictionaryAttr(res_attrs, kSignatureDefIndexPath); + + // If no defined saved model signature, then return empty list. + // This can happen when we are converting model not from SavedModel. 
+ if (sig_def_inputs.empty() || sig_def_outputs.empty()) return {}; + + // Fetch function inputs and outputs tensor names. + auto dict_attr = + main_op.getAttrOfType(kEntryFunctionAttributes); + if (!dict_attr) return {}; + + // Get Input and output tensor names from attribute. + llvm::SmallVector input_names = + GetStringsFromAttrWithSeparator(dict_attr, /*attr_key=*/"inputs"); + llvm::SmallVector output_names = + GetStringsFromAttrWithSeparator(dict_attr, /*attr_key=*/"outputs"); + + // Verify input size match the number of arguments. + if (input_names.size() != main_op.getNumArguments()) { + main_op.emitWarning() << "invalid entry function specification"; + return {}; + } + // Verify output size match the number of arguments. + auto term = main_op.back().getTerminator(); + if (output_names.size() != term->getNumOperands()) { + main_op.emitWarning() << "output names (" << output_names.size() + << ") != terminator operands (" + << term->getNumOperands() << ")"; + return {}; + } + // Verify number of tensors for inputs and outputs matches size + // of the list in the signature def. + if (input_names.size() != sig_def_inputs.size() || + output_names.size() != sig_def_outputs.size()) { + main_op.emitWarning( + "Mismatch between signature def inputs/outputs and main function " + "arguments."); + return {}; + } + // Exported method name. + auto exported_name = + main_op.getAttrOfType("tf_saved_model.exported_names"); + if (exported_name.empty()) { + main_op.emitError("Empty exported names for main Function"); + return {}; + } + // Fill the SignatureDefData container. + // We create vector of size 1 as TFLite now supports only 1 signatureDef. + std::vector result(1); + for (int i = 0; i < input_names.size(); ++i) { + result[0].inputs[sig_def_inputs[i]] = input_names[i].str(); + } + for (int i = 0; i < output_names.size(); ++i) { + result[0].outputs[sig_def_outputs[i]] = output_names[i].str(); + } + if (auto name_attr = exported_name[0].dyn_cast_or_null()) + result[0].method_name = name_attr.getValue().str(); + result[0].signature_def_key = saved_model_tag; + return result; +} + +std::vector> Translator::GetList( + const std::map& items) { + std::vector> result; + for (const auto& item : items) { + auto name_buf = builder_.CreateString(item.first); + tflite::TensorMapBuilder tensor_map_builder(builder_); + tensor_map_builder.add_name(name_buf); + tensor_map_builder.add_tensor_index(tensor_index_map_[item.second]); + result.push_back(tensor_map_builder.Finish()); + } + return result; +} + +Optional>> +Translator::CreateSignatureDefs( + const std::vector& signature_defs) { + std::vector> signature_defs_buffer; + for (const auto& signature_def_data : signature_defs) { + auto inputs = GetList(signature_def_data.inputs); + auto outputs = GetList(signature_def_data.outputs); + auto inputs_buf = builder_.CreateVector(inputs); + auto outputs_buf = builder_.CreateVector(outputs); + auto method_name_buf = + builder_.CreateString(signature_def_data.method_name); + auto signature_def_key_buf = + builder_.CreateString(signature_def_data.signature_def_key); + tflite::SignatureDefBuilder sig_def_builder(builder_); + sig_def_builder.add_inputs(inputs_buf); + sig_def_builder.add_outputs(outputs_buf); + sig_def_builder.add_method_name(method_name_buf); + sig_def_builder.add_key(signature_def_key_buf); + signature_defs_buffer.push_back(sig_def_builder.Finish()); + } + + return builder_.CreateVector(signature_defs_buffer); +} + bool UpdateEntryFunction(ModuleOp module) { if (module.lookupSymbol("main") != nullptr) 
{ // We already have an entry function. @@ -1311,11 +1490,12 @@ bool UpdateEntryFunction(ModuleOp module) { Optional Translator::Translate( ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops, OpOrArgNameMapper* op_or_arg_name_mapper) { + bool emit_custom_ops, const std::unordered_set& tags, + OpOrArgNameMapper* op_or_arg_name_mapper) { if (!UpdateEntryFunction(module)) return llvm::None; if (!IsValidTFLiteMlirModule(module)) return llvm::None; Translator translator(module, emit_builtin_tflite_ops, emit_select_tf_ops, - emit_custom_ops, op_or_arg_name_mapper); + emit_custom_ops, tags, op_or_arg_name_mapper); return translator.TranslateInternal(); } @@ -1391,10 +1571,17 @@ Optional Translator::TranslateInternal() { auto metadata = CreateMetadataVector(); if (!metadata) return llvm::None; - auto model = tflite::CreateModel( - builder_, TFLITE_SCHEMA_VERSION, builder_.CreateVector(opcodes_), - builder_.CreateVector(subgraphs), description, - builder_.CreateVector(buffers_), metadata_buffer, *metadata); + // Build SignatureDef + // We only have 1 entry point 'main' function, so build only 1 signature def. + auto main_fn_signature_def = BuildSignaturedef( + main_fn, saved_model_tags_.empty() ? "" : *saved_model_tags_.begin()); + auto signature_defs = CreateSignatureDefs(main_fn_signature_def); + + auto model = tflite::CreateModel(builder_, TFLITE_SCHEMA_VERSION, + builder_.CreateVector(opcodes_), + builder_.CreateVector(subgraphs), + description, builder_.CreateVector(buffers_), + metadata_buffer, *metadata, *signature_defs); tflite::FinishModelBuffer(builder_, model); tflite::UpdateOpVersion(builder_.GetBufferPointer()); tflite::UpdateMinimumRuntimeVersionForModel(builder_.GetBufferPointer()); @@ -1518,12 +1705,10 @@ bool tflite::MlirToFlatBufferTranslateFunction( ModuleOp module, std::string* serialized_flatbuffer, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, OpOrArgNameMapper* op_or_arg_name_mapper) { - auto maybe_translated = - Translator::Translate(module, emit_builtin_tflite_ops, emit_select_tf_ops, - emit_custom_ops, op_or_arg_name_mapper); - if (!maybe_translated) return true; - *serialized_flatbuffer = std::move(*maybe_translated); - return false; + return MlirToFlatBufferTranslateFunction( + module, serialized_flatbuffer, emit_builtin_tflite_ops, + emit_select_tf_ops, emit_custom_ops, /*saved_model_tags=*/{}, + op_or_arg_name_mapper); } bool tflite::MlirToFlatBufferTranslateFunction( @@ -1533,5 +1718,30 @@ bool tflite::MlirToFlatBufferTranslateFunction( OpOrArgLocNameMapper op_or_arg_name_mapper; return MlirToFlatBufferTranslateFunction( module, serialized_flatbuffer, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops, &op_or_arg_name_mapper); + emit_select_tf_ops, emit_custom_ops, /*saved_model_tags=*/{}, + &op_or_arg_name_mapper); +} + +bool tflite::MlirToFlatBufferTranslateFunction( + mlir::ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + const std::unordered_set& saved_model_tags) { + OpOrArgLocNameMapper op_or_arg_name_mapper; + return MlirToFlatBufferTranslateFunction( + module, serialized_flatbuffer, emit_builtin_tflite_ops, + emit_select_tf_ops, emit_custom_ops, saved_model_tags, + &op_or_arg_name_mapper); +} + +bool tflite::MlirToFlatBufferTranslateFunction( + mlir::ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + const 
std::unordered_set& saved_model_tags, + OpOrArgNameMapper* op_or_arg_name_mapper) { + auto maybe_translated = Translator::Translate( + module, emit_builtin_tflite_ops, emit_select_tf_ops, emit_custom_ops, + saved_model_tags, op_or_arg_name_mapper); + if (!maybe_translated) return true; + *serialized_flatbuffer = std::move(*maybe_translated); + return false; } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.h b/tensorflow/compiler/mlir/lite/flatbuffer_export.h index 0fbf2f07dfb..0888d2a4a41 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ #include +#include #include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" @@ -33,11 +34,24 @@ bool MlirToFlatBufferTranslateFunction(mlir::ModuleOp module, bool emit_select_tf_ops, bool emit_custom_ops); +// Same as above but takes SavedModel tags of the model. +bool MlirToFlatBufferTranslateFunction( + mlir::ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + const std::unordered_set& saved_model_tags); + // Same as the above but with a custom op name mapper. bool MlirToFlatBufferTranslateFunction( mlir::ModuleOp module, std::string* serialized_flatbuffer, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper); + +// Same as above but takes SavedModel tags of the model. +bool MlirToFlatBufferTranslateFunction( + mlir::ModuleOp module, std::string* serialized_flatbuffer, + bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, + const std::unordered_set& saved_model_tags, + tensorflow::OpOrArgNameMapper* op_or_arg_name_mapper); } // namespace tflite #endif // TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_EXPORT_H_ diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 230383729c4..7d64e268063 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -75,6 +75,7 @@ limitations under the License. 
#include "tensorflow/core/platform/status.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/schema/schema_utils.h" using llvm::ArrayRef; using mlir::Builder; @@ -271,18 +272,18 @@ StatusOr GetMlirOpName(const tflite::OperatorT& op, return std::string("tfl.basic_lstm"); } - if (op_code.builtin_code == tflite::BuiltinOperator_CUSTOM) { + auto builtin_code = tflite::GetBuiltinCode(&op_code); + if (builtin_code == tflite::BuiltinOperator_CUSTOM) { return std::string("tfl.custom"); } - if (op_code.builtin_code == tflite::BuiltinOperator_IF) { + if (builtin_code == tflite::BuiltinOperator_IF) { return std::string("tf.If"); } - if (op_code.builtin_code == tflite::BuiltinOperator_WHILE) { + if (builtin_code == tflite::BuiltinOperator_WHILE) { return std::string("tf.While"); } - llvm::StringRef op_name( - tflite::EnumNameBuiltinOperator(op_code.builtin_code)); + llvm::StringRef op_name(tflite::EnumNameBuiltinOperator(builtin_code)); return llvm::Twine("tfl.", op_name.lower()).str(); } @@ -637,7 +638,8 @@ StatusOr ConvertOp( } llvm::SmallVector attrs; - if (op_code.builtin_code == tflite::BuiltinOperator_CUSTOM) { + auto builtin_code = tflite::GetBuiltinCode(&op_code); + if (builtin_code == tflite::BuiltinOperator_CUSTOM) { auto status = mlir::CustomOptionsToAttributes( op_code.custom_code, op.custom_options, builder, loc, &attrs); if (!status.ok()) { @@ -784,7 +786,7 @@ static StatusOr PostProcessFuncOp(FuncOp func) { auto new_output_type = new_qtype.castFromExpressedType( mlir::quant::UniformQuantizedType::castToExpressedType( value.getType())); - builder.setInsertionPointAfter(cst); + builder.setInsertionPointAfter(cst.getOperation()); auto new_op = builder.create( cst.getLoc(), new_output_type, mlir::TypeAttr::get(new_output_type), cst.valueAttr()); diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 5accb419e83..60fd1160be2 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -127,12 +127,12 @@ static tflite::TensorType ConvertDerivedTypeAttrForOptionWriter( // I32Attr already returns an int as required by flatbuffer builders. static int ConvertI32AttrForOptionWriter( - llvm::APInt i, flatbuffers::FlatBufferBuilder* builder) { - return i.getSExtValue(); + int i, flatbuffers::FlatBufferBuilder* builder) { + return i; } static int ConvertPositiveI32AttrForOptionWriter( - llvm::APInt i, flatbuffers::FlatBufferBuilder* builder) { + int i, flatbuffers::FlatBufferBuilder* builder) { return ConvertI32AttrForOptionWriter(i, builder); } diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 403b3dd18ad..2894af9b97e 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -41,10 +41,10 @@ limitations under the License. 
#include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.cc.inc" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { -#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.cc.inc" namespace TFL { // Returns true when the given operand arguments have the same shape or @@ -569,7 +569,7 @@ namespace { int64_t GetConcatenationOpAxis(ConcatenationOp op) { auto output_type = op.output().getType().cast(); - int64_t axis = op.axis().getSExtValue(); + int32_t axis = op.axis(); if (axis < 0) axis += output_type.getRank(); return axis; } @@ -1027,13 +1027,13 @@ static LogicalResult Verify(PackOp op) { // Check axis bounds. if (input_type.hasRank()) { - int64_t axis_value = op.axis().getSExtValue(); + int32_t axis_value = op.axis(); if (axis_value < 0) axis_value += input_type.getRank() + 1; if (axis_value < 0 || axis_value >= input_type.getRank() + 1) return op.emitOpError() << "op attribute 'axis' should be in range [-rank - 1, rank + 1), " << "got rank = " << input_type.getRank() - << ", and axis = " << op.axis().getSExtValue(); + << ", and axis = " << op.axis(); } // Make sure all inputs have the same shape and element type. @@ -1545,7 +1545,7 @@ static LogicalResult VerifySplitOpOutputTypes( } static LogicalResult Verify(SplitOp op) { - int64_t num_splits = op.num_splits().getSExtValue(); + int64_t num_splits = op.num_splits(); if (op.getNumResults() != num_splits) return op.emitOpError("output count should match 'num_splits' attribute"); @@ -1581,7 +1581,7 @@ static LogicalResult Verify(SplitOp op) { } static LogicalResult Verify(SplitVOp op) { - int64_t num_splits = op.num_splits().getSExtValue(); + int64_t num_splits = op.num_splits(); if (op.getNumResults() != num_splits) return op.emitOpError("output count should match 'num_splits' attribute"); @@ -2377,8 +2377,16 @@ LogicalResult WhileOp::moveOutOfLoop(llvm::ArrayRef ops) { //===----------------------------------------------------------------------===// #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.cc.inc" + +} // namespace TFL +} // namespace mlir + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.cc.inc" + +namespace mlir { +namespace TFL { + #include "tensorflow/compiler/mlir/lite/runtime_verifiers.inc" Operation *TensorFlowLiteDialect::materializeConstant(OpBuilder &builder, diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index d2d8442155b..589f18d789d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -30,11 +30,11 @@ limitations under the License. 
#include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.h.inc" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/lite/schema/schema_generated.h" namespace mlir { -#include "tensorflow/compiler/mlir/lite/ir/tfl_structs.h.inc" namespace TFL { class TensorFlowLiteDialect : public Dialect { @@ -50,10 +50,11 @@ class TensorFlowLiteDialect : public Dialect { }; #include "tensorflow/compiler/mlir/lite/ir/tfl_ops_interface.h.inc" -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" } // end namespace TFL } // end namespace mlir +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_LITE_IR_TFL_OPS_H_ diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index f1cdfec631d..f7ee323957d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -39,7 +39,7 @@ def TFL_Dialect : Dialect { represented using zero-dimensional tensors); }]; - let cppNamespace = "TFL"; + let cppNamespace = "::mlir::TFL"; } //===----------------------------------------------------------------------===// @@ -385,28 +385,27 @@ def BinaryOpSameElementTypeConstraint : //===----------------------------------------------------------------------===// def TFL_BroadcastableBinaryBuilder : OpBuilder< - "OpBuilder &builder, OperationState &result, Value lhs, Value rhs", + "Value lhs, Value rhs", [{ auto resultType = OpTrait::util::getBroadcastedType(lhs.getType(), rhs.getType()); if (!resultType) - mlir::emitError(result.location, "non-broadcastable operands"); - result.addOperands({lhs, rhs}); - result.types.push_back(resultType); + mlir::emitError($_state.location, "non-broadcastable operands"); + $_state.addOperands({lhs, rhs}); + $_state.types.push_back(resultType); }]>; def TFL_FusedBroadcastableBinaryBuilder : OpBuilder< - "OpBuilder &builder, OperationState &result, Value lhs, Value rhs, " - "StringAttr fusedActivationFunction", + "Value lhs, Value rhs, StringAttr fusedActivationFunction", [{ buildFusedBroadcastableBinOp( - &builder, result, lhs, rhs, fusedActivationFunction); + &$_builder, $_state, lhs, rhs, fusedActivationFunction); }]>; def TFL_ComparisonBinaryBuilder : OpBuilder< - "OpBuilder &builder, OperationState &result, Value lhs, Value rhs", + "Value lhs, Value rhs", [{ - buildComparisonBinOp(&builder, result, lhs, rhs); + buildComparisonBinOp(&$_builder, $_state, lhs, rhs); }]>; //===----------------------------------------------------------------------===// @@ -520,7 +519,11 @@ def TFL_AddOp : TFL_Op<"add", [ let hasOptions = 1; } -def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_AddNOp : TFL_Op<"add_n", [ + Commutative, + NoSideEffect, + SameOperandsAndResultsScale, + NoQuantizableResult]> { let summary = "add_n operator"; let description = [{ @@ -536,7 +539,9 @@ def TFL_AddNOp : TFL_Op<"add_n", [Commutative, NoSideEffect, SameOperandsAndResu ); } -def TFL_ReduceAnyOp : TFL_Op<"reduce_any", [NoSideEffect]> { +def TFL_ReduceAnyOp : TFL_Op<"reduce_any", [ + NoSideEffect, + NoQuantizableResult]> { let summary = [{ Computes the "logical or" of elements across dimensions of a tensor. 
}]; @@ -693,7 +698,8 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { def TFL_CeilOp: TFL_Op<"ceil", [ NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultType]> { + SameOperandsAndResultType, + NoQuantizableResult]> { let summary = "Ceil operator"; let description = [{ @@ -720,14 +726,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let arguments = ( ins TFL_VariadicTensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, UI8]>:$values, + [F32, I64, I32, I16, I8, QI8, QUI8, UI8, I1]>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs TFL_TensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, UI8]>:$output + [F32, I64, I32, I16, I8, QI8, QUI8, UI8, I1]>:$output ); let hasOptions = 1; @@ -765,10 +771,10 @@ def TFL_ConstOp : Op ]; } @@ -817,13 +823,12 @@ def TFL_SparseConstOp : Op ]; } @@ -889,7 +894,7 @@ def TFL_DepthwiseConv2DOp : let extraClassDeclaration = [{ // AffineQuantizedOpInterface: int GetChannelDimIndex() { return 3; } - int GetQuantizationDimIndex() { return 3; } + int GetQuantizationDimIndex() { return 3; } // SparseOpInterface: std::vector GetSparseOperands() { return {1}; } std::vector> GetFloatBlockSize() { return {}; } @@ -1002,9 +1007,8 @@ def TFL_GatherOp : TFL_Op<"gather", [ let builders = [ - OpBuilder<"OpBuilder &builder, OperationState &result, " - "Value params, Value indices, IntegerAttr axis", - [{ BuildGatherOp(&builder, result, params, indices, axis); }]> + OpBuilder<"Value params, Value indices, IntegerAttr axis", + [{ BuildGatherOp(&$_builder, $_state, params, indices, axis); }]> ]; let results = (outs @@ -1093,7 +1097,8 @@ def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", [ TFL_OperandHasRank<0, 4>, SameOperandsAndResultShape, SameOperandsAndResultType, - NoSideEffect]> { + NoSideEffect, + NoQuantizableResult]> { let summary = "Local Response Normalization."; let description = [{ @@ -1220,7 +1225,8 @@ def TFL_NonMaxSuppressionV4Op : TFL_Op<"non_max_suppression_v4", [ TFL_OperandHasRank<1, 1>, // Other operands are scalar params. TFL_OperandHasRank<2, 0>, TFL_OperandHasRank<3, 0>, - TFL_OperandHasRank<4, 0>]> { + TFL_OperandHasRank<4, 0>, + NoQuantizableResult]> { let summary = [{ Greedily selects a subset of bounding boxes in descending order of score, }]; @@ -1269,7 +1275,8 @@ def TFL_NonMaxSuppressionV5Op : TFL_Op<"non_max_suppression_v5", [ TFL_OperandHasRank<1, 1>, // Other operands are scalar params. 
TFL_OperandHasRank<2, 0>, TFL_OperandHasRank<3, 0>, - TFL_OperandHasRank<4, 0>, TFL_OperandHasRank<5, 0>]> { + TFL_OperandHasRank<4, 0>, TFL_OperandHasRank<5, 0>, + NoQuantizableResult]> { let summary = [{ Greedily selects a subset of bounding boxes in descending order of score, }]; @@ -1336,10 +1343,9 @@ def TFL_NotEqualOp : TFL_Op<"not_equal", [ let builders = [ - OpBuilder< - "OpBuilder &builder, OperationState &result, Value lhs, Value rhs", + OpBuilder<"Value lhs, Value rhs", [{ - buildComparisonBinOp(&builder, result, lhs, rhs); + buildComparisonBinOp(&$_builder, $_state, lhs, rhs); }]> ]; @@ -1383,7 +1389,8 @@ def TFL_DivOp : TFL_Op<"div", [ def TFL_EluOp: TFL_Op<"elu", [ NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultType]> { + SameOperandsAndResultType, + NoQuantizableResult]> { let summary = "Exponential Linear Unit operator"; let description = [{ Computes the exponential linear @@ -1441,8 +1448,10 @@ def TFL_EqualOp: TFL_Op<"equal", [ let builders = [TFL_ComparisonBinaryBuilder]; } -def TFL_ExpOp: TFL_Op<"exp", [NoSideEffect, - SameOperandsAndResultType]> { +def TFL_ExpOp: TFL_Op<"exp", [ + NoSideEffect, + SameOperandsAndResultType, + NoQuantizableResult]> { let summary = "Natural exponentiation operator"; let description = [{ @@ -1546,7 +1555,8 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] def TFL_FillOp: TFL_Op<"fill", [ NoSideEffect, PredOpTrait<"input and result must have same element type", - TFL_TCresVTEtIsSameAsOp<0, 1>>]> { + TFL_TCresVTEtIsSameAsOp<0, 1>>, + NoQuantizableResult]> { let summary = "Fill the tensor with given value."; let description = [{ Fill the tensor with given value. @@ -1563,7 +1573,8 @@ def TFL_FillOp: TFL_Op<"fill", [ def TFL_FloorOp: TFL_Op<"floor", [ NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultType]> { + SameOperandsAndResultType, + NoQuantizableResult]> { let summary = "Floor operator"; let description = [{ @@ -1581,7 +1592,8 @@ def TFL_FloorDivOp : TFL_Op<"floor_div", [ BinaryOpSameElementTypeConstraint, PredOpTrait<"lhs and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>]> { + TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, + NoQuantizableResult]> { let summary = "Floor div operator"; let description = [{ @@ -1606,7 +1618,8 @@ def TFL_FloorModOp : TFL_Op<"floor_mod", [ BinaryOpSameElementTypeConstraint, PredOpTrait<"lhs and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>]> { + TFL_OperandsHaveSameShapesOrBroadcastableShape<[0, 1], 4>, + NoQuantizableResult]> { let summary = "Division reminder"; let description = [{ @@ -1745,7 +1758,9 @@ def TFL_LessOp : TFL_Op<"less", [ let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; } -def TFL_LogicalAndOp : TFL_Op<"logical_and", [NoSideEffect]> { +def TFL_LogicalAndOp : TFL_Op<"logical_and", [ + NoSideEffect, + NoQuantizableResult]> { let summary = "Logical AND operator"; let description = [{ @@ -1778,7 +1793,9 @@ def TFL_LogicalNotOp : TFL_Op<"logical_not", [ let results = (outs TFL_BoolTensor:$output); } -def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { +def TFL_LogicalOrOp : TFL_Op<"logical_or", [ + NoSideEffect, + NoQuantizableResult]> { let summary = "Logical OR operator"; let description = [{ @@ -2005,7 +2022,8 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { def TFL_RoundOp: TFL_Op<"round", [ NoSideEffect, SameOperandsAndResultShape, - 
SameOperandsAndResultType]> { + SameOperandsAndResultType, + NoQuantizableResult]> { let summary = "Round operator"; let description = [{ @@ -2213,7 +2231,8 @@ def TFL_MulOp : TFL_Op<"mul", [ def TFL_NegOp: TFL_Op<"neg", [ NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultType]> { + SameOperandsAndResultType, + NoQuantizableResult]> { let summary = "Negation operator"; let description = [{ @@ -2447,8 +2466,7 @@ def TFL_ReluOp: TFL_Op<"relu", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { + SameOperandsAndResultShape]> { let summary = "Relu operator"; let description = [{ @@ -2463,11 +2481,10 @@ def TFL_ReluOp: TFL_Op<"relu", [ // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the // elementwise-move reordering pattern in the optimize_patterns.td - let builders = [OpBuilder< - "OpBuilder &, OperationState &state, Value input", + let builders = [OpBuilder<"Value input", [{ - state.addOperands({input}); - state.addTypes(input.getType()); + $_state.addOperands({input}); + $_state.addTypes(input.getType()); }]> ]; } @@ -2476,8 +2493,7 @@ def TFL_Relu6Op: TFL_Op<"relu6", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { + SameOperandsAndResultShape]> { let summary = "Relu6 operator"; let description = [{ @@ -2492,11 +2508,10 @@ def TFL_Relu6Op: TFL_Op<"relu6", [ // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the // elementwise-move reordering pattern in the optimize_patterns.td - let builders = [OpBuilder< - "OpBuilder &, OperationState &state, Value input", + let builders = [OpBuilder<"Value input", [{ - state.addOperands({input}); - state.addTypes(input.getType()); + $_state.addOperands({input}); + $_state.addTypes(input.getType()); }]> ]; } @@ -2505,8 +2520,7 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [ PredOpTrait<"x and y must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultsScale]> { + SameOperandsAndResultShape]> { let summary = "Relu1 operator"; let description = [{ @@ -2522,10 +2536,10 @@ def TFL_Relu1Op: TFL_Op<"relu_n1_to_1", [ // non-quantization tablegen patterns. Currently, it is used by the // elementwise-move reordering pattern in the optimize_patterns.td let builders = [OpBuilder< - "OpBuilder &, OperationState &state, Value input", + "Value input", [{ - state.addOperands({input}); - state.addTypes(input.getType()); + $_state.addOperands({input}); + $_state.addTypes(input.getType()); }]> ]; } @@ -2625,7 +2639,8 @@ def TFL_RangeOp: TFL_Op<"range", [ TFL_OperandHasRank<2, 0>, PredOpTrait<"operands and output must have same element type", And<[TCresVTEtIsSameAsOp<0, 0>, TCresVTEtIsSameAsOp<0, 1>, - TCresVTEtIsSameAsOp<0, 2>]>>]> { + TCresVTEtIsSameAsOp<0, 2>]>>, + NoQuantizableResult]> { let summary = "Range operator"; let description = [{ @@ -2704,12 +2719,11 @@ def TFL_SelectOp : TFL_Op<"select", [ TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$output); // TODO(jpienaar): autogenerate this. 
- let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " - "Value condition, Value x, Value y", + let builders = [OpBuilder<"Value condition, Value x, Value y", [{ auto resultType = x.getType(); - result.addOperands({condition, x, y}); - result.types.push_back(resultType); + $_state.addOperands({condition, x, y}); + $_state.types.push_back(resultType); }]>]; let hasOptions = 1; @@ -2740,10 +2754,9 @@ def TFL_SelectV2Op : TFL_Op<"select_v2", [ let results = (outs TFL_TensorOf<[F32, I1, I8, I16, I32, I64, QI8, QUI8, QI16, TFL_Quint8]>:$output); - let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " - "Value cond, Value x, Value y", + let builders = [OpBuilder<"Value cond, Value x, Value y", [{ - BuildSelectV2Op(&builder, result, cond, x, y); + BuildSelectV2Op(&$_builder, $_state, cond, x, y); }]>]; let hasOptions = 1; @@ -2918,11 +2931,10 @@ def TFL_TanhOp: TFL_Op<"tanh", [ // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the // elementwise-move reordering pattern in the optimize_patterns.td - let builders = [OpBuilder< - "OpBuilder &, OperationState &state, Value input", + let builders = [OpBuilder<"Value input", [{ - state.addOperands({input}); - state.addTypes(input.getType()); + $_state.addOperands({input}); + $_state.addTypes(input.getType()); }]> ]; @@ -2992,9 +3004,8 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [ TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$values, TFL_I32Tensor:$indices); - let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " - "Value input, Value k", - [{ BuildTopKOp(&builder, result, input, k); }]>]; + let builders = [OpBuilder<"Value input, Value k", + [{ BuildTopKOp(&$_builder, $_state, input, k); }]>]; let hasOptions = 1; } @@ -3069,7 +3080,8 @@ def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [ TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultType, SameOperandsAndResultShape, - NoSideEffect]> { + NoSideEffect, + NoQuantizableResult]> { let summary = "ZerosLike operator"; let description = [{ @@ -3526,11 +3538,11 @@ def TFL_QConstOp : Op:$output); let builders = [OpBuilder< - "OpBuilder &, OperationState &state, TypeAttr qtype, Attribute value", + "TypeAttr qtype, Attribute value", [{ - state.addAttribute("qtype", qtype); - state.addAttribute("value", value); - state.addTypes(qtype.getValue()); + $_state.addAttribute("qtype", qtype); + $_state.addAttribute("value", value); + $_state.addTypes(qtype.getValue()); }]> ]; } @@ -3555,14 +3567,14 @@ def TFL_SparseQConstOp : Op:$output); let builders = [OpBuilder< - "OpBuilder &, OperationState &state, TypeAttr qtype, " - "Attribute value, SparsityParameterAttr s_param, Attribute compressed_data", + "TypeAttr qtype, Attribute value, SparsityParameterAttr s_param, " + "Attribute compressed_data", [{ - state.addTypes(qtype.getValue()); - state.addAttribute("qtype", qtype); - state.addAttribute("value", value); - state.addAttribute("s_param", s_param); - state.addAttribute("compressed_data", compressed_data); + $_state.addTypes(qtype.getValue()); + $_state.addAttribute("qtype", qtype); + $_state.addAttribute("value", value); + $_state.addAttribute("s_param", s_param); + $_state.addAttribute("compressed_data", compressed_data); }]> ]; } @@ -4243,7 +4255,8 @@ def TFL_SVDFOp : def TFL_SegmentSumOp: TFL_Op<"segment_sum", [ NoSideEffect, PredOpTrait<"input and output must have same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>>]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>, + 
NoQuantizableResult]> { let summary = "SegmentSum operator"; let description = [{ diff --git a/tensorflow/compiler/mlir/lite/python/BUILD b/tensorflow/compiler/mlir/lite/python/BUILD index ceca156e07e..caa5605b00b 100644 --- a/tensorflow/compiler/mlir/lite/python/BUILD +++ b/tensorflow/compiler/mlir/lite/python/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + licenses(["notice"]) # Apache 2.0 package(default_visibility = [":friends"]) @@ -86,6 +88,7 @@ cc_library( "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/toco:types_proto_cc", "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index e786bedc86d..005c5123906 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -90,9 +90,10 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, pass_config.emit_builtin_tflite_ops = emit_builtin_tflite_ops; pass_config.lower_tensor_list_ops = true; - return internal::ConvertMLIRToTFLiteFlatBuffer(toco_flags, std::move(module), - pass_config, result, - /*session=*/llvm::None); + return internal::ConvertMLIRToTFLiteFlatBuffer( + toco_flags, std::move(module), pass_config, /*saved_model_tags=*/{}, + result, + /*session=*/llvm::None); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 529c9ee9238..7bbd3209dfe 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -16,6 +16,7 @@ limitations under the License. #include +#include "absl/types/span.h" #include "llvm/ADT/None.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/ToolOutputFile.h" @@ -118,9 +119,9 @@ Status HandleInputOutputArraysWithModule(const toco::ModelFlags& model_flags, return Status::OK(); } -Status ConvertSavedModelToTFLiteFlatBuffer( - const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, - string* result) { +Status ConvertSavedModelToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, + const toco::TocoFlags& toco_flags, + string* result) { mlir::MLIRContext context; mlir::TFL::QuantizationSpecs quant_specs; @@ -156,9 +157,12 @@ Status ConvertSavedModelToTFLiteFlatBuffer( tensorflow::GraphImportConfig specs; specs.upgrade_legacy = true; + std::vector custom_opdefs(toco_flags.custom_opdefs().begin(), + toco_flags.custom_opdefs().end()); TF_ASSIGN_OR_RETURN(auto module, ImportSavedModel(model_flags.saved_model_dir(), model_flags.saved_model_version(), tags, + absl::MakeSpan(custom_opdefs), exported_names, specs, &context)); if (!model_flags.input_arrays().empty() || @@ -173,7 +177,7 @@ Status ConvertSavedModelToTFLiteFlatBuffer( // TODO(b/153507667): Pass the session object when importing logic is removed. 
auto status = internal::ConvertMLIRToTFLiteFlatBuffer( - toco_flags, std::move(module), pass_config, result, + toco_flags, std::move(module), pass_config, tags, result, /*session=*/llvm::None); return status; } diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index a4e58123e05..ae2454dcf1e 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -273,7 +273,8 @@ Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { Status ConvertMLIRToTFLiteFlatBuffer( const toco::TocoFlags& toco_flags, mlir::OwningModuleRef module, - const mlir::TFL::PassConfig& pass_config, string* result, + const mlir::TFL::PassConfig& pass_config, + const std::unordered_set& saved_model_tags, string* result, llvm::Optional session) { bool emit_builtin_tflite_ops = !toco_flags.force_select_tf_ops(); bool emit_select_tf_ops = toco_flags.enable_select_tf_ops(); @@ -297,8 +298,8 @@ Status ConvertMLIRToTFLiteFlatBuffer( auto status = ConvertTFExecutorToTFLOrFlatbuffer( module.get(), /*export_to_mlir=*/false, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops, pass_config.quant_specs, result, - &pm); + emit_select_tf_ops, emit_custom_ops, pass_config.quant_specs, + saved_model_tags, result, &pm); if (toco_flags.has_dump_graphviz_dir()) { TF_RETURN_IF_ERROR(DumpOpGraphToFile( // rename once we enable the new converter feature flag. diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index d79bdc6df67..d4f9e739121 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_TF_TFL_FLATBUFFER_HELPERS_H_ #include +#include #include #include "llvm/ADT/Optional.h" @@ -48,7 +49,8 @@ Status PopulateQuantizationSpecs( // This will also run relevant passes as well. Status ConvertMLIRToTFLiteFlatBuffer( const toco::TocoFlags& toco_flags, mlir::OwningModuleRef module, - const mlir::TFL::PassConfig& pass_config, string* result, + const mlir::TFL::PassConfig& pass_config, + const std::unordered_set& saved_model_tags, string* result, llvm::Optional session); // Give a warning for any unused flags that have been specified. 
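Editor's note: taken together, the flatbuffer_export and converter-helper changes above thread the SavedModel tag set from ConvertSavedModelToTFLiteFlatBuffer down into the new Translator, where one tag (the first element of the unordered set) becomes the SignatureDef key. The sketch below shows how a caller might use the new public overload and read the signature back out of the serialized model. It is a minimal sketch, not part of this patch: the helper name ExportWithSignature, the "serve" tag value, and the flag values are illustrative, and the read-back accessors (signature_defs(), method_name(), key(), ...) are assumed to follow the usual FlatBuffers naming for the fields populated by SignatureDefBuilder/TensorMapBuilder above.

#include <string>
#include <unordered_set>

#include "mlir/IR/Module.h"  // from @llvm-project
#include "tensorflow/compiler/mlir/lite/flatbuffer_export.h"
#include "tensorflow/lite/schema/schema_generated.h"

// Hypothetical caller; `module` is assumed to be a TFLite-dialect ModuleOp
// produced by the SavedModel import path elsewhere in this patch.
bool ExportWithSignature(mlir::ModuleOp module, std::string* out) {
  // With a single tag, that tag becomes the SignatureDef key
  // (see BuildSignaturedef / TranslateInternal above).
  const std::unordered_set<std::string> saved_model_tags = {"serve"};

  // Like the other overloads, this returns true on failure, false on success.
  if (tflite::MlirToFlatBufferTranslateFunction(
          module, out, /*emit_builtin_tflite_ops=*/true,
          /*emit_select_tf_ops=*/false, /*emit_custom_ops=*/false,
          saved_model_tags)) {
    return true;
  }

  // Read the SignatureDef back from the serialized flatbuffer. Accessor names
  // are assumptions mirroring the builder fields populated above.
  const tflite::Model* model = tflite::GetModel(out->data());
  if (const auto* sigs = model->signature_defs()) {
    for (const tflite::SignatureDef* sig : *sigs) {
      (void)sig;  // e.g. sig->key()->str() == "serve", sig->inputs(), ...
    }
  }
  return false;
}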
diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index aec0d8da34f..7e7020997ef 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -1,3 +1,9 @@ +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_native_cc_binary") load( "//tensorflow/core/platform:build_config.bzl", @@ -41,6 +47,7 @@ filegroup( gentbl( name = "quantization_interfaces_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-interface-decls", diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index 38c7ad86e05..905426ab952 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 6e356acbbdf..eb9843f6e4a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -106,9 +106,9 @@ struct ConvertStatsToQDQs : public OpRewritePattern { mins.push_back(FloatAttr::getValueAsDouble(*it++)); maxs.push_back(FloatAttr::getValueAsDouble(*it)); } - quant_type = quant::fakeQuantAttrsToType( - op.getLoc(), num_bits, op.axis()->getSExtValue(), mins, maxs, - narrow_range, expressed, is_signed); + quant_type = + quant::fakeQuantAttrsToType(op.getLoc(), num_bits, *op.axis(), mins, + maxs, narrow_range, expressed, is_signed); } else if (auto stats = op.layerStats().dyn_cast()) { double rmin = FloatAttr::getValueAsDouble(stats.getValue({0})); double rmax = FloatAttr::getValueAsDouble(stats.getValue({1})); @@ -119,7 +119,7 @@ struct ConvertStatsToQDQs : public OpRewritePattern { return failure(); } - rewriter.setInsertionPointAfter(op); + rewriter.setInsertionPointAfter(op.getOperation()); Type result_type = quant_type.castFromExpressedType(op.getType()); auto q = rewriter.create(op.getLoc(), result_type, op.arg()); auto dq = rewriter.create(op.getLoc(), op.getType(), q); diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD index 38ea69c51d6..76fd75e18ea 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = [ ":friends", diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD index 4faa8d2efe8..d7d01eb59a3 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git 
a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc index 0826b3265f6..b043834188c 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc @@ -106,9 +106,8 @@ struct InsertQuantOpsAfterTFFakeQuantOp } // Use the min/max from the operands and the num_bits and narrow_range // attribute to create the quantization parameter for the new quantize op. - rewriter.setInsertionPointAfter(tf_op); - IntegerAttr num_bits = - rewriter.getI64IntegerAttr(tf_op.num_bits().getSExtValue()); + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.num_bits()); BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.narrow_range()); Type res_type = tf_op.getType(); TypeAttr qtype = quant::GetQuantizedTypeAttr( diff --git a/tensorflow/compiler/mlir/lite/quantization/tests/BUILD b/tensorflow/compiler/mlir/lite/quantization/tests/BUILD index 4faa8d2efe8..d7d01eb59a3 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc index 208fb4c8a56..fc56ad05535 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc @@ -55,7 +55,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { for (const auto t : op.getTraits()) { if (auto opTrait = llvm::dyn_cast(&t)) { auto trait = opTrait->getTrait(); - if (!trait.consume_front("OpTrait::quant::")) continue; + if (!trait.consume_front("::mlir::OpTrait::quant::")) continue; OUT(2) << "if (auto tfl = llvm::dyn_cast<" << op.getQualCppClassName() << ">(op)) {\n"; @@ -65,7 +65,7 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { OUT(4) << "for (int i = 0, e = op->getNumResults(); i != e; ++i)\n"; OUT(6) << "spec->restricted_output_params[std::make_pair(" << matches[1] << ", " << matches[2] - << ")].push_back(tfl.OpTrait::quant::" << trait << "<" + << ")].push_back(tfl.::mlir::OpTrait::quant::" << trait << "<" << op.getQualCppClassName() << ">::GetResultQuantizedType(i));\n"; matches.clear(); diff --git a/tensorflow/compiler/mlir/lite/sparsity/BUILD b/tensorflow/compiler/mlir/lite/sparsity/BUILD index 9ced3220c9b..7f9f06455cb 100644 --- a/tensorflow/compiler/mlir/lite/sparsity/BUILD +++ b/tensorflow/compiler/mlir/lite/sparsity/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = [ ":friends", diff --git a/tensorflow/compiler/mlir/lite/tests/BUILD b/tensorflow/compiler/mlir/lite/tests/BUILD index 58d5afb5864..d34fb991b71 100644 --- a/tensorflow/compiler/mlir/lite/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git a/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD b/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD index 
1f746c528d6..6ea272745bd 100644 --- a/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir b/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir index 6b3c5b04aa4..d92bdc3f460 100644 --- a/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir +++ b/tensorflow/compiler/mlir/lite/tests/dilated-conv.mlir @@ -2,7 +2,7 @@ func @testDilatedConv(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.Conv2D"(%0, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> @@ -29,8 +29,8 @@ func @testDilatedConvWithNonConstantPadAndCrops(%arg0: tensor<1x128x128x3xf32>, func @testDilatedConvWithNonZeroBasePadding(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> - %cst_1 = constant dense<1> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> + %cst_1 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.Conv2D"(%0, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> @@ -44,10 +44,11 @@ func @testDilatedConvWithNonZeroBasePadding(%arg0: tensor<1x128x128x3xf32>, %arg func @testDilatedConvWithNonTrivialDilations(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> + %cst_1 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.Conv2D"(%0, %arg1) {padding = "VALID", dilations = [1, 2, 2, 1], strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> - %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> return %2 : tensor<1x128x128x8xf32> // CHECK-LABEL: testDilatedConvWithNonTrivialDilations @@ -59,80 +60,85 @@ func @testDilatedConvWithNonTrivialDilations(%arg0: tensor<1x128x128x3xf32>, %ar func @testDilatedDepthWiseConv(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : 
tensor<2x2xi32> + %cst_1 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.DepthwiseConv2dNative"(%0, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> - %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> return %2 : tensor<1x128x128x8xf32> // CHECK-LABEL: testDilatedDepthWiseConv // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>) - // CHECK-NEXT: [[RESULT:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[RESULT:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } func @testDilatedConvWithPad(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x3x8xf32>, %arg3: tensor<8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> + %cst_1 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.Conv2D"(%0, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> %2 = "tf.Pad"(%1, %arg1) : (tensor<4x64x64x8xf32>, tensor<2x2xi32>) -> tensor<4x64x64x8xf32> - %3 = "tf.BatchToSpaceND"(%2, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + %3 = "tf.BatchToSpaceND"(%2, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> %4 = "tf.BiasAdd"(%3, %arg3) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> return %4 : tensor<1x128x128x8xf32> // CHECK-LABEL: testDilatedConvWithPad // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } func @testDilatedDepthWiseConvWithPad(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x3x8xf32>, %arg3: tensor<8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> + %cst_1 
= constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.DepthwiseConv2dNative"(%0, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> %2 = "tf.Pad"(%1, %arg1) : (tensor<4x64x64x8xf32>, tensor<2x2xi32>) -> tensor<4x64x64x8xf32> - %3 = "tf.BatchToSpaceND"(%2, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + %3 = "tf.BatchToSpaceND"(%2, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> %4 = "tf.BiasAdd"(%3, %arg3) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> return %4 : tensor<1x128x128x8xf32> // CHECK-LABEL: testDilatedDepthWiseConvWithPad // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } func @testDilatedConvWithBiasAdd(%arg0: tensor<1x128x128x3xf32>, %arg1: tensor<5x5x3x8xf32>, %arg2: tensor<8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> + %cst_1 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.Conv2D"(%0, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> - %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> %3 = "tf.BiasAdd"(%2, %arg2) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> return %3 : tensor<1x128x128x8xf32> // CHECK-LABEL: testDilatedConvWithBiasAdd // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } func @testDilatedDepthWiseConvWithBiasAdd(%arg0: tensor<1x128x128x3xf32>, %arg1: 
tensor<5x5x3x8xf32>, %arg2: tensor<8xf32>) -> tensor<1x128x128x8xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> - %cst_0 = constant dense<2> : tensor<2x2xi32> + %cst_0 = constant dense<4> : tensor<2x2xi32> + %cst_1 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_0) : (tensor<1x128x128x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68x3xf32> %1 = "tf.DepthwiseConv2dNative"(%0, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x3xf32>, tensor<5x5x3x8xf32>) -> tensor<4x64x64x8xf32> - %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_0) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> + %2 = "tf.BatchToSpaceND"(%1, %cst, %cst_1) : (tensor<4x64x64x8xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x8xf32> %3 = "tf.BiasAdd"(%2, %arg2) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> return %3 : tensor<1x128x128x8xf32> // CHECK-LABEL: testDilatedDepthWiseConvWithBiasAdd // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128x3xf32>, [[FILTER:%.*]]: tensor<5x5x3x8xf32>, [[BIAS:%.*]]: tensor<8xf32>) - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[INPUT]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x3xf32>, tensor<5x5x3x8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[CONV]], [[BIAS]]) : (tensor<1x128x128x8xf32>, tensor<8xf32>) -> tensor<1x128x128x8xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128x8xf32> } @@ -140,12 +146,13 @@ func @testDilatedDepthWiseConvWithBiasAdd(%arg0: tensor<1x128x128x3xf32>, %arg1: func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: tensor<5x5x1x1xf32>, %arg2: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.Conv2D"(%1, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> %3 = "tf.Squeeze"(%2) {squeeze_dims = [3]} : (tensor<4x64x64x1xf32>) -> tensor<4x64x64xf32> - %4 = "tf.BatchToSpaceND"(%3, %cst, %cst_1) : (tensor<4x64x64xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> + %4 = "tf.BatchToSpaceND"(%3, %cst, %cst_2) : (tensor<4x64x64xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> %5 = "tf.BiasAdd"(%4, %arg2) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> return %5 : tensor<1x128x128xf32> @@ -153,7 +160,7 @@ func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: ten // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // 
CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> @@ -162,12 +169,13 @@ func @testDilatedConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: ten func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, %arg1: tensor<5x5x1x1xf32>, %arg2: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.DepthwiseConv2dNative"(%1, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> %3 = "tf.Squeeze"(%2) {squeeze_dims = [3]} : (tensor<4x64x64x1xf32>) -> tensor<4x64x64xf32> - %4 = "tf.BatchToSpaceND"(%3, %cst, %cst_1) : (tensor<4x64x64xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> + %4 = "tf.BatchToSpaceND"(%3, %cst, %cst_2) : (tensor<4x64x64xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> %5 = "tf.BiasAdd"(%4, %arg2) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> return %5 : tensor<1x128x128xf32> @@ -175,7 +183,7 @@ func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, % // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> @@ -184,20 +192,21 @@ func @testDilatedDepthWiseConvWithExpandSqueeze1(%arg0: tensor<1x128x128xf32>, % func @testDilatedConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: tensor<5x5x1x1xf32>, %arg2: tensor) -> 
tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x?x?xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x?x?xf32>, tensor) -> tensor<4x?x?x1xf32> %2 = "tf.Conv2D"(%1, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x?x?x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x?x?x1xf32> %3 = "tf.Squeeze"(%2) {squeeze_dims = [3]} : (tensor<4x?x?x1xf32>) -> tensor<4x?x?xf32> %4 = "tf.BiasAdd"(%3, %arg2) : (tensor<4x?x?xf32>, tensor) -> tensor<4x?x?xf32> - %5 = "tf.BatchToSpaceND"(%4, %cst, %cst_1) : (tensor<4x?x?xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> + %5 = "tf.BatchToSpaceND"(%4, %cst, %cst_2) : (tensor<4x?x?xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> return %5 : tensor<1x128x128xf32> // CHECK-LABEL: testDilatedConvWithExpandSqueeze2 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor) // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.Conv2D"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> @@ -206,20 +215,21 @@ func @testDilatedConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: ten func @testDilatedDepthWiseConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, %arg1: tensor<5x5x1x1xf32>, %arg2: tensor) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x?x?xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x?x?xf32>, tensor) -> tensor<4x?x?x1xf32> %2 = "tf.DepthwiseConv2dNative"(%1, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x?x?x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x?x?x1xf32> %3 = "tf.Squeeze"(%2) {squeeze_dims = [3]} : (tensor<4x?x?x1xf32>) -> tensor<4x?x?xf32> %4 = "tf.BiasAdd"(%3, %arg2) : (tensor<4x?x?xf32>, tensor) -> tensor<4x?x?xf32> - %5 = "tf.BatchToSpaceND"(%4, %cst, %cst_1) : (tensor<4x?x?xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> + %5 = "tf.BatchToSpaceND"(%4, %cst, %cst_2) : (tensor<4x?x?xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> return %5 : tensor<1x128x128xf32> // CHECK-LABEL: 
testDilatedDepthWiseConvWithExpandSqueeze2 // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor) // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> @@ -228,7 +238,8 @@ func @testDilatedDepthWiseConvWithExpandSqueeze2(%arg0: tensor<1x128x128xf32>, % func @testDilatedConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.Conv2D"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> @@ -251,13 +262,14 @@ func @testDilatedConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: ten func @testDilatedDepthWiseConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, %arg1: tensor<2x2xi32>, %arg2: tensor<5x5x1x1xf32>, %arg3: tensor<128xf32>) -> tensor<1x128x128xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.DepthwiseConv2dNative"(%1, %arg2) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> %3 = "tf.Squeeze"(%2) {squeeze_dims = [3]} : (tensor<4x64x64x1xf32>) -> tensor<4x64x64xf32> %4 = "tf.Pad"(%3, %arg1) : (tensor<4x64x64xf32>, tensor<2x2xi32>) -> tensor<4x64x64xf32> - %5 = "tf.BatchToSpaceND"(%4, %cst, %cst_1) : (tensor<4x64x64xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> + %5 = "tf.BatchToSpaceND"(%4, %cst, %cst_2) : (tensor<4x64x64xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128xf32> %6 = "tf.BiasAdd"(%5, %arg3) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> return %6 : tensor<1x128x128xf32> @@ -265,7 +277,7 @@ func 
@testDilatedDepthWiseConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, % // CHECK-SAME: ([[INPUT:%.*]]: tensor<1x128x128xf32>, [[PADDING:%.*]]: tensor<2x2xi32>, [[FILTER:%.*]]: tensor<5x5x1x1xf32>, [[BIAS:%.*]]: tensor<128xf32>) // CHECK-NEXT: [[AXIS:%.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor // CHECK-NEXT: [[EXPAND:%.*]] = "tf.ExpandDims"([[INPUT]], [[AXIS]]) : (tensor<1x128x128xf32>, tensor) -> tensor<1x128x128x1xf32> - // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> + // CHECK-NEXT: [[CONV:%.*]] = "tf.DepthwiseConv2dNative"([[EXPAND]], [[FILTER]]) {dilations = [1, 2, 2, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x128x128x1xf32>, tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> // CHECK-NEXT: [[SQUEEZE:%.*]] = "tf.Squeeze"([[CONV]]) {squeeze_dims = [3]} : (tensor<1x128x128x1xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: [[RESULT:%.*]] = "tf.BiasAdd"([[SQUEEZE]], [[BIAS]]) : (tensor<1x128x128xf32>, tensor<128xf32>) -> tensor<1x128x128xf32> // CHECK-NEXT: return [[RESULT]] : tensor<1x128x128xf32> @@ -274,12 +286,13 @@ func @testDilatedDepthWiseConvWithExpandSqueeze3(%arg0: tensor<1x128x128xf32>, % func @testDilatedConvWithDifferentExpandSqueezeAxis(%arg0: tensor<1x128x128xf32>, %arg1: tensor<5x5x1x1xf32>) -> tensor<1x128x128x1xf32> { %cst = constant dense<[2, 2]> : tensor<2xi32> %cst_0 = "tf.Const"() { value = dense<3> : tensor } : () -> tensor - %cst_1 = constant dense<2> : tensor<2x2xi32> + %cst_1 = constant dense<4> : tensor<2x2xi32> + %cst_2 = constant dense<0> : tensor<2x2xi32> %0 = "tf.SpaceToBatchND"(%arg0, %cst, %cst_1) : (tensor<1x128x128xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<4x68x68xf32> %1 = "tf.ExpandDims"(%0, %cst_0) : (tensor<4x68x68xf32>, tensor) -> tensor<4x68x68x1xf32> %2 = "tf.Conv2D"(%1, %arg1) {padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<4x68x68x1xf32>, tensor<5x5x1x1xf32>) -> tensor<4x64x64x1xf32> %3 = "tf.Squeeze"(%2) {squeeze_dims = [2]} : (tensor<4x64x64x1xf32>) -> tensor<4x64x64x1xf32> - %4 = "tf.BatchToSpaceND"(%3, %cst, %cst_1) : (tensor<4x64x64x1xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x1xf32> + %4 = "tf.BatchToSpaceND"(%3, %cst, %cst_2) : (tensor<4x64x64x1xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<1x128x128x1xf32> return %4 : tensor<1x128x128x1xf32> // CHECK-LABEL: testDilatedConvWithDifferentExpandSqueezeAxis diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD index 25bd761f99e..b0b794034ea 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt index facd6005e7d..9f8d82eb184 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/add.pbtxt @@ -96,5 +96,6 @@ versions { # CHECK-NEXT: metadata: [ { # CHECK-NEXT: name: "min_runtime_version", # CHECK-NEXT: buffer: 4 -# CHECK-NEXT: } ] +# CHECK-NEXT: } ], +# CHECK-NEXT: signature_defs: [ ] # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt 
b/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt new file mode 100644 index 00000000000..5f498a404a9 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt @@ -0,0 +1,232 @@ +# RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,8,8,2 -tf-input-data-types=DT_FLOAT -tf-output-arrays=output_0 -print-function-result-mapping %s -o - 2>&1 | FileCheck %s + +node { + name: "input" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 1 + } + dim { + size: 8 + } + dim { + size: 8 + } + dim { + size: 2 + } + } + } + } +} +node { + name: "conv_net_2d/conv_2d_0/w" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 3 + } + dim { + size: 3 + } + dim { + size: 2 + } + dim { + size: 2 + } + } + tensor_content: ";;\177<5\241i\275\312f\211>#\346j>\033W\325\275\253>\210=Vr\r\276\304\222\313\276\374\346\214>\016e\211>)\253\000>\3241\337\275\235g-\276*(\216\276\326#\367\274\023\213\300\276\227\031\206>PUF=\253\330\263<\337IL\276\334\320\215>\377\306v\276\372C\302\273baM>H\314\270<2\221\352=J\026{\276\221\243\245\276?\314\240=UW2\2755\207\253\274\256\207\333\273\335\372\227>\246\232;\276%\r\374" + } + } + } +} +node { + name: "conv_net_2d/conv_2d_0/w/read" + op: "Identity" + input: "conv_net_2d/conv_2d_0/w" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@conv_net_2d/conv_2d_0/w" + } + } + } +} +node { + name: "conv_net_2d_1/conv_2d_0/convolution" + op: "Conv2D" + input: "input" + input: "conv_net_2d/conv_2d_0/w/read" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NCHW" + } + } + attr { + key: "dilations" + value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } + attr { + key: "explicit_paddings" + value { + list { + } + } + } + attr { + key: "padding" + value { + s: "SAME" + } + } + attr { + key: "strides" + value { + list { + i: 1 + i: 1 + i: 1 + i: 1 + } + } + } + attr { + key: "use_cudnn_on_gpu" + value { + b: true + } + } +} +node { + name: "conv_net_2d/conv_2d_0/b" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + } + tensor_content: "\315\314\314=\315\314\314=" + } + } + } +} +node { + name: "conv_net_2d/conv_2d_0/b/read" + op: "Identity" + input: "conv_net_2d/conv_2d_0/b" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@conv_net_2d/conv_2d_0/b" + } + } + } +} +node { + name: "conv_net_2d_1/conv_2d_0/BiasAdd" + op: "BiasAdd" + input: "conv_net_2d_1/conv_2d_0/convolution" + input: "conv_net_2d/conv_2d_0/b/read" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "conv_net_2d_1/Relu" + op: "Relu" + input: "conv_net_2d_1/conv_2d_0/BiasAdd" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "output_0" + op: "Identity" + input: "conv_net_2d_1/Relu" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +library { +} + +# CHECK: 'main' inputs: +# CHECK-NEXT: name: 'input' +# CHECK-NEXT: 'main' outputs: +# CHECK-NEXT: name: 'output_0' diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt 
b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt index adfcd93b4bc..117edd02beb 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/fake_quant_per_channel.pbtxt @@ -459,11 +459,13 @@ node { # CHECK-LABEL: { # CHECK: version: 3, # CHECK: operator_codes: [ { -# CHECK: builtin_code: CONV_2D, -# CHECK: version: 3 +# CHECK: deprecated_builtin_code: 3, +# CHECK: version: 3, +# CHECK: builtin_code: CONV_2D # CHECK: }, { -# CHECK: builtin_code: RESHAPE, +# CHECK: deprecated_builtin_code: 22, # CHECK: version: 1 +# CHECK: builtin_code: RESHAPE # CHECK: } ], # CHECK: subgraphs: [ { # CHECK: tensors: [ { diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD index e21b268279c..41fbbbcb9c5 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary") diff --git a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir index 138614d81e6..d56c2cc221a 100644 --- a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir +++ b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir @@ -3442,8 +3442,8 @@ func @sgnn_projection(%arg0: tensor {tf._user_specified_name = "va %0 = "tf.Const"() {value = dense<[[1902835825], [-1475704015], [473120514], [1254202069], [1558833093], [1756181982], [1906603252], [-1034142694], [542842690], [535515822]]> : tensor<10x1xi64>} : () -> tensor<10x1xi64> %1 = "tf.StringToHashBucketFast"(%arg0) {device = "", num_buckets = 2147483647 : i64} : (tensor) -> tensor %2 = "tf.Sgnn"(%1, %0) {device = ""} : (tensor, tensor<10x1xi64>) -> tensor<10x?xf64> - %3 = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> - %4 = "tf.Reshape"(%2, %3) : (tensor<10x?xf64>, tensor<1xi64>) -> tensor + %3 = "tf.Const"() {value = dense<[-1, 10]> : tensor<2xi64>} : () -> tensor<2xi64> + %4 = "tf.Reshape"(%2, %3) : (tensor<10x?xf64>, tensor<2xi64>) -> tensor return %4 : tensor } diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index d02e4e705f4..4de278ee324 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tfl-legalize-tf | FileCheck %s +// RUN: tf-opt %s -tfl-legalize-tf --cse | FileCheck %s func @add(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> @@ -196,7 +196,6 @@ func @shape(%arg0: tensor) -> tensor<2xi32> { // CHECK-LABEL: shape // CHECK: "tfl.shape"(%arg0) : (tensor) -> tensor<2xi32> -// CHECK: %1 = "tfl.shape"(%arg0) : (tensor) -> tensor<2xi32> } func @fill(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { @@ -719,9 +718,8 @@ func @matrix_diag_v2_no_match(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK-SAME: [[VAL_0:%.*]]: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: [[VAL_1:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_2:%.*]] = constant dense<-1> : tensor<1xi32> -// CHECK: [[VAL_5:%.*]] = constant dense<-1> : tensor<1xi32> // CHECK: [[VAL_3:%.*]] = constant dense<0> : tensor<2xi32> -// 
CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV2"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_5]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> +// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV2"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> // CHECK: return [[VAL_4]] : tensor<8x16x16xf32> } @@ -753,9 +751,8 @@ func @matrix_diag_v3_no_match(%arg0: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK-SAME: [[VAL_0:%.*]]: tensor<8x16xf32>) -> tensor<8x16x16xf32> { // CHECK: [[VAL_1:%.*]] = constant dense<1> : tensor<1xi32> // CHECK: [[VAL_2:%.*]] = constant dense<-1> : tensor<1xi32> -// CHECK: [[VAL_5:%.*]] = constant dense<-1> : tensor<1xi32> // CHECK: [[VAL_3:%.*]] = constant dense<0> : tensor<2xi32> -// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV3"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_5]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> +// CHECK: [[VAL_4:%.*]] = "tf.MatrixDiagV3"([[VAL_0]], [[VAL_1]], [[VAL_2]], [[VAL_2]], [[VAL_3]]) : (tensor<8x16xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<2xi32>) -> tensor<8x16x16xf32> // CHECK: return [[VAL_4]] : tensor<8x16x16xf32> } @@ -1006,11 +1003,11 @@ func @batch_to_space_nd_unsupported(%arg0: tensor, %arg1: tensor< // CHECK: "tf.BatchToSpaceND" } -func @space_to_batch_nd(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor<2x2xi32>) -> tensor { - %0 = "tf.SpaceToBatchND"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor - return %0 : tensor +func @space_to_batch_nd(%arg0: tensor<1x4x4x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor<2x2xi32>) -> tensor<*xf32> { + %0 = "tf.SpaceToBatchND"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<*xf32> + return %0 : tensor<*xf32> // CHECK-LABEL: space_to_batch_nd - // CHECK: "tfl.space_to_batch_nd"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor + // CHECK: "tfl.space_to_batch_nd"(%arg0, %arg1, %arg2) : (tensor<1x4x4x3xf32>, tensor<2xi32>, tensor<2x2xi32>) -> tensor<*xf32> } func @split(%arg0: tensor, %arg1: tensor<1x4x3x3xf32>) -> tensor<1x4x3xf32> { @@ -1029,32 +1026,75 @@ func @splitv(%arg0: tensor<1x4x3x3xf32>, %arg1: tensor<2xi32>, %arg2: tensor, tensor<2xi32>, tensor) -> (tensor<1x4x2x3xf32>, tensor<1x4x1x3xf32>) } -func @matmul_transposed(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { +func @matmul(%arg0: tensor<40x37xf32>, %arg1: tensor<37x40xf32>) -> tensor<40x40xf32> { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = false, transpose_b = false} : +(tensor<40x37xf32>, tensor<37x40xf32>) -> tensor<40x40xf32> + return %0 : tensor<40x40xf32> +// CHECK-LABEL: matmul +// CHECK: %[[CST:.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[ARG:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[CST_0:.*]] = constant unit +// CHECK: "tfl.fully_connected"(%arg0, %[[ARG]], %[[CST_0]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> +} + +func @matmul_transposed_a(%arg0: tensor<37x40xf32>, %arg1: tensor<37x40xf32>) -> tensor<40x40xf32> { + %0 = "tf.MatMul"(%arg0, %arg1) {T = 
"tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = true, transpose_b = false} : +(tensor<37x40xf32>, tensor<37x40xf32>) -> tensor<40x40xf32> + return %0 : tensor<40x40xf32> +// CHECK-LABEL: matmul_transposed_a +// CHECK: %[[CST_0:.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[ARG_0:.*]] = "tfl.transpose"(%arg0, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[ARG_1:.*]] = "tfl.transpose"(%arg1, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[CST_2:.*]] = constant unit +// CHECK: "tfl.fully_connected"(%[[ARG_0]], %[[ARG_1]], %[[CST_2]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> +} + +func @matmul_transposed_b(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = false, transpose_b = true} : (tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> return %0 : tensor<40x40xf32> -// CHECK-LABEL: matmul_transposed +// CHECK-LABEL: matmul_transposed_b // CHECK: "tfl.fully_connected"(%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> } -func @concatv2With3Tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { +func @matmul_transposed_ab(%arg0: tensor<37x40xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "MatMul", transpose_a = true, transpose_b = true} : +(tensor<37x40xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> + return %0 : tensor<40x40xf32> +// CHECK-LABEL: matmul_transposed_ab +// CHECK: %[[CST_0:.*]] = constant dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[ARG_0:.*]] = "tfl.transpose"(%arg0, %[[CST_0]]) : (tensor<37x40xf32>, tensor<2xi32>) -> tensor<40x37xf32> +// CHECK: %[[CST_1:.*]] = constant unit +// CHECK: "tfl.fully_connected"(%[[ARG_0]], %arg1, %[[CST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<40x37xf32>, tensor<40x37xf32>, none) -> tensor<40x40xf32> +} + +func @concat_v2_with_3_tensors(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>, tensor) -> tensor<2x3xi32> return %1 : tensor<2x3xi32> -// CHECK-LABEL: concatv2With3Tensors +// CHECK-LABEL: concat_v2_with_3_tensors // CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32> } -func @concatv2I64Axis(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { +func @concat_v2_i64_axis(%arg0: tensor<2x1xi32>, %arg1: tensor<2x1xi32>, %arg2: tensor<2x1xi32>) -> tensor<2x3xi32> { %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor %1 = "tf.ConcatV2"(%arg0, %arg1, %arg2, %0) : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>, tensor) -> tensor<2x3xi32> return %1 : tensor<2x3xi32> -// CHECK-LABEL: concatv2I64Axis +// CHECK-LABEL: concat_v2_i64_axis // CHECK: "tfl.concatenation"(%arg0, %arg1, %arg2) 
{axis = -1 : i32, fused_activation_function = "NONE"} : (tensor<2x1xi32>, tensor<2x1xi32>, tensor<2x1xi32>) -> tensor<2x3xi32> } +func @concat_v2_with_bool_type(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.Const"() { value = dense<-1> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %0) : (tensor, tensor, tensor) -> tensor + return %1 : tensor + +// CHECK-LABEL: concat_v2_with_bool_type +// CHECK: "tfl.concatenation"(%arg0, %arg1) {axis = -1 : i32, fused_activation_function = "NONE"} : (tensor, tensor) -> tensor +} + func @resize_with_bilinear(%arg0: tensor<1x100x100x3xf32>, %arg1: tensor<4xi32>) -> tensor { %0 = "tf.ResizeBilinear"(%arg0, %arg1) {align_corners = true} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor @@ -1324,10 +1364,7 @@ func @conv2d_backprop_input(%arg0: tensor<4xi32>, %arg1: tensor<3x3x1x32xf32>, % // CHECK: %[[ARG0:.*]] = "tfl.transpose"(%arg1, %[[CST]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> // CHECK: %[[CST_0:.*]] = constant unit // CHECK: %[[ARG1:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> - // CHECK: %[[CST_1:.*]] = constant dense<[2, 0, 1, 3]> : tensor<4xi32> - // CHECK: %[[ARG2:.*]] = "tfl.transpose"(%arg1, %[[CST_1]]) : (tensor<3x3x1x32xf32>, tensor<4xi32>) -> tensor<1x3x3x32xf32> - // CHECK: %[[CST_2:.*]] = constant unit - // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG2]], %arg2, %[[CST_2]]) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> + // CHECK: %[[ARG3:.*]] = "tfl.transpose_conv"(%arg0, %[[ARG0]], %arg2, %[[CST_0]]) {padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<1x3x3x32xf32>, tensor<15x14x14x32xf32>, none) -> tensor<15x28x28x1xf32> // CHECK: %[[RESULT:.*]] = tfl.add %[[ARG1]], %[[ARG3]] {fused_activation_function = "NONE"} : tensor<15x28x28x1xf32> // CHECK: return %[[RESULT]] : tensor<15x28x28x1xf32> } @@ -1533,3 +1570,27 @@ func @add_with_int32_5d_inputs(%arg0: tensor<1x1x1x3x1xi32>, %arg1 : tensor<1x1x // CHECK-LABEL: add_with_int32_5d_inputs // CHECK: "tf.Add"(%arg0, %arg1) } + +func @tranpose_int32_perm(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { + %cst = "tf.Const"() { value = dense<[1, 0]> : tensor<2xi32> } : () -> tensor<2xi32> + %0 = "tf.Transpose"(%arg0, %cst): (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> + // CHECK-LABEL: tranpose_int32_perm + // CHECK: "tfl.transpose" +} + +func @tranpose_int64_perm(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { + %cst = "tf.Const"() { value = dense<[1, 0]> : tensor<2xi64> } : () -> tensor<2xi64> + %0 = "tf.Transpose"(%arg0, %cst): (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> + // CHECK-LABEL: tranpose_int64_perm + // CHECK: "tfl.transpose" +} + +func @tranpose_arg(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>) -> tensor<3x2xf32> { + %0 = "tf.Transpose"(%arg0, %arg1): (tensor<2x3xf32>, tensor<2xi32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> + // CHECK-LABEL: tranpose_arg + // CHECK: "tfl.transpose" +} + diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD b/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD index 745d9eacf15..35e0a376384 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD +++ 
b/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD @@ -6,6 +6,7 @@ # runtime behavior, but the majority of runtime tests should be TFLite side and # invariants only verified in the converter/compiler. +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD index c0ae9570225..e77b8d8fbd5 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir index 8389045fc57..b2f684e6be8 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/basic_lstm.mlir @@ -4,8 +4,9 @@ func @main(tensor<1x384xf32>, tensor<1x96xf32>, tensor<384x480xf32>, tensor<384x // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: LSTM, +// CHECK-NEXT: deprecated_builtin_code: 16, // CHECK-NEXT: version: 2 +// CHECK-NEXT: builtin_code: LSTM // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -115,6 +116,7 @@ func @main(tensor<1x384xf32>, tensor<1x96xf32>, tensor<384x480xf32>, tensor<384x // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 10 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} ^bb0(%arg0: tensor<1x384xf32>, %arg1: tensor<1x96xf32>, %arg2: tensor<384x480xf32>, %arg3: tensor<384xf32>, %arg4: tensor<1x96xf32>): diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir index 2d906d6901e..a067826f86d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/custom_op_with_tflite_op.mlir @@ -6,14 +6,17 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: MUL, +// CHECK-NEXT: deprecated_builtin_code: 18, // CHECK-NEXT: version: 1 +// CHECK-NEXT: builtin_code: MUL // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "MyCustomOp" +// CHECK-NEXT: deprecated_builtin_code: 32, +// CHECK-NEXT: custom_code: "MyCustomOp", +// CHECK-NEXT: builtin_code: CUSTOM // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: EXP, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 47, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: EXP // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -97,6 +100,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir index 
98c3eb154e1..ef82175a47d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d.mlir @@ -5,11 +5,13 @@ func @main(tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: DEQUANTIZE, + // CHECK-NEXT: deprecated_builtin_code: 6, // CHECK-NEXT: version: 1 + // CHECK-NEXT: builtin_code: DEQUANTIZE // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: DEPTHWISE_CONV_2D, + // CHECK-NEXT: deprecated_builtin_code: 4, // CHECK-NEXT: version: 1 + // CHECK-NEXT: builtin_code: DEPTHWISE_CONV_2D // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -89,6 +91,7 @@ func @main(tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tfl.pseudo_const" () {value = dense<-1.23697901> : tensor<32xf32>} : () -> tensor<32xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir index 86f27936946..f4bc10b2fe2 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/depthwise_conv2d_v2.mlir @@ -5,11 +5,13 @@ func @main(tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: DEQUANTIZE, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 6, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: DEQUANTIZE // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: DEPTHWISE_CONV_2D, - // CHECK-NEXT: version: 2 + // CHECK-NEXT: deprecated_builtin_code: 4, + // CHECK-NEXT: version: 2, + // CHECK-NEXT: builtin_code: DEPTHWISE_CONV_2D // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -91,6 +93,7 @@ func @main(tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tfl.pseudo_const" () {value = dense<-1.23697901> : tensor<32xf32>} : () -> tensor<32xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir index c034fa7e462..f7ff99b117d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/disable_flex_enable_builtin.mlir @@ -5,11 +5,13 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: MUL, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 18, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: MUL // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: EXP, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 47, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: EXP // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -95,6 +97,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: name: "min_runtime_version", // 
CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir index 6d8c54b783a..9aca1ecb47d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fake_quant.mlir @@ -6,8 +6,9 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: FAKE_QUANT, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 80, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: FAKE_QUANT // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -53,6 +54,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // IMPORT: "tfl.fake_quant"(%arg0) {max = 1.400000e+00 : f32, min = 3.000000e-01 : f32, narrow_range = false, num_bits = 6 : i32} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir index 018d99fc74d..b2d7f611ede 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_exclusively.mlir @@ -4,8 +4,9 @@ func @main(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, +// CHECK-NEXT: deprecated_builtin_code: 32, // CHECK-NEXT: custom_code: "FlexAddV2" +// CHECK-NEXT: builtin_code: CUSTOM // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -46,6 +47,7 @@ func @main(%arg0: tensor<3x2xf32>) -> tensor<3x2xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } %0 = "tf.AddV2"(%arg0, %arg0) : (tensor<3x2xf32>, tensor<3x2xf32>) -> tensor<3x2xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir index a5e6d4aabb5..b8749b4b76c 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_complex128.mlir @@ -5,8 +5,9 @@ func @main(tensor<4xcomplex>, tensor<4xcomplex>) -> tensor<4xcomplex>, tensor<4xcomplex>) -> tensor<4xcomplex>, tensor<4xcomplex>) -> tensor<4xcomplex> loc("add") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_f64.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_f64.mlir index 4b75d3e8ff4..c8f3949500e 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_f64.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_f64.mlir @@ -5,8 +5,9 @@ func @main(tensor<4xf64>, tensor<4xf64>) -> tensor<4xf64> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "FlexAdd" +// CHECK-NEXT: deprecated_builtin_code: 32, +// CHECK-NEXT: custom_code: "FlexAdd", +// 
CHECK-NEXT: builtin_code: CUSTOM // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -59,6 +60,7 @@ func @main(tensor<4xf64>, tensor<4xf64>) -> tensor<4xf64> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 4 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tf.Add"(%arg0, %arg1) : (tensor<4xf64>, tensor<4xf64>) -> tensor<4xf64> loc("add") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir index 8a9175b5c59..059cfc0d54e 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/flex_op_with_tflite_op.mlir @@ -5,14 +5,17 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: deprecated_builtin_code: 18, +// CHECK-NEXT: version: 1, // CHECK-NEXT: builtin_code: MUL -// CHECK-NEXT: version: 1 // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "FlexDiv" +// CHECK-NEXT: deprecated_builtin_code: 32, +// CHECK-NEXT: custom_code: "FlexDiv", +// CHECK-NEXT: builtin_code: CUSTOM // CHECK-NEXT: }, { +// CHECK-NEXT: deprecated_builtin_code: 47, +// CHECK-NEXT: version: 1, // CHECK-NEXT: builtin_code: EXP -// CHECK-NEXT: version: 1 // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -96,6 +99,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir index bbe4fdb8337..b01bafe4ea7 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected.mlir @@ -5,8 +5,9 @@ func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: FULLY_CONNECTED, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 9, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: FULLY_CONNECTED // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -68,6 +69,7 @@ func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 5 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %cst = constant unit diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir index 0abe720ccba..95bcc1547f7 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/fully_connected_v2.mlir @@ -5,8 +5,9 @@ func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: FULLY_CONNECTED, - // CHECK-NEXT: version: 2 + // CHECK-NEXT: deprecated_builtin_code: 9, + // CHECK-NEXT: version: 2, + 
// CHECK-NEXT: builtin_code: FULLY_CONNECTED // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -68,6 +69,7 @@ func @main(tensor<40x37xf32>, tensor<40x37xf32>) -> tensor<40x40xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 5 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %cst = constant unit diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/hashtable_resource.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/hashtable_resource.mlir index 3adee1dec77..2d5852dd83d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/hashtable_resource.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/hashtable_resource.mlir @@ -3,8 +3,9 @@ // CHECK: { // CHECK: version: 3, // CHECK: operator_codes: [ { -// CHECK: builtin_code: CUSTOM, -// CHECK: custom_code: "HashTableV2" +// CHECK: deprecated_builtin_code: 32, +// CHECK: custom_code: "HashTableV2", +// CHECK: builtin_code: CUSTOM // CHECK: } ], // CHECK: subgraphs: [ { // CHECK: tensors: [ { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir index 7290209cc4a..c89239c2e6f 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir @@ -4,16 +4,19 @@ // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: LESS, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 58, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: LESS // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: IF, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 118, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: IF // CHECK-NEXT: }, { // CHECK-NEXT: version: 1 // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: MUL, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 18, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: MUL // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -163,6 +166,7 @@ // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 11 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } func @main(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir index 84cbf48c099..f32fe880121 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/logical.mlir @@ -5,11 +5,13 @@ func @main(tensor<4xi1>) -> tensor<4xi1> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: LOGICAL_OR, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 84, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: LOGICAL_OR // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: LOGICAL_AND, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 86, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: LOGICAL_AND // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -85,6 +87,7 @@ func @main(tensor<4xi1>) -> tensor<4xi1> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git 
a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir index 707bc926870..017870ca334 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -4,8 +4,9 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: LSTM, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 16, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: LSTM // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -257,6 +258,7 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 26 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir index 5985ffaa446..10332e45bec 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm_quantized.mlir @@ -7,8 +7,9 @@ func @main(%arg0: tensor<1x528x!quant.uniform> // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: LSTM, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 16, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: LSTM // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -319,5 +320,6 @@ func @main(%arg0: tensor<1x528x!quant.uniform> // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 23 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir index 297a8b8cb59..eeca4267524 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/math.mlir @@ -5,20 +5,25 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: SQUARED_DIFFERENCE, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 99, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: SQUARED_DIFFERENCE // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: MUL, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 18, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: MUL // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: DIV, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 42, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: DIV // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: EXP, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 47, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: EXP // CHECK-NEXT: }, { - // CHECK-NEXT: builtin_code: NEG, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 59, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: NEG // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -135,6 +140,7 @@ func @main(tensor<4xf32>) -> tensor<4xf32> { // CHECK-NEXT: name: "min_runtime_version", // 
CHECK-NEXT: buffer: 8 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } %0 = "tfl.pseudo_const" () {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir index 49d71f24d2d..3fb00cf6024 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/metadata.mlir @@ -33,4 +33,5 @@ module attributes { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir index 15fce806a70..c8af68a190d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v2.mlir @@ -5,8 +5,9 @@ func @main(tensor<3x!quant.uniform>) -> tensor<3x!quant.uniform>) -> tensor<3x!quant.uniform>, value = dense<2> : tensor<3xi8>} : () -> tensor<3x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v3.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v3.mlir index 2e0d76b511a..441dbd8f925 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v3.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/mul_v3.mlir @@ -5,8 +5,9 @@ func @main(tensor<3x!quant.uniform>) -> tensor<3x!quant.uniform>) -> tensor<3x!quant.uniform>, value = dense<2> : tensor<3xi8>} : () -> tensor<3x!quant.uniform> diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/nn.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/nn.mlir index ffa13532679..ec0fd07c25a 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/nn.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/nn.mlir @@ -5,8 +5,9 @@ func @main(tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { - // CHECK-NEXT: builtin_code: AVERAGE_POOL_2D, - // CHECK-NEXT: version: 1 + // CHECK-NEXT: deprecated_builtin_code: 1, + // CHECK-NEXT: version: 1, + // CHECK-NEXT: builtin_code: AVERAGE_POOL_2D // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -54,6 +55,7 @@ func @main(tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] + // CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 6 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 3 : i32, stride_w = 1 : i32} : (tensor<1x6x6x16xf32>) -> tensor<1x1x1x16xf32> loc("avgpool") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir index 4f28ad327df..60360c7ded6 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/numeric_verify.mlir @@ -3,8 +3,9 @@ // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "NumericVerify" +// CHECK-NEXT: deprecated_builtin_code: 32, +// CHECK-NEXT: custom_code: "NumericVerify", +// CHECK-NEXT: builtin_code: CUSTOM // 
CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -47,6 +48,7 @@ // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 3 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} func @main(%arg0: tensor<4xf32>, %arg1: tensor<4x!quant.uniform>) -> tensor<4xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir index dbe10a3f90c..93581e54f10 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/quantization.mlir @@ -4,20 +4,25 @@ func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: QUANTIZE, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 114, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: QUANTIZE // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: CONV_2D, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 3, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: CONV_2D // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: RESHAPE, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 22, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: RESHAPE // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: SOFTMAX, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 25, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: SOFTMAX // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: DEQUANTIZE, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 6, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: DEQUANTIZE // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -160,6 +165,7 @@ func @main(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x1001xf32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 10 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %0 = "tfl.pseudo_const" () {value = dense<[1, 1001]> : tensor<2xi32>} : () -> tensor<2xi32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir index 15defbc3957..af59475f6a1 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/reshape.mlir @@ -5,8 +5,9 @@ func @main(tensor<3x2xi32>) -> tensor<6xi32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: RESHAPE, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 22, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: RESHAPE // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -58,6 +59,7 @@ func @main(tensor<3x2xi32>) -> tensor<6xi32> { // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 4 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } %0 = "tfl.pseudo_const" () {value = dense<[6]> : tensor<1xi32>} : () -> tensor<1xi32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir new file mode 100644 index 00000000000..b9866b4696d --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/signature_def.mlir @@ -0,0 +1,117 @@ +// RUN: 
flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: deprecated_builtin_code: 9, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: FULLY_CONNECTED +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 1, 384 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "serving_default_input2:0", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: shape_signature: [ -1, 384 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 384 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "serving_default_input1:0", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: shape_signature: [ -1, 384 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 5 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "std.constant", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 5, 384 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "std.constant1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 5, 384 ], +// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: name: "std.constant2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 5 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "StatefulPartitionedCall:0", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: shape_signature: [ -1, 5 ] +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 1, 5 ], +// CHECK-NEXT: buffer: 7, +// CHECK-NEXT: name: "StatefulPartitionedCall:1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: shape_signature: [ -1, 5 ] +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1 ], +// CHECK-NEXT: outputs: [ 6, 5 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 3, 2 ], +// CHECK-NEXT: outputs: [ 5 ], +// CHECK-NEXT: builtin_options_type: FullyConnectedOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: inputs: [ 0, 4, 2 ], +// CHECK-NEXT: outputs: [ 6 ], +// CHECK-NEXT: builtin_options_type: FullyConnectedOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", + +// CHECK: metadata: [ { +// CHECK-NEXT: name: "min_runtime_version", +// CHECK-NEXT: buffer: 8 +// CHECK-NEXT: } ], +// CHECK-NEXT: signature_defs: [ { +// CHECK-NEXT: inputs: [ { +// CHECK-NEXT: name: "input1", +// CHECK-NEXT: tensor_index: 1 +// CHECK-NEXT: }, { +// CHECK-NEXT: name: "input2" +// CHECK-NEXT: } ], +// CHECK-NEXT: outputs: [ { +// CHECK-NEXT: name: "end_logits", +// CHECK-NEXT: tensor_index: 5 +// CHECK-NEXT: }, { +// CHECK-NEXT: name: "start_logits", +// CHECK-NEXT: tensor_index: 6 +// CHECK-NEXT: } ], +// CHECK-NEXT: method_name: "serving_default", +// CHECK-NEXT: key: "" +// CHECK-NEXT: } ] +// CHECK-NEXT:} +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 554 : i32}, tf_saved_model.semantics} { + func @main(%arg0: tensor {tf_saved_model.index_path = ["input2"]}, %arg1: tensor {tf_saved_model.index_path = ["input1"]}) -> (tensor {tf_saved_model.index_path = ["start_logits"]}, tensor {tf_saved_model.index_path = 
["end_logits"]}) attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input2:0,serving_default_input1:0", outputs = "StatefulPartitionedCall:1,StatefulPartitionedCall:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %cst = constant dense<0.000000e+00> : tensor<5xf32> + %cst_0 = constant dense<1.0> : tensor<5x384xf32> + %cst_1 = constant dense<1.0> : tensor<5x384xf32> + %0 = "tfl.fully_connected"(%arg0, %cst_0, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor, tensor<5x384xf32>, tensor<5xf32>) -> tensor + %1 = "tfl.fully_connected"(%arg0, %cst_1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor, tensor<5x384xf32>, tensor<5xf32>) -> tensor + return %1, %0 : tensor, tensor + } +} diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir index 2182db1d39e..fd0e0386c46 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir @@ -7,8 +7,9 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: SUB, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 41, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: SUB // CHECK-NEXT: }, { // CHECK-NEXT: version: 1 // CHECK-NEXT: } ], @@ -104,6 +105,7 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 6 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } %0 = "tfl.pseudo_const" () {value = dense<[[1, 2], [3, 4], [5, 6]]> : tensor<3x2xi32>} : () -> tensor<3x2xi32> loc("Const") diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir index 3d29823c93c..3f48facd122 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf.mlir @@ -4,8 +4,9 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: SVDF, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 27, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: SVDF // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -86,6 +87,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 7 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir index 8dfa68798b8..1b6b66ed087 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/svdf_v2.mlir @@ -4,8 +4,9 @@ func @main(tensor<4 x f32>, tensor<4 x i8>, tensor<4 x f32>, tensor<4 x f32>) -> // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: SVDF, -// CHECK-NEXT: version: 2 +// CHECK-NEXT: deprecated_builtin_code: 27, +// CHECK-NEXT: version: 2, +// CHECK-NEXT: builtin_code: SVDF // CHECK-NEXT: } ], // CHECK-NEXT: 
subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -87,6 +88,7 @@ func @main(tensor<4 x f32>, tensor<4 x i8>, tensor<4 x f32>, tensor<4 x f32>) -> // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 7 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir index 996543cc9c7..68b21765717 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/tfl_while_op.mlir @@ -3,14 +3,17 @@ // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: WHILE, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 119, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: WHILE // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: GREATER, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 61, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: GREATER // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: SUB, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 41, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: SUB // CHECK-NEXT: }, { // CHECK-NEXT: version: 1 // CHECK-NEXT: } ], @@ -196,6 +199,7 @@ // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 14 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } func @WhileOp_cond(%arg0: tensor<*xi32>, %arg1: tensor<*xf32>) -> tensor { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir index ca335ebd000..1256dd19036 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/transpose_conv_optional.mlir @@ -4,8 +4,9 @@ func @main(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: TRANSPOSE_CONV, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 67, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: TRANSPOSE_CONV // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -69,6 +70,7 @@ func @main(%arg0: tensor<4xi32>, %arg1: tensor<32x4x4x128xf32>, %arg2: tensor<1x // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 5 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT:} %cst = constant unit diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/type_attr.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/type_attr.mlir index 01410d370d4..690331dec84 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/type_attr.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/type_attr.mlir @@ -3,8 +3,9 @@ // CHECK: { // CHECK: version: 3, // CHECK: operator_codes: [ { -// CHECK: builtin_code: CUSTOM, -// CHECK: custom_code: "SomeOperation" +// CHECK: deprecated_builtin_code: 32, +// CHECK: custom_code: "SomeOperation", +// CHECK: builtin_code: CUSTOM // CHECK: } ], // CHECK: subgraphs: [ { // CHECK: tensors: [ { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir index 
9b0315e1e20..ffb5b2c781b 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -4,8 +4,9 @@ func @main(tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_LSTM, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 44, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_LSTM // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -256,6 +257,7 @@ func @main(tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 26 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir index 67349b857f7..5b29c1ff050 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir @@ -4,8 +4,9 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_RNN, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 35, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_RNN // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { @@ -86,6 +87,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 7 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } // CHECK-EMPTY: diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir index d69e8f40311..51935676eed 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir @@ -3,14 +3,17 @@ // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: WHILE, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 119, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: WHILE // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: GREATER, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 61, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: GREATER // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: SUB, -// CHECK-NEXT: version: 1 +// CHECK-NEXT: deprecated_builtin_code: 41, +// CHECK-NEXT: version: 1, +// CHECK-NEXT: builtin_code: SUB // CHECK-NEXT: }, { // CHECK-NEXT: version: 1 // CHECK-NEXT: } ], @@ -196,6 +199,7 @@ // CHECK-NEXT: name: "min_runtime_version", // CHECK-NEXT: buffer: 14 // CHECK-NEXT: } ] +// CHECK-NEXT: signature_defs: [ ] // CHECK-NEXT: } func @main(%arg0: tensor, %arg1: tensor<1xf32>) -> tensor<1xf32> { diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index cbb562c2e03..b62f5655183 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ 
b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1700,6 +1700,15 @@ func @testRelu6WithQuantizedTypes(%arg0 : tensor<10x!quant.uniform> // ----- +func @testReluWithDifferentScales(%arg0 : tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> { + %0 = "tfl.relu"(%arg0) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + %1 = "tfl.relu_n1_to_1"(%0) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + %2 = "tfl.relu6"(%1) : (tensor<10x!quant.uniform>) -> tensor<10x!quant.uniform> + return %2 : tensor<10x!quant.uniform> +} + +// ----- + func @testEmbeddingLookup(%arg0 : tensor, %arg1 : tensor) -> tensor { %0 = "tfl.embedding_lookup"(%arg0, %arg1) : (tensor,tensor) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index edbcef3d321..bedf77f726a 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -26,6 +26,26 @@ func @fusedDepthwiseConv2dRelu6(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16 // CHECK: return %0 } +// CHECK-LABEL: fusedMaxPool2dRelu +func @fusedMaxPool2dRelu(%arg0: tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> { + %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + %1 = "tfl.relu"(%0) : (tensor<1x73x73x16xf32>) -> tensor<1x73x73x16xf32> + return %1 : tensor<1x73x73x16xf32> + + // CHECK: %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + // CHECK: return %0 +} + +// CHECK-LABEL: fusedAvgPool2dRelu1 +func @fusedAvgPool2dRelu1(%arg0: tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> { + %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + %1 = "tfl.relu_n1_to_1"(%0) : (tensor<1x73x73x16xf32>) -> tensor<1x73x73x16xf32> + return %1 : tensor<1x73x73x16xf32> + + // CHECK: %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU_N1_TO_1", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + // CHECK: return %0 +} + // CHECK-LABEL: fuseAddIntoConv2d func @fuseAddIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>) -> tensor<256x30x30x16xf32> { %cst = constant dense<1.5> : tensor<16xf32> @@ -50,6 +70,96 @@ func @fuseSubIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %cst) } +// CHECK-LABEL: fuseAddIntoTransposeConv +func @fuseAddIntoTransposeConv(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant dense<[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 
2.0, 1.0, 2.0]> : tensor<32xf32> + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + %1 = "tfl.add"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.000000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<[2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00, 2.500000e+00, 3.500000e+00]> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseSubIntoTransposeConv +func @fuseSubIntoTransposeConv(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant dense<[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]> : tensor<32xf32> + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + %1 = "tfl.sub"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.000000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<[-5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01, -5.000000e-01, 5.000000e-01]> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseAddIntoTransposeConvNoBias +func @fuseAddIntoTransposeConvNoBias(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant unit + %0 = 
"tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> + %1 = "tfl.add"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.000000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<1.500000e+00> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseMulIntoTransposeConv +func @fuseMulIntoTransposeConv(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant dense<[1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0]> : tensor<32xf32> + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + %1 = "tfl.mul"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant dense<1.500000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant dense<[1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00, 1.500000e+00, 3.000000e+00]> : tensor<32xf32> + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + +// CHECK-LABEL: fuseMulIntoTransposeConvNoBias +func @fuseMulIntoTransposeConvNoBias(%arg0: tensor<1x32x42x128xf32>) -> tensor<1x64x84x32xf32> { + %cst = constant dense<1.5> : tensor<32xf32> + %cst_0 = constant dense<[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]> : tensor<16xf32> + %cst_1 = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + %cst_2 = constant dense<1.0> : tensor<32x4x4x128xf32> + %cst_3 = constant unit + %0 = "tfl.transpose_conv"(%cst_1, %cst_2, %arg0, %cst_3) {padding = "SAME", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<4xi32>, tensor<32x4x4x128xf32>, tensor<1x32x42x128xf32>, none) -> tensor<1x64x84x32xf32> + %1 = "tfl.mul"(%0, %cst) {fused_activation_function = "NONE"} : (tensor<1x64x84x32xf32>, tensor<32xf32>) -> tensor<1x64x84x32xf32> + return %1 : tensor<1x64x84x32xf32> + + // CHECK: %[[SHAPE:.*]] = constant dense<[1, 64, 84, 32]> : tensor<4xi32> + // CHECK: %[[WEIGHTS:.*]] = constant 
dense<1.500000e+00> : tensor<32x4x4x128xf32> + // CHECK: %[[BIAS:.*]] = constant unit + // CHECK: %[[RESULT:.*]] = "tfl.transpose_conv"(%[[SHAPE]], %[[WEIGHTS]], %arg0, %[[BIAS]]) + // CHECK: return %[[RESULT]] +} + // CHECK-LABEL: fuseAddIntoFollowingConv2d func @fuseAddIntoFollowingConv2d(%arg0: tensor<256x32x32x3xf32>) -> tensor<256x30x30x16xf32> { %cst = constant dense<1.5> : tensor @@ -182,6 +292,22 @@ func @fuseMulIntoFullyConnected(%arg0: tensor<4x2xf32>) -> tensor<4x2xf32> { // CHECK: return %[[RES]] : tensor<4x2xf32> } +// CHECK-LABEL: @fuseBroadcastMulIntoFullyConnected +func @fuseBroadcastMulIntoFullyConnected(%arg0: tensor<1x10368xbf16>) -> tensor<32x1x256xbf16> { + %cst_0 = constant dense<2.0> : tensor<256x10368xbf16> + %cst_1 = constant unit + %cst_2 = constant dense<3.0> : tensor<32x1x256xbf16> + %0 = "tfl.fully_connected"(%arg0, %cst_0, %cst_1) { + fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT" + } : (tensor<1x10368xbf16>, tensor<256x10368xbf16>, none) -> tensor<1x256xbf16> + %1 = "tfl.mul"(%0, %cst_2) {fused_activation_function = "NONE"} : (tensor<1x256xbf16>, tensor<32x1x256xbf16>) -> tensor<32x1x256xbf16> + return %1 : tensor<32x1x256xbf16> + +// CHECK: %[[V0:.*]] = "tfl.fully_connected"(%arg0, {{.*}}) {{{.*}}} : (tensor<1x10368xbf16>, tensor<256x10368xbf16>, none) -> tensor<1x256xbf16> +// CHECK: %[[V1:.*]] = "tfl.mul"(%[[V0]], {{.*}}) {{{.*}}} : (tensor<1x256xbf16>, tensor<32x1x256xbf16>) -> tensor<32x1x256xbf16> +// CHECK: return %[[V1]] : tensor<32x1x256xbf16> +} + // CHECK-LABEL: @fuseAddIntoFollowingFullyConnectedWithQDQs func @fuseAddIntoFollowingFullyConnectedWithQDQs(%arg0: tensor<4x2xf32>) -> tensor<4x2xf32> { @@ -865,6 +991,16 @@ func @Relu(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // CHECK: return %[[RESULT]] } +// CHECK-LABEL: Relu_bf16 +func @Relu_bf16(%arg0: tensor<2x3xbf16>) -> tensor<2x3xbf16> { + %cst = constant dense<0.0> : tensor<2x3xbf16> + %0 = "tfl.maximum"(%arg0, %cst) : (tensor<2x3xbf16>, tensor<2x3xbf16>) -> tensor<2x3xbf16> + return %0 : tensor<2x3xbf16> + + // CHECK: %[[RESULT:.*]] = "tfl.relu"(%arg0) + // CHECK: return %[[RESULT]] +} + // CHECK-LABEL: Relu1 func @Relu1(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { %cst = constant dense<-1.0> : tensor @@ -1175,3 +1311,29 @@ func @FoldReduceProdKeepDim(%arg0: tensor<8x128xf32>) -> tensor<1x1xf32> { // CHECK: %[[RESULT:.*]] = "tfl.reduce_prod"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<2xi32>) -> tensor<1x1xf32> // CHECK: return %[[RESULT]] : tensor<1x1xf32> } + +func @SoftMaxWithNormalization(%arg0: tensor<8x128xf32>) -> tensor<8x128xf32> { + %cst = constant dense<1> : tensor<1xi32> + %0 = "tfl.reduce_max"(%arg0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> + %1 = "tfl.sub"(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<8x128xf32>, tensor<8x1xf32>) -> tensor<8x128xf32> + %2 = "tfl.exp"(%1) : (tensor<8x128xf32>) -> tensor<8x128xf32> + %3 = "tfl.sum"(%2, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> + %4 = "tfl.div"(%2, %3) {fused_activation_function = "NONE"} : (tensor<8x128xf32>, tensor<8x1xf32>) -> tensor<8x128xf32> + return %4 : tensor<8x128xf32> + +// CHECK-LABEL: SoftMaxWithNormalization +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: return %[[RESULT]] : tensor<8x128xf32> +} + +func @SoftMaxWithoutNormalization(%arg0: tensor<8x128xf32>) -> tensor<8x128xf32> { + %cst = 
constant dense<1> : tensor<1xi32> + %0 = "tfl.exp"(%arg0) : (tensor<8x128xf32>) -> tensor<8x128xf32> + %1 = "tfl.sum"(%0, %cst) {keep_dims = true} : (tensor<8x128xf32>, tensor<1xi32>) -> tensor<8x1xf32> + %2 = "tfl.div"(%0, %1) {fused_activation_function = "NONE"} : (tensor<8x128xf32>, tensor<8x1xf32>) -> tensor<8x128xf32> + return %2 : tensor<8x128xf32> + +// CHECK-LABEL: SoftMaxWithoutNormalization +// CHECK: %[[RESULT:.*]] = "tfl.softmax"(%arg0) {beta = 1.000000e+00 : f32} : (tensor<8x128xf32>) -> tensor<8x128xf32> +// CHECK: return %[[RESULT]] : tensor<8x128xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir index 9e8a957b34c..2b871769c81 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-composite-functions-tf.mlir @@ -520,3 +520,42 @@ func @func_with_call(%arg0: tensor<100xf32>) -> tensor<100xf32> { return %0 : tensor<100xf32> } } + +// ----- + +module { +func @tflite_custom_nms(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>, %arg2: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, max_classes_per_detection = 1 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} { + %0 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + return %0, %1, %2, %3 : tensor, tensor, tensor, tensor +} + +// CHECK-LABEL: func @tflite_custom_nms( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x100x4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x100x91xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = "TFLite_Detection_PostProcess", tf._reference = "mlir"} { +// CHECK: %[[VAL_3:.*]]:4 = "tfl.custom"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) {custom_code = "TFLite_Detection_PostProcess", custom_option = opaque<"tfl", "0x6D61785F646574656374696F6E73006D61785F636C61737365735F7065725F646574656374696F6E006E756D5F636C6173736573006E6D735F73636F72655F7468726573686F6C64006E6D735F696F755F7468726573686F6C6400795F7363616C6500785F7363616C6500685F7363616C6500775F7363616C65007573655F726567756C61725F6E6D73000A217E8E465B681720313A00000C000000010000000A0000000000803F010000000A0000009A99193F0000003F5B0000000000000000000040000020410000A0400E06060E0E06060E0E0E322601"> : tensor<217xi8>} : (tensor<1x100x4xf32>, tensor<1x100x91xf32>, tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) +// CHECK: return %[[VAL_3]]#0, %[[VAL_3]]#1, %[[VAL_3]]#2, %[[VAL_3]]#3 : tensor, tensor, tensor, tensor +// CHECK: } +} + +// ----- + +module { +// expected-error @+1 {{Invalid number of results from TFLite_Detection_PostProcess}} +func @tflite_custom_nms_invalid_results(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>, %arg2: tensor<100x4xf32>) -> (tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, max_classes_per_detection = 1 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 
5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} + +// expected-error @+1 {{Invalid number of arguments to TFLite_Detection_PostProcess}} +func @tflite_custom_nms_invalid_args(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, max_classes_per_detection = 1 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} + +// expected-error @+1 {{max_classes_per_detection attribute is not set or not an integer}} +func @tflite_custom_nms_missing_func_args(%arg0: tensor<1x100x4xf32>, %arg1: tensor<1x100x91xf32>, %arg2: tensor<100x4xf32>) -> (tensor, tensor, tensor, tensor) attributes {tf._implements = #tf.func<@"TFLite_Detection_PostProcess", {max_detections = 10 : i64, num_classes = 91 : i64, nms_score_threshold = 0.5 : f32, nms_iou_threshold = 0.6 : f32, y_scale = 5.0 : f32, x_scale = 10.0 : f32, h_scale = 1.0 : f32, w_scale = 2.0 : f32, use_regular_nms = 0 : i1}>, tf._reference = "mlir"} { + %0 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + %3 = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor + return %0, %1, %2, %3 : tensor, tensor, tensor, tensor +} +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 6a992d6dfe4..186c8631e56 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -1,4 +1,5 @@ // RUN: tf-opt -tfl-prepare-tf %s | FileCheck %s +// RUN: tf-opt %s -tf-layout-optimization=force-data-format=NHWC -tfl-prepare-tf | FileCheck --check-prefix=LAYOUT --dump-input=always %s module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { @@ -53,37 +54,12 @@ func @depthwiseConv2D(tensor<256x32x32x3xf32>, tensor<3x3x3x4xf32>, tensor<256x3 // CHECK: %5 = "tf.DepthwiseConv2dNative" } -func @fusedBatchNorm(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>) { -^bb0(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>): - // OK - %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) - // Unsupported training - %1:5 = "tf.FusedBatchNorm"( %0#0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) - // Use other output - %2:5 = "tf.FusedBatchNorm"( %1#0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> 
(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) +func @Conv2dNCHW(%arg0: tensor<256x3x32x32xf32>, %arg1: tensor<3x3x3x16xf32>) -> tensor<256x16x30x30xf32> { + %0 = "tf.Conv2D"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW", dilations = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<256x3x32x32xf32>, tensor<3x3x3x16xf32>) -> tensor<256x16x30x30xf32> + return %0 : tensor<256x16x30x30xf32> - return %2, %2#1 : tensor<8x8x8x8xf32>, tensor<8xf32> - -// CHECK-LABEL: fusedBatchNorm -// CHECK: %[[CONSTANT:.*]] = constant dense<1.000000e-03> -// variance + epsilon -// CHECK: %[[ADD1:.*]] = "tf.Add"(%[[ARG4:.*]], %[[CONSTANT]]) -// rsqrt(variance + epsilon) -// CHECK: %[[RSQRT:.*]] = "tf.Rsqrt"(%[[ADD1]]) -// scale * rsqrt(variance + epsilon) -// CHECK: %[[MUL1:.*]] = "tf.Mul"(%[[ARG1:.*]], %[[RSQRT]]) -// x * scale * rsqrt(variance + epsilon) -// CHECK: %[[MUL2:.*]] = "tf.Mul"(%[[ARG0:.*]], %[[MUL1]]) -// mean * scale * rsqrt(variance + epsilon) -// CHECK: %[[MUL3:.*]] = "tf.Mul"(%[[ARG3:.*]], %[[MUL1]]) -// offset - mean * scale * rsqrt(variance + epsilon) -// CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ARG2:.*]], %[[MUL3]]) -// x * scale * rsqrt(variance + epsilon) + -// offset - mean * scale * rsqrt(variance + epsilon) -// CHECK: %[[ADD2:.*]] = "tf.Add"(%[[MUL2]], %[[SUB]]) - -// CHECK: %[[BATCHNORM1_a:[^,]+]], {{.*}} = "tf.FusedBatchNorm"(%[[ADD2]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) -// CHECK: "tf.FusedBatchNorm"(%[[BATCHNORM1_a]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]]) + // LAYOUT-LABEL: Conv2dNCHW + // LAYOUT: "tfl.conv_2d" } func @fusedBatchNormV3(tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>) { @@ -483,6 +459,20 @@ func @StridedSliceEllipsisMaskBefore(%arg0: tensor<21x15x7xf32>) -> tensor<21x15 // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST]], %[[CST_0]]) {begin_mask = 3 : i64, ellipsis_mask = 0 : i64, end_mask = 3 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<21x15x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<21x15x2xf32> } +// CHECK-LABEL: @StridedSliceEllipsisMaskBeforeWithBeginAndEndMask +func @StridedSliceEllipsisMaskBeforeWithBeginAndEndMask(%arg0: tensor<4x5x4xf32>) -> tensor<4x4x4xf32> { + %cst = constant dense<[0, 1, 0]> : tensor<3xi32> + %cst_0 = constant dense<0> : tensor<3xi32> + %cst_1 = constant dense<1> : tensor<3xi32> + %0 = "tf.StridedSlice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 6 : i64, ellipsis_mask = 1 : i64, end_mask = 4 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4x5x4xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<4x4x4xf32> + return %0 : tensor<4x4x4xf32> + + // CHECK: %[[CST:.*]] = constant dense<[0, 1, 0]> : tensor<3xi32> + // CHECK: %[[CST_0:.*]] = constant dense<0> : tensor<3xi32> + // CHECK: %[[CST_1:.*]] = constant dense<1> : tensor<3xi32> + // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%arg0, %[[CST]], %[[CST_0]], %[[CST_1]]) {begin_mask = 7 : i64, ellipsis_mask = 0 : i64, end_mask = 5 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<4x5x4xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<4x4x4xf32> +} + // CHECK-LABEL: @StridedSliceEllipsisMaskAfter func @StridedSliceEllipsisMaskAfter(%arg0: tensor<21x15x7xf32>) -> tensor<5x15x7xf32> { %cst = constant dense<0> : tensor<2xi32> @@ -629,4 +619,24 @@ func @lower_rfft_to_rfft2d(%input: tensor<10x20x30xf32>, %fft_len: 
tensor<1xi32> // CHECK: %[[SQE:.*]] = "tf.Squeeze"(%[[RFF]]) {squeeze_dims = [-2]} : (tensor<10x20x1x30xcomplex>) -> tensor<10x20x30xcomplex> } +// CHECK-LABEL: xla_gather_to_slice +func @xla_gather_to_slice(%arg0 : tensor<1x9x104x768xf32>) -> tensor<*xf32> { + %0 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %1 = "tf.Const"() {value = dense<[1, 9, 23, 768]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.XlaGather"(%arg0, %0, %1) {device = "", dimension_numbers = "\0A\04\00\01\02\03\1A\01\02", indices_are_sorted = false} : (tensor<1x9x104x768xf32>, tensor<1xi32>, tensor<4xi32>) -> tensor<*xf32> + return %2 : tensor<*xf32> + +// CHECK: %[[CST:.*]] = constant dense<0> : tensor<4xi64> +// CHECK: %[[CST0:.*]] = constant dense<[1, 9, 23, 768]> : tensor<4xi64> +// CHECK: %[[V0:.*]] = "tf.Slice"(%arg0, %[[CST]], %[[CST0]]) : (tensor<1x9x104x768xf32>, tensor<4xi64>, tensor<4xi64>) -> tensor<*xf32> +// CHECK: return %[[V0]] : tensor<*xf32> +} + +// CHECK-LABEL: DontMatchFusedBatchNormV3 +func @DontMatchFusedBatchNormV3(%arg0 :tensor, %arg1 : tensor<576xf32>, %arg2 : tensor<576xf32>, %arg3 : tensor<576xf32>,%arg4 : tensor<576xf32>) -> (tensor) { + %result:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {data_format = "NHWC", device = "", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = false} : (tensor, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>) -> (tensor, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<576xf32>, tensor<*xf32>) + return %result : tensor + // CHECK: "tf.FusedBatchNormV3" +} + } diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index d63eb481376..2feb7fedb81 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -38,6 +38,10 @@ CreateTFExecutorToControlDialectConversion(); } // namespace mlir namespace tensorflow { +namespace { +// Data layout supported by TFLite. +const char kTFLiteDataLayout[] = "NHWC"; +} // namespace void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs, mlir::OpPassManager* pass_manager) { @@ -170,6 +174,12 @@ void AddTFToTFLConversionPasses(const mlir::TFL::PassConfig& pass_config, if (pass_config.shape_inference) { pass_manager->addPass(mlir::TF::CreateTFShapeInferencePass()); } + // Force layout supported by TFLite, this will transpose the data + // to match 'kTFLiteDataLayout' + mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; + layout_optimization_options.force_data_format = kTFLiteDataLayout; + mlir::TF::CreateLayoutOptimizationPipeline(*pass_manager, + layout_optimization_options); // Prepare for TFLite dialect, rerun canonicalization, and then legalize to // the TFLite dialect. 
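Note on the tf_tfl_passes.cc hunk just above: the converter pipeline now forces the data layout TFLite expects before any TFL lowering runs, which is also why prepare-tf.mlir gains a second RUN line with -tf-layout-optimization=force-data-format=NHWC and the new Conv2dNCHW test is checked under the LAYOUT prefix. A minimal standalone sketch of the same wiring, assuming the usual mlir/TF pass headers (the helper name and the include path for the TF passes are illustrative, not part of this patch):

    #include "mlir/Pass/PassManager.h"
    // Assumed location of the TF layout-optimization pipeline declarations.
    #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"

    // Force the NHWC layout expected by TFLite before the TFL lowering passes run.
    void AddForcedNhwcLayout(mlir::OpPassManager& pm) {
      mlir::TF::LayoutOptimizationPipelineOptions options;
      options.force_data_format = "NHWC";  // kTFLiteDataLayout in tf_tfl_passes.cc
      mlir::TF::CreateLayoutOptimizationPipeline(pm, options);
    }

The pipeline inserts the transposes needed to rewrite NCHW convolutions into the NHWC form that tfl.conv_2d accepts, instead of leaving them to fail legalization.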
pass_manager->addPass( diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 046c7bbbcf0..aa3545d9beb 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -143,6 +143,7 @@ int main(int argc, char **argv) { mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); StatusOr module; + std::unordered_set tags; tensorflow::GraphImportConfig specs; specs.upgrade_legacy = upgrade_legacy; @@ -161,8 +162,7 @@ int main(int argc, char **argv) { module = tensorflow::errors::InvalidArgument( "Importing saved model should not have input_mlir set"); - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); + tags = absl::StrSplit(saved_model_tags, ','); std::vector exported_names_vector = absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); absl::Span exported_names(exported_names_vector); @@ -171,10 +171,11 @@ int main(int argc, char **argv) { llvm::errs() << "There should be only one exported name"; return kTrFailure; } - - module = - tensorflow::ImportSavedModel(input_file_name, saved_model_version, tags, - exported_names, specs, &context); + std::vector extra_opdefs(custom_opdefs.begin(), + custom_opdefs.end()); + module = tensorflow::ImportSavedModel(input_file_name, saved_model_version, + tags, extra_opdefs, exported_names, + specs, &context); } else { module = tensorflow::LoadFromGraphdefOrMlirSource( input_file_name, input_mlir, use_splatted_constant, custom_opdefs, @@ -240,7 +241,7 @@ int main(int argc, char **argv) { std::string result; auto status = tensorflow::ConvertTFExecutorToTFLOrFlatbuffer( module.ValueOrDie().get(), output_mlir, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops, quant_specs, &result, &pm); + emit_select_tf_ops, emit_custom_ops, quant_specs, tags, &result, &pm); if (!status.ok()) return kTrFailure; std::string error_msg; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index c158f3a8e21..622e32c2766 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "absl/types/span.h" #include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project @@ -70,6 +71,27 @@ mlir::LogicalResult IsValidGraph(mlir::ModuleOp module) { } return mlir::success(); } + +// Util that registers 'extra_tf_opdefs' to the TF global registry. +// Return OK on success, failure if registering failed. +Status RegisterExtraTfOpDefs(absl::Span extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + tensorflow::OpDef opdef; + if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, + &opdef)) { + LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; + return errors::InvalidArgument("fail to parse extra OpDef"); + } + // Register extra opdefs. + // TODO(b/133770952): Support shape functions. 
+ tensorflow::OpRegistry::Global()->Register( + [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { + *op_reg_data = tensorflow::OpRegistrationData(opdef); + return Status::OK(); + }); + } + return Status::OK(); +} } // namespace StatusOr LoadFromGraphdefOrMlirSource( @@ -92,21 +114,9 @@ StatusOr LoadFromGraphdefOrMlirSource( return OwningModuleRef(mlir::parseSourceFile(*source_mgr, context)); } - for (const auto& tf_opdefs_string : extra_tf_opdefs) { - tensorflow::OpDef opdef; - if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, - &opdef)) { - LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; - return errors::InvalidArgument("fail to parse extra OpDef"); - } - // Register extra opdefs. - // TODO(b/133770952): Support shape functions. - tensorflow::OpRegistry::Global()->Register( - [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { - *op_reg_data = tensorflow::OpRegistrationData(opdef); - return Status::OK(); - }); - } + // Register extra TF ops passed as OpDef. + auto extra_opdefs_status = RegisterExtraTfOpDefs(extra_tf_opdefs); + if (!extra_opdefs_status.ok()) return extra_opdefs_status; if (use_splatted_constant) { return tensorflow::GraphdefToSplattedMlirTranslateFunction( @@ -127,8 +137,12 @@ StatusOr LoadFromGraphdefOrMlirSource( Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, - const mlir::TFL::QuantizationSpecs& quant_specs, std::string* result, - mlir::PassManager* pass_manager) { + const mlir::TFL::QuantizationSpecs& quant_specs, + const std::unordered_set& saved_model_tags, + std::string* result, mlir::PassManager* pass_manager) { + // Explicitly disable dumping Op details on failures. + module.getContext()->printOpOnDiagnostic(false); + // Register a warning handler only log to std out. mlir::ScopedDiagnosticHandler s( module.getContext(), [](mlir::Diagnostic& diag) { @@ -158,7 +172,7 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( if (!quant_specs.RunWeightQuantization()) { if (tflite::MlirToFlatBufferTranslateFunction( module, result, emit_builtin_tflite_ops, emit_select_tf_ops, - emit_custom_ops)) { + emit_custom_ops, saved_model_tags)) { return statusHandler.ConsumeStatus(); } } else { @@ -167,7 +181,7 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( std::string pre_quantized_result; if (tflite::MlirToFlatBufferTranslateFunction( module, &pre_quantized_result, emit_builtin_tflite_ops, - emit_select_tf_ops, emit_custom_ops)) { + emit_select_tf_ops, emit_custom_ops, saved_model_tags)) { return statusHandler.ConsumeStatus(); } flatbuffers::FlatBufferBuilder q_builder(/*initial_size=*/10240); @@ -198,8 +212,13 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( StatusOr ImportSavedModel( const std::string& input_filename, const int saved_model_version, const std::unordered_set& tags, + absl::Span extra_tf_opdefs, absl::Span exported_names, const GraphImportConfig& specs, mlir::MLIRContext* context) { + // Register extra TF ops passed as OpDef. 
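The RegisterExtraTfOpDefs helper introduced in this hunk is now shared by both import paths: LoadFromGraphdefOrMlirSource calls it in place of its old inline loop, and ImportSavedModel grows an extra_tf_opdefs parameter so saved-model conversion can register custom ops too (tf_tfl_translate.cc forwards its existing custom_opdefs values). The strings it consumes are OpDef protos in text format; a hypothetical example, with the op name and signature made up for illustration:

    #include <string>
    #include <vector>

    // A custom op definition in OpDef textproto form, the kind of string that
    // RegisterExtraTfOpDefs() parses with TextFormat and registers on
    // OpRegistry::Global(). "MyCustomOp" is a made-up op.
    const char kExampleOpDef[] = R"pb(
      name: "MyCustomOp"
      input_arg { name: "x" type: DT_FLOAT }
      output_arg { name: "y" type: DT_FLOAT }
    )pb";

    // Collected into the list handed to ImportSavedModel(...) or
    // LoadFromGraphdefOrMlirSource(...) before the MLIR import runs.
    std::vector<std::string> extra_opdefs = {kExampleOpDef};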
+ auto extra_opdefs_status = RegisterExtraTfOpDefs(extra_tf_opdefs); + if (!extra_opdefs_status.ok()) return extra_opdefs_status; + if (saved_model_version == 2) { auto module_or = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, exported_names, context); diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 8f1edec8879..95b6097e1eb 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -48,6 +48,7 @@ LoadFromGraphdefOrMlirSource( stream_executor::port::StatusOr ImportSavedModel( const std::string& input_filename, const int saved_model_version, const std::unordered_set& tags, + absl::Span extra_tf_opdefs, absl::Span exported_names, const GraphImportConfig& specs, mlir::MLIRContext* context); @@ -62,8 +63,9 @@ stream_executor::port::StatusOr ImportSavedModel( Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops, - const mlir::TFL::QuantizationSpecs& quant_specs, std::string* result, - mlir::PassManager* pass_manager); + const mlir::TFL::QuantizationSpecs& quant_specs, + const std::unordered_set& saved_model_tags, + std::string* result, mlir::PassManager* pass_manager); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 47cfaecd3fb..322da815a47 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -27,6 +27,9 @@ def NonOpaqueElementsAttr : ElementsAttrBase< def F32ElementsAttr : ElementsAttrBase< CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; +def Int64ElementsAttr : ElementsAttrBase< + CPred<"$_self.cast().getType().getElementType().isInteger(64)">, "Int 64 constant tensor">; + // Extract the ith int element from an ArrayAttr $0 as an 32-bit IntegerAttr // with builder. class ExtractI32At : NativeCodeCall< @@ -50,6 +53,10 @@ def ExtractSingleElementAsInteger : NativeCodeCall< def ExtractSingleElementAsInt32 : NativeCodeCall< "$_builder.getI32IntegerAttr(ExtractSingleElementAsInteger($_self.cast()).getInt())">; +// Converts tensor with int64 to int32. +def CreateCastToInt32 : NativeCodeCall< + "CreateCastToInt32($0, $_loc, $_builder)">; + // Checks whether the given operation has static shapes and same shapes of all inputs. 
def HasSameStaticShapesPred : CPred<"HasSameStaticShapes($0.getDefiningOp())">; def HasSameStaticShapes : Constraint; @@ -149,6 +156,7 @@ def LegalizeMaxPool2D : Pat< IsIntList1XY1:$ksize, IsIntList1XY1:$strides, $padding, + $explicit_paddings, IsDataFormatNHWC:$format), (TFL_MaxPool2DOp $value, /*padding=*/$padding, @@ -207,8 +215,14 @@ def LegalizeSoftPlus : Pat<(TF_SoftplusOp F32Tensor:$arg0), def LegalizeSqueeze : Pat<(TF_SqueezeOp $arg, $squeeze_dims), (TFL_SqueezeOp $arg, $squeeze_dims)>; def LegalizeTanh : Pat<(TF_TanhOp $arg), (TFL_TanhOp $arg)>; + +def LegalizeTransposeInt64 : Pat< + (TF_TransposeOp $arg, (ConstantOp Int64ElementsAttr:$perm)), + (TFL_TransposeOp $arg, (CreateCastToInt32 $perm))>; + def LegalizeTranspose : Pat<(TF_TransposeOp $arg, $perm), (TFL_TransposeOp $arg, $perm)>; + def LegalizeWhere : Pat<(TF_WhereOp $arg), (TFL_WhereOp $arg)>; def LegalizeZerosLike : Pat<(TF_ZerosLikeOp $arg), (TFL_ZerosLikeOp $arg)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 297b1459fc5..13c7a08a094 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Threading.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/UniformSupport.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project @@ -48,6 +49,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" @@ -65,7 +67,6 @@ namespace TFL { // The actual LegalizeTF Pass. namespace { -using xla::Status; using xla::StatusOr; constexpr char kUnidirectionalSequenceLstm[] = "tf.UnidirectionalSequenceLstm"; @@ -74,6 +75,10 @@ constexpr char kTfLiteInputIndices[] = "_tflite_input_indices"; // Legalize operations in functions. class LegalizeTF : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: LegalizeTF() = default; LegalizeTF(const LegalizeTF&) {} @@ -112,6 +117,17 @@ bool HasSameStaticShapes(Operation* op) { return true; } +// Util that casts 'val' to Int32 by adding a cast Op. 
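One note on the legalize_tf.cc change above: LegalizeTF now overrides getDependentDialects, since the lowered IR can contain quantization ops whose dialect must be registered before the pass pipeline runs (hence the new QuantOps.h include). The template argument of registry.insert is not legible in this patch text; the sketch below treats the registered dialect as an assumption inferred from that include:

    #include "mlir/Dialect/Quant/QuantOps.h"
    #include "mlir/Pass/Pass.h"

    // Sketch only: the dialect registered here is an assumption, not taken from
    // the patch, which does not show the template argument.
    struct LegalizeTfSketchPass
        : public mlir::PassWrapper<LegalizeTfSketchPass, mlir::FunctionPass> {
      void getDependentDialects(mlir::DialectRegistry& registry) const override {
        registry.insert<mlir::quant::QuantizationDialect>();
      }
      void runOnFunction() override {}  // lowering patterns omitted in this sketch
    };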
+Value CreateCastToInt32(Attribute val, Location loc, + PatternRewriter& rewriter) { + auto shape = val.getType().dyn_cast().getShape(); + IntegerType new_ele_type = rewriter.getIntegerType(32); + ShapedType new_type = RankedTensorType::get(shape, new_ele_type); + return rewriter.create(loc, new_type, + rewriter.create(loc, val), + rewriter.getBoolAttr(false)); +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_legalize_tf.inc" #define DECL_CONVERT_OP(tf_op) \ @@ -154,9 +170,8 @@ LogicalResult ConvertTFRandomUniformOp::matchAndRewrite( tensorflow::random::PhiloxRandom, float> Distribution; - tensorflow::random::PhiloxRandom generator( - random_uniform_op.seed().getSExtValue(), - random_uniform_op.seed2().getSExtValue()); + tensorflow::random::PhiloxRandom generator(random_uniform_op.seed(), + random_uniform_op.seed2()); Distribution dist; size_t num_elements = 0; if (auto output_type = @@ -227,26 +242,47 @@ LogicalResult ConvertTFConcatV2Op::matchAndRewrite( return success(); } -// The following is effectively: -// def : Pat< -// (TF_MatMulOp $a, $b, ConstBoolAttrFalse:$transpose_a, -// ConstBoolAttrTrue:$transpose_b), -// (TFL_FullyConnectedOp:$__0 $a, $b, -// NoInput.pattern, TFL_AF_None, TFL_FCWO_Default, ConstBoolAttrFalse)>; LogicalResult ConvertTFMatMulOp::matchAndRewrite( Operation* op, PatternRewriter& rewriter) const { auto tf_matmul_op = cast(op); - if (tf_matmul_op.transpose_a()) return failure(); - if (!tf_matmul_op.transpose_b()) return failure(); + auto lhs = op->getOperand(0); + auto rhs = op->getOperand(1); + auto transpose = [&](Value input) -> std::pair { + RankedTensorType type = + input.getType().dyn_cast_or_null(); + if (!type || type.getRank() != 2) return {failure(), nullptr}; + + auto permute_attr = DenseIntElementsAttr::get( + RankedTensorType::get({2}, rewriter.getI32Type()), {1, 0}); + auto permute = rewriter.create( + op->getLoc(), permute_attr.getType(), permute_attr); + llvm::SmallVector new_shape{type.getShape()[1], + type.getShape()[0]}; + auto output = rewriter.create( + op->getLoc(), RankedTensorType::get(new_shape, type.getElementType()), + input, permute); + return {success(), output}; + }; + + // TODO(jpienaar): Remove once handled via dialect conversion. + if (tf_matmul_op.transpose_a()) { + LogicalResult result = success(); + std::tie(result, lhs) = transpose(lhs); + if (failed(result)) return failure(); + } + if (!tf_matmul_op.transpose_b()) { + LogicalResult result = success(); + std::tie(result, rhs) = transpose(rhs); + if (failed(result)) return failure(); + } Type output_type = tf_matmul_op.getResult().getType(); - // TODO(jpienaar): Follow up post shuffle discussion. auto no_input = rewriter.create( op->getLoc(), rewriter.getNoneType(), rewriter.getUnitAttr()); auto fc_op = rewriter.create( - op->getLoc(), ArrayRef{output_type}, op->getOperand(0), - op->getOperand(1), no_input, rewriter.getStringAttr("NONE"), - rewriter.getStringAttr("DEFAULT"), rewriter.getBoolAttr(false)); + op->getLoc(), ArrayRef{output_type}, lhs, rhs, no_input, + rewriter.getStringAttr("NONE"), rewriter.getStringAttr("DEFAULT"), + rewriter.getBoolAttr(false)); rewriter.replaceOp(op, {fc_op.getResult(0)}); return success(); } @@ -259,7 +295,7 @@ LogicalResult ConvertTFPackOp::matchAndRewrite( auto output_type = tf_pack_op.output().getType(); auto values_count = rewriter.getI32IntegerAttr(tf_pack_op.N()); // Axis can be negative.
- auto axis = rewriter.getI32IntegerAttr(tf_pack_op.axis().getSExtValue()); + auto axis = rewriter.getI32IntegerAttr(tf_pack_op.axis()); rewriter.replaceOpWithNewOp(op, output_type, values, values_count, axis); @@ -356,27 +392,22 @@ LogicalResult ConvertTFStridedSliceOp::matchAndRewrite( op, tf_strided_slice_op.output().getType(), tf_strided_slice_op.input(), tf_strided_slice_op.begin(), tf_strided_slice_op.end(), tf_strided_slice_op.strides(), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.begin_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.end_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.ellipsis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.new_axis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.shrink_axis_mask().getSExtValue())); + rewriter.getI32IntegerAttr(tf_strided_slice_op.begin_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.end_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.ellipsis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.new_axis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.shrink_axis_mask())); return success(); } int num_input_dims = ranked_input_type.getRank(); // Pad `begin` array with zero values and update the `begin_mask`. SmallVector begin_pad_val(num_input_dims, 0); - int begin_mask = tf_strided_slice_op.begin_mask().getSExtValue(); + int begin_mask = tf_strided_slice_op.begin_mask(); Value padded_begin = PadStridedSliceAttributeArray( op, rewriter, tf_strided_slice_op.begin(), begin_pad_val, &begin_mask); // Pad `end` array with `input_shape` and update the `end_mask`. - int end_mask = tf_strided_slice_op.end_mask().getSExtValue(); + int end_mask = tf_strided_slice_op.end_mask(); auto input_shape = ranked_input_type.getShape(); SmallVector end_pad_val(input_shape.begin(), input_shape.end()); Value padded_end = PadStridedSliceAttributeArray( @@ -390,12 +421,9 @@ LogicalResult ConvertTFStridedSliceOp::matchAndRewrite( padded_begin, padded_end, padded_strides, rewriter.getI32IntegerAttr(begin_mask), rewriter.getI32IntegerAttr(end_mask), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.ellipsis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.new_axis_mask().getSExtValue()), - rewriter.getI32IntegerAttr( - tf_strided_slice_op.shrink_axis_mask().getSExtValue())); + rewriter.getI32IntegerAttr(tf_strided_slice_op.ellipsis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.new_axis_mask()), + rewriter.getI32IntegerAttr(tf_strided_slice_op.shrink_axis_mask())); return success(); } @@ -406,7 +434,7 @@ LogicalResult ConvertTFUnpackOp::matchAndRewrite( auto input = tf_unpack_op.value(); auto num = rewriter.getI32IntegerAttr(tf_unpack_op.num()); // Axis can be negative. - auto axis = rewriter.getI32IntegerAttr(tf_unpack_op.axis().getSExtValue()); + auto axis = rewriter.getI32IntegerAttr(tf_unpack_op.axis()); rewriter.replaceOpWithNewOp(op, tf_unpack_op.output().getTypes(), input, num, axis); @@ -637,7 +665,7 @@ void LegalizeTF::runOnFunction() { auto func = getFunction(); // Add the generated patterns to the list. 
- populateWithGenerated(context, &patterns); + populateWithGenerated(context, patterns); patterns .insert> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void RunOnFunction(FuncOp func); void runOnOperation() override { @@ -60,8 +64,8 @@ void RunOnWhile(TF::WhileOp while_op) { // Mark old function as private so that it can be DCE'd if not called. func.setVisibility(SymbolTable::Visibility::Private); }; - create_region_with_call(while_op.cond_func(), new_op.cond()); - create_region_with_call(while_op.body_func(), new_op.body()); + create_region_with_call(while_op.cond_function(), new_op.cond()); + create_region_with_call(while_op.body_function(), new_op.body()); op->replaceAllUsesWith(new_op.getResults()); op->erase(); diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index edddc7751ab..c0a7ea9337b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -714,7 +714,7 @@ struct ConvertTensorListStack RankedTensorType shape_type = RankedTensorType::get({-1}, rewriter.getIntegerType(32)); auto new_shape = rewriter.create(loc, shape_type, input); - SmallVector output_shape = {op.num_elements().getSExtValue()}; + SmallVector output_shape(/*Size=*/1, op.num_elements()); for (const auto &dim : dense_elem_attr.getIntValues()) output_shape.push_back(dim.getSExtValue()); RankedTensorType result_type = @@ -749,7 +749,7 @@ Type VariantToUnrankedTensorType(Type type, Value value) { // Changes the function type of `cond_func` and `body_func` for the given While // op. LogicalResult UpdateFunctionTypes(TF::WhileOp op) { - for (FuncOp func : {op.cond_func(), op.body_func()}) { + for (FuncOp func : {op.cond_function(), op.body_function()}) { if (!func) continue; FunctionType func_type = func.getType(); @@ -892,7 +892,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( target.addLegalOp(); OwningRewritePatternList patterns; - populateWithGenerated(context, &patterns); + populateWithGenerated(context, patterns); patterns.insert { LogicalResult matchAndRewrite(TFL::MulOp mul_op, PatternRewriter &rewriter) const override { + // If we are broadcasting on the lhs then don't fold the multiply as it + // would increase the amount of compute done by the fully connected op. + if (mul_op.lhs().getType() != mul_op.getType()) return failure(); + // Mul. DenseElementsAttr cst; Value constant_val = mul_op.rhs(); @@ -794,7 +798,7 @@ void Optimize::runOnFunction() { // Potentially the binary ops might be fused together, like hard_swish, thus // we explore these potentially first and then fuse the binary ops with the // following ops in a second pattern match. - TFL::populateWithGenerated(ctx, &patterns); + TFL::populateWithGenerated(ctx, patterns); patterns.insert, FuseFullyConnectedAndReluX, diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc index 2311ae0668c..f1ea837446b 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc @@ -83,8 +83,8 @@ class FoldIfOp : public OpRewritePattern { if (!llvm::hasSingleElement(parent_op)) return failure(); // Find the then and else branch functions. 
- FuncOp then_func = op.then_func(); - FuncOp else_func = op.else_func(); + FuncOp then_func = op.then_function(); + FuncOp else_func = op.else_function(); // If the If has no uses and its functions are side-effect free, then // remove. diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 559d22dcf47..653c33ea9df 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -21,8 +21,13 @@ include "tensorflow/compiler/mlir/lite/ir/tfl_ops.td" include "tensorflow/compiler/mlir/lite/utils/utils.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +// Checks if the param passed is a F32 ElementsAttr. def F32ElementsAttr : ElementsAttrBase< - CPred<"$_self.cast().getType().getElementType().isF32()">, "float constant tensor">; + CPred<"$_self.isa() && $_self.cast().getType().getElementType().isF32()">, + "float constant tensor">; + +// Checks if the param passed is of NoneType. +def IsNoneType : Constraint()">>; def ExtractSingleElementAsFloat : NativeCodeCall< "ExtractSingleElementAsFloat($_self.cast())">; @@ -52,15 +57,31 @@ multiclass FuseActFnIntoConvOpPat { [(HasOneUse $conv_out)]>; } +multiclass FuseActFnIntoPoolOpPat { + def FuseActivationFuncWithAvgPool#ActFnOp#ActFnAttr : Pat< + (ActFnOp (TFL_AveragePool2DOp:$pool_out $input, $filter_height, + $filter_width, $padding, $stride_h, $stride_w, TFL_AF_None)), + (TFL_AveragePool2DOp $input, $filter_height, $filter_width, $padding, + $stride_h, $stride_w, ActFnAttr), + [(HasOneUse $pool_out)]>; + def FuseActivationFuncWithMaxPool#ActFnOp#ActFnAttr : Pat< + (ActFnOp (TFL_MaxPool2DOp:$pool_out $input, $padding, $stride_w, $stride_h, + $filter_width, $filter_height, TFL_AF_None)), + (TFL_MaxPool2DOp $input, $padding, $stride_w, $stride_h, + $filter_width, $filter_height, ActFnAttr), + [(HasOneUse $pool_out)]>; +} + // TODO(hinsu): Also fuse ops corresponding to SIGN_BIT fused // activation functions. // Currently we're not fusing tanh, sigmoid, hard_swish and other activations // those cannot be simply translated into clamping. 
foreach actFnPair = [[TFL_ReluOp, TFL_AF_Relu], [TFL_Relu6Op, TFL_AF_Relu6], - [TFL_Relu1Op, TFL_AF_Relu1]] in + [TFL_Relu1Op, TFL_AF_Relu1]] in { defm : FuseActFnIntoConvOpPat; - + defm : FuseActFnIntoPoolOpPat; +} class CanFuseConvOrDepthwiseConv : Constraint< CPred<"TFL::CanFuseConvOrDepthwiseConv($0, $1, " # is_depthwise # ")">>; @@ -93,6 +114,29 @@ multiclass FuseBinaryOpToPrecedingAffine { $multiplier), [(CanFuseConvOrDepthwiseConv<"true"> $filter, $value), (HasOneUse $output)]>; + def FuseBinaryOpWithTransposeConv#binaryOp : Pat< + (binaryOp (TFL_TransposeConvOp:$output $output_shape, $weights, $inputs, + (ConstantOp F32ElementsAttr:$bias), $padding, + $stride_h, $stride_w), + (ConstantOp F32ElementsAttr:$value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, $weights, $inputs, + (binaryOp (ConstantOp $bias), + (ConstantOp $value), TFL_AF_None), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (HasOneUse $output)]>; + // Fuse for TransposeConv with no bias + def FuseBinaryOpWithTransposeConvNoneBias#binaryOp : Pat< + (binaryOp (TFL_TransposeConvOp:$output $output_shape, $weights, $inputs, + (ConstantOp $bias), $padding, + $stride_h, $stride_w), + (ConstantOp F32ElementsAttr:$value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, $weights, $inputs, + (ConstantOp $value), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (IsNoneType $bias), + (HasOneUse $output)]>; } foreach binaryOp = [TFL_AddOp, TFL_SubOp] in defm : FuseBinaryOpToPrecedingAffine; @@ -146,6 +190,39 @@ multiclass FuseMulOrDivWithConv2dOrDepthwiseConv2d { $h_factor, $w_factor, $act_fn, $padding, $stride_h, $stride_w), [(CanFuseConvOrDepthwiseConv<"false"> $filter, $value), (HasOneUse $conv_output)]>; + def FuseMulOrDivWithTransposeConv#BinaryOp : Pat< + (BinaryOp (TFL_TransposeConvOp:$output $output_shape, + (ConstantOp F32ElementsAttr:$weights), $input, + (ConstantOp F32ElementsAttr:$bias), + $padding, $stride_h, $stride_w), + (ConstantOp $value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, + (BinaryOp (ConstantOp $weights), + (ConstantOp (ExpandTo4DForConv $value)), + TFL_AF_None), + $input, + (BinaryOp (ConstantOp $bias), + (ConstantOp $value), + TFL_AF_None), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (HasOneUse $output)]>; + def FuseMulOrDivWithTransposeConvWithNoneBias#BinaryOp : Pat< + (BinaryOp (TFL_TransposeConvOp:$output $output_shape, + (ConstantOp F32ElementsAttr:$weights), $input, + (ConstantOp $bias), + $padding, $stride_h, $stride_w), + (ConstantOp $value), TFL_AF_None), + (TFL_TransposeConvOp $output_shape, + (BinaryOp (ConstantOp $weights), + (ConstantOp (ExpandTo4DForConv $value)), + TFL_AF_None), + $input, + (ConstantOp $bias), + $padding, $stride_h, $stride_w), + [(CanFuseConvOrDepthwiseConv<"false"> $weights, $value), + (IsNoneType $bias), + (HasOneUse $output)]>; } foreach BinaryOp = [TFL_DivOp, TFL_MulOp] in @@ -420,9 +497,9 @@ def ConvertExpandDimsToReshape : Pat< [(AnyStaticShapeTensor $expand_dims_op)]>; class FloatValueEquals : Constraint().getNumElements() == 1 &&" - "$0.isa() &&" - "*$0.cast().getValues().begin() == " # val>>; + "$0.isa() && " + "llvm::all_of($0.cast().getFloatValues(), " + "[](const APFloat& f) { return f.isExactlyValue(" # val # "); })">>; // ReLU patterns def MatchReluPattern : Pat< @@ -552,3 +629,37 @@ foreach ReduceOp = [TFL_ReduceMaxOp, TFL_ReduceMinOp, TFL_ReduceProdOp, (HasOneUse $reduce)]>; } + +def 
IsSame : Constraint>; +def HasTwoUse : Constraint>; +def AxesIsLastDimension : Constraint().getNumElements() == 1 && " + "$0.cast().getValue({0}) == " + "$1.getType().cast().getRank() - 1">>; + +// Convert exp(x)/sum(exp(x)) into softmax. +def OptimizeToSoftmax : Pat< + (TFL_DivOp (TFL_ExpOp:$exp $input), + (TFL_SumOp:$sum $sum_input, (ConstantOp I32ElementsAttr: $axes), + ConstBoolAttrTrue), TFL_AF_None), + (TFL_SoftmaxOp $input, ConstF32Attr<"1.0">), + [(IsSame $exp, $sum_input), + (AxesIsLastDimension $axes, $sum_input), + (HasTwoUse $exp), + (HasOneUse $sum)]>; + +// Convert softmax(x-max(x)) into softmax(x) as the softmax op already deals +// with the max normalization. +def FoldNormalizationIntoSoftmax : Pat< + (TFL_SoftmaxOp + (TFL_SubOp:$sub $input, + (TFL_ReduceMaxOp:$max $max_input, (ConstantOp I32ElementsAttr: $axes), + ConstBoolAttrTrue), + TFL_AF_None), + $beta), + (TFL_SoftmaxOp $input, $beta), + [(IsSame $input, $max_input), + (AxesIsLastDimension $axes, $max_input), + (HasOneUse $sub), + (HasOneUse $max)]>; diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 1c6550bc902..ca30b2f1fcf 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -144,7 +144,7 @@ void PostQuantizePass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); auto* ctx = func.getContext(); - TFL::populateWithGenerated(ctx, &patterns); + TFL::populateWithGenerated(ctx, patterns); patterns.insert>(ctx); applyPatternsAndFoldGreedily(func, patterns); diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 0efd7187e16..172ce59ddd4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -42,6 +42,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/nms_utils.h" #include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -59,6 +60,7 @@ namespace { constexpr char kTFAPIImplements[] = "tf.api_implements"; constexpr char kTFTextAPIPrefix[] = "tftext:"; +constexpr char kCustomSSDPostprocessing[] = "TFLite_Detection_PostProcess"; constexpr char kTfNMSPadded[] = "non_max_suppression_padded_v2"; using mlir::TF::FuncAttr; @@ -99,59 +101,6 @@ class ConvertEmbeddedLookupFunc { FuncOp func_; }; -// Abstracts the conversion of the padded NMS composite function. 
-class ConvertNMSPaddedFunc { - public: - explicit ConvertNMSPaddedFunc(FuncOp func) : func_(func) {} - - void RewriteFunc() { - func_.setAttr(kTFImplements, - StringAttr::get(kTfNMSPadded, func_.getContext())); - Value boxes = func_.getArgument(0); - Value scores = func_.getArgument(1); - Value max_output_size = func_.getArgument(2); - Value iou_threshold = func_.getArgument(3); - Value score_threshold = func_.getArgument(4); - auto output_type0 = func_.getType().getResult(0); - auto output_type1 = func_.getType().getResult(1); - - OpBuilder builder(func_.getBody()); - auto op = builder.create( - func_.getLoc(), output_type0, output_type1, boxes, scores, - max_output_size, iou_threshold, score_threshold); - - builder.create(func_.getLoc(), op.getResults()); - } - - LogicalResult VerifySignature() { - // Verify high-level function signature. - // Relevant argument characteristics are checked by the TFL op definition. - if (func_.getNumArguments() < 5) { - return func_.emitError() - << "Invalid number of arguments to " - "non_max_suppression_padded_v2 (need atleast 5): " - << func_.getNumArguments(); - } - if (func_.getType().getNumResults() != 2) { - return func_.emitError() << "Invalid number of results from " - "non_max_suppression_padded_v2 (need 2): " - << func_.getType().getNumResults(); - } - // The TFLite fused op does not support batching yet. - // TODO(b/158709815): Add support for batches with padded NMS. - auto boxes_type = - func_.getArgument(0).getType().dyn_cast(); - if (!boxes_type.hasRank() || boxes_type.getRank() != 2) { - return func_.emitError() << "TFLite does not support batched input for " - "non_max_suppression_padded"; - } - return success(); - } - - private: - FuncOp func_; -}; - // This pass uses mechanisms listed in RFC: // https://github.com/tensorflow/community/pull/113 // It prepares composite functions that are attributed to indicate @@ -161,6 +110,10 @@ class ConvertNMSPaddedFunc { class PrepareCompositeFunctionsPass : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: explicit PrepareCompositeFunctionsPass() {} @@ -219,6 +172,12 @@ void PrepareCompositeFunctionsPass::ConvertTFImplementsWithAttributes( if (failed(ConvertTFTextAPI(func, api_name, attr))) { return signalPassFailure(); } + } else if (api_name == kCustomSSDPostprocessing) { + ConvertSSDPostProcessFunc convert_ssd_postprocess(func, attr); + if (failed(convert_ssd_postprocess.VerifySignature()) || + failed(convert_ssd_postprocess.RewriteFunc())) { + return signalPassFailure(); + } } } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index f5b252773f6..5cfdb4b982d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -40,64 +40,6 @@ def : Pat< (TF_MulOp $t, (TF_MulOp:$mul (TF_RsqrtOp (TF_AddOp $v, (TF_ConstOp $variance_epsilon))), $gamma)), (TF_SubOp $beta, (TF_MulOp $m, $mul)))>; -// Converts tf.FusedBatchNorm & tf.FusedBatchNormV3 into a sequence of more primitive arithmetic -// operations. Specifically, performs the following calculation: -// -// (x - mean) * scale / sqrt(variance + epsilon) + offset -// -// Let multiplier = scale / sqrt(variance + epsilon), -// to compute -// (x - mean) * scale / sqrt(variance + epsilon) + offset, -// is then to compute -// (x * multiplier) + (offset - mean * multiplier). 
-def : Pattern< - (TF_FusedBatchNormOp:$root - $x, $scale, $offset, $mean, $variance, - F32Attr:$epsilon, $exponential_avg_factor, - $data_format, FalseBoolAttr:$is_training), - [(TF_AddOp - (TF_MulOp - $x, - (TF_MulOp:$multiplier - $scale, - (TF_RsqrtOp - (TF_AddOp $variance, - (TF_ConstOp $epsilon))))), - (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), - // We already guaranteed that the last four results has no use so it does - // not matter what value we provide here for replacement. - /*batch_mean=*/(replaceWithValue $x), - /*batch_variance=*/(replaceWithValue $x), - /*reserve_space_1=*/(replaceWithValue $x), - /*reserve_space_2=*/(replaceWithValue $x)], - [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), - (HasNoUseOf:$root__3), (HasNoUseOf:$root__4)]>; - -def : Pattern< - (TF_FusedBatchNormV3Op:$root - $x, $scale, $offset, $mean, $variance, - F32Attr:$epsilon, $exponential_avg_factor, - $data_format, FalseBoolAttr:$is_training), - [(TF_AddOp - (TF_MulOp - $x, - (TF_MulOp:$multiplier - $scale, - (TF_RsqrtOp - (TF_AddOp $variance, - (TF_ConstOp $epsilon))))), - (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), - // We already guaranteed that the last five results have no use so it does - // not matter what value we provide here for replacement. - /*batch_mean=*/(replaceWithValue $x), - /*batch_variance=*/(replaceWithValue $x), - /*reserve_space_1=*/(replaceWithValue $x), - /*reserve_space_2=*/(replaceWithValue $x), - /*reserve_space_3=*/(replaceWithValue $x)], - [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), - (HasNoUseOf:$root__3), (HasNoUseOf:$root__4), - (HasNoUseOf:$root__5)]>; - class TFi32 : ConstantAttr(v)>; // Matmul without transpose on b to matmul with explicit transpose op and diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 07b7aacd95d..783f21fce21 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -69,6 +69,11 @@ namespace { // training quantization simpler. class PrepareQuantizePass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: // Constructor used by the PassRegistration and enforce uint8 quantization. // This is only used by test. diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index c521ca0ed53..c4f30c22be3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -80,9 +80,11 @@ namespace { // Prepare TF operations in functions for subsequent legalization. 
class PrepareTFPass : public PassWrapper { public: - explicit PrepareTFPass() : unfold_batch_matmul_(true) {} - explicit PrepareTFPass(bool unfold_batch_matmul) - : unfold_batch_matmul_(unfold_batch_matmul) {} + PrepareTFPass() = default; + PrepareTFPass(const PrepareTFPass &) {} + explicit PrepareTFPass(bool unfold_batch_matmul) { + unfold_batch_matmul_ = unfold_batch_matmul; + } void runOnFunction() override; void getDependentDialects(DialectRegistry ®istry) const override { @@ -91,7 +93,10 @@ class PrepareTFPass : public PassWrapper { } private: - bool unfold_batch_matmul_; + Option unfold_batch_matmul_{ + *this, "tfl-unfold-batch-matmul", + llvm::cl::desc("Unfold BatchMatMul into individual MatMul ops."), + llvm::cl::init(true)}; }; template @@ -210,9 +215,8 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp } // Use the min/max from the operands and the num_bits and narrow_range // attribute to create the quantization parameter for the new quantize op. - rewriter.setInsertionPointAfter(tf_op); - IntegerAttr num_bits = - rewriter.getI64IntegerAttr(tf_op.num_bits().getSExtValue()); + rewriter.setInsertionPointAfter(tf_op.getOperation()); + IntegerAttr num_bits = rewriter.getI64IntegerAttr(tf_op.num_bits()); BoolAttr narrow_range = rewriter.getBoolAttr(tf_op.narrow_range()); Type res_type = tf_op.getType(); TypeAttr qtype = quant::GetQuantizedTypeAttr( @@ -533,8 +537,8 @@ struct ConvertTFStridedSlice : public RewritePattern { loc, new_output_type, original_input, shape); // Replace the original strided_slice. - llvm::APInt new_begin_mask = strided_slice_op.begin_mask(); - llvm::APInt new_end_mask = strided_slice_op.end_mask(); + uint64_t new_begin_mask = strided_slice_op.begin_mask(); + uint64_t new_end_mask = strided_slice_op.end_mask(); // Since we expand the dims, we need to apply them to the begin_mask & // end_mask. new_begin_mask |= strided_slice_op.new_axis_mask(); @@ -597,8 +601,8 @@ struct ConvertTFStridedSlice : public RewritePattern { const int ellipsis_filled_dim_size = input_size - begin_shape[0] + 1; - int64_t begin_mask = strided_slice_op.begin_mask().getSExtValue(); - int64_t end_mask = strided_slice_op.end_mask().getSExtValue(); + int64_t begin_mask = strided_slice_op.begin_mask(); + int64_t end_mask = strided_slice_op.end_mask(); int64_t new_begin_mask = 0; int64_t new_end_mask = 0; @@ -634,13 +638,16 @@ struct ConvertTFStridedSlice : public RewritePattern { ++index; // After the ellipsis. - for (; index < begin_shape[0]; ++index) { + for (; index < begin_shape[0];) { padded_begin.push_back(begin_dense_elem_attr.getValue(index)); padded_end.push_back(end_dense_elem_attr.getValue(index)); padded_stride.push_back(stride_dense_elem_attr.getValue(index)); if ((begin_mask >> index) & 1) new_begin_mask |= (1 << new_index); if ((end_mask >> index) & 1) new_end_mask |= (1 << new_index); + + ++index; + ++new_index; } auto attribute_type = rewriter.getIntegerType(64); @@ -676,16 +683,16 @@ struct ConvertTFStridedSlice : public RewritePattern { // TODO(renjieliu): Consider expand the transformation for shrink mask as // well. - if (strided_slice_op.shrink_axis_mask().getZExtValue()) return failure(); + if (strided_slice_op.shrink_axis_mask()) return failure(); // Handle new axis mask. - uint64_t new_axis_mask = strided_slice_op.new_axis_mask().getZExtValue(); + uint64_t new_axis_mask = strided_slice_op.new_axis_mask(); if (new_axis_mask != 0) { return RewriteNewAxisMask(strided_slice_op, new_axis_mask, rewriter); } // Handle ellipsis mask. 
- uint64_t ellipsis_mask = strided_slice_op.ellipsis_mask().getZExtValue(); + uint64_t ellipsis_mask = strided_slice_op.ellipsis_mask(); if (ellipsis_mask != 0) { return RewriteEllipsisMask(strided_slice_op, ellipsis_mask, rewriter); } @@ -733,6 +740,278 @@ struct ConvertTFBroadcastTo : public RewritePattern { } }; +// The below pattern is equivalent to the DRR rule below. +// The checks are dependent on generated values, so we can't add +// the checks on intermediate values; ideally we should find equivalent +// checks that guarantee the resultant ops are valid. +// The extra conditions are the broadcasting conditions. +// +// The pattern lowers FusedBatchNormV3 to arithmetic ops. +// Specifically, it performs the following calculation: +// +// (x - mean) * scale / sqrt(variance + epsilon) + offset +// +// Let multiplier = scale / sqrt(variance + epsilon), +// to compute +// (x - mean) * scale / sqrt(variance + epsilon) + offset, +// is then to compute +// (x * multiplier) + (offset - mean * multiplier). +// +// def : Pattern< +// (TF_FusedBatchNormV3Op:$root +// $x, $scale, $offset, $mean, $variance, +// F32Attr:$epsilon, $exponential_avg_factor, +// $data_format, FalseBoolAttr:$is_training), +// [(TF_AddOp +// (TF_MulOp +// $x, +// (TF_MulOp:$multiplier +// $scale, +// (TF_RsqrtOp +// (TF_AddOp $variance, +// (TF_ConstOp $epsilon))))), +// (TF_SubOp $offset, (TF_MulOp $mean, $multiplier))), +// // We already guaranteed that the last five results have no use so it does +// // not matter what value we provide here for replacement. +// /*batch_mean=*/(replaceWithValue $x), +// /*batch_variance=*/(replaceWithValue $x), +// /*reserve_space_1=*/(replaceWithValue $x), +// /*reserve_space_2=*/(replaceWithValue $x), +// /*reserve_space_3=*/(replaceWithValue $x)], +// [(HasNoUseOf:$root__1), (HasNoUseOf:$root__2), +// (HasNoUseOf:$root__3), (HasNoUseOf:$root__4), +// (HasNoUseOf:$root__5), (AreBroadcastableTypes $multiplier, $x)]>; + +struct FusedBatchNormV3Pat : public ::mlir::RewritePattern { + explicit FusedBatchNormV3Pat(::mlir::MLIRContext *context) + : ::mlir::RewritePattern( + "tf.FusedBatchNormV3", + {"tf.Add", "tf.Const", "tf.Mul", "tf.Rsqrt", "tf.Sub"}, 1, + context) {} + + ::mlir::LogicalResult matchAndRewrite( + ::mlir::Operation *fused_batch_norm, + ::mlir::PatternRewriter &rewriter) const override { + // Variables for capturing values and attributes used for creating ops + Operation::operand_range mean(fused_batch_norm->getOperands()); + ::mlir::FloatAttr exponential_avg_factor; + ::mlir::StringAttr data_format; + ::mlir::TF::FusedBatchNormV3Op root; + Operation::operand_range offset(fused_batch_norm->getOperands()); + Operation::operand_range x(fused_batch_norm->getOperands()); + Operation::operand_range scale(fused_batch_norm->getOperands()); + Operation::operand_range variance(fused_batch_norm->getOperands()); + ::mlir::FloatAttr epsilon; + ::mlir::BoolAttr is_training; + + // Match + auto fused_batch_norm_op = + dyn_cast_or_null<::mlir::TF::FusedBatchNormV3Op>(fused_batch_norm); + root = fused_batch_norm_op; + x = fused_batch_norm_op.getODSOperands(0); + scale = fused_batch_norm_op.getODSOperands(1); + offset = fused_batch_norm_op.getODSOperands(2); + mean = fused_batch_norm_op.getODSOperands(3); + variance = fused_batch_norm_op.getODSOperands(4); + { + epsilon = fused_batch_norm_op.getAttrOfType<::mlir::FloatAttr>("epsilon"); + if (!epsilon) + epsilon = rewriter.getFloatAttr(rewriter.getF32Type(), 0.0001f); + + if (!(((epsilon.isa<::mlir::FloatAttr>())) &&
((epsilon.cast<::mlir::FloatAttr>().getType().isF32())))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "op 'tf.FusedBatchNormV3' attribute 'epsilon' failed to " + "satisfy constraint: 32-bit float attribute"; + }); + } + } + { + exponential_avg_factor = + fused_batch_norm_op.getAttrOfType<::mlir::FloatAttr>( + "exponential_avg_factor"); + if (!exponential_avg_factor) + exponential_avg_factor = + rewriter.getFloatAttr(rewriter.getF32Type(), 1.0f); + } + { + data_format = + fused_batch_norm_op.getAttrOfType<::mlir::StringAttr>("data_format"); + if (!data_format) data_format = rewriter.getStringAttr("NHWC"); + } + { + is_training = + fused_batch_norm_op.getAttrOfType<::mlir::BoolAttr>("is_training"); + if (!is_training) is_training = rewriter.getBoolAttr(true); + + if (!((!is_training.getValue()))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "op 'tf.FusedBatchNormV3' attribute 'is_training' failed " + "to " + "satisfy constraint: FalseBoolAttr"; + }); + } + } + + if (!(((*root.getODSResults(1).begin()).use_empty()))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "entities '' failed to satisfy constraint: has no use"; + }); + } + + if (!(((*root.getODSResults(2).begin()).use_empty()))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "entities '' failed to satisfy constraint: has no use"; + }); + } + + if (!(((*root.getODSResults(3).begin()).use_empty()))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "entities '' failed to satisfy constraint: has no use"; + }); + } + + if (!(((*root.getODSResults(4).begin()).use_empty()))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "entities '' failed to satisfy constraint: has no use"; + }); + } + + if (!(((*root.getODSResults(5).begin()).use_empty()))) { + return rewriter.notifyMatchFailure( + fused_batch_norm_op, [&](::mlir::Diagnostic &diag) { + diag << "entities '' failed to satisfy constraint: has no use"; + }); + } + // Rewrite + auto odsLoc = rewriter.getFusedLoc({fused_batch_norm->getLoc()}); + ::llvm::SmallVector<::mlir::Value, 4> replace_values; + ::mlir::TF::ConstOp epsilon_const_op; + { + epsilon_const_op = + rewriter.create<::mlir::TF::ConstOp>(odsLoc, + /*value=*/epsilon); + } + ::mlir::TF::AddOp add_op_1; + { + ::mlir::Value tblgen_value_0 = (*variance.begin()); + ::mlir::Value tblgen_value_1 = + (*epsilon_const_op.getODSResults(0).begin()); + add_op_1 = rewriter.create<::mlir::TF::AddOp>(odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); + // We need to make sure the Add operands are broadcastable. + if (mlir::OpTrait::impl::verifyCompatibleOperandBroadcast(add_op_1) + .value == LogicalResult::Failure) { + return failure(); + } + } + ::mlir::TF::RsqrtOp rsqrt_op; + { + ::mlir::SmallVector<::mlir::Value, 4> tblgen_values; + ::mlir::SmallVector<::mlir::NamedAttribute, 4> tblgen_attrs; + tblgen_values.push_back((*add_op_1.getODSResults(0).begin())); + rsqrt_op = rewriter.create<::mlir::TF::RsqrtOp>(odsLoc, tblgen_values, + tblgen_attrs); + } + ::mlir::TF::MulOp multiplier; + { + ::mlir::Value tblgen_value_0 = (*scale.begin()); + ::mlir::Value tblgen_value_1 = (*rsqrt_op.getODSResults(0).begin()); + // We need to make sure the Add operands are broadcastable. 
+ multiplier = rewriter.create<::mlir::TF::MulOp>(odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); + if (mlir::OpTrait::impl::verifyCompatibleOperandBroadcast(multiplier) + .value == LogicalResult::Failure) { + return failure(); + } + } + ::mlir::TF::MulOp mul_op_1; + { + ::mlir::Value tblgen_value_0 = (*x.begin()); + ::mlir::Value tblgen_value_1 = (*multiplier.getODSResults(0).begin()); + mul_op_1 = rewriter.create<::mlir::TF::MulOp>(odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); + // We need to make sure the Mul operands are broadcastable. + if (mlir::OpTrait::impl::verifyCompatibleOperandBroadcast(mul_op_1) + .value == LogicalResult::Failure) { + return failure(); + } + } + ::mlir::TF::MulOp mul_op_2; + { + ::mlir::Value tblgen_value_0 = (*mean.begin()); + ::mlir::Value tblgen_value_1 = (*multiplier.getODSResults(0).begin()); + mul_op_2 = rewriter.create<::mlir::TF::MulOp>(odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); + if (mlir::OpTrait::impl::verifyCompatibleOperandBroadcast(mul_op_2) + .value == LogicalResult::Failure) { + return failure(); + } + } + ::mlir::TF::SubOp sub_op; + { + ::mlir::Value tblgen_value_0 = (*offset.begin()); + ::mlir::Value tblgen_value_1 = (*mul_op_2.getODSResults(0).begin()); + sub_op = rewriter.create<::mlir::TF::SubOp>(odsLoc, + /*x=*/tblgen_value_0, + /*y=*/tblgen_value_1); + if (mlir::OpTrait::impl::verifyCompatibleOperandBroadcast(sub_op).value == + LogicalResult::Failure) { + return failure(); + } + } + ::mlir::TF::AddOp add_op_2; + { + ::mlir::SmallVector<::mlir::Value, 4> tblgen_values; + ::mlir::SmallVector<::mlir::NamedAttribute, 4> tblgen_attrs; + tblgen_values.push_back((*mul_op_1.getODSResults(0).begin())); + tblgen_values.push_back((*sub_op.getODSResults(0).begin())); + ::mlir::SmallVector<::mlir::Type, 4> tblgen_types; + for (auto v : fused_batch_norm_op.getODSResults(0)) { + tblgen_types.push_back(v.getType()); + } + add_op_2 = rewriter.create<::mlir::TF::AddOp>( + odsLoc, tblgen_types, tblgen_values, tblgen_attrs); + // We need to make sure the Add operands are broadcastable. 
+ if (mlir::OpTrait::impl::verifyCompatibleOperandBroadcast(add_op_2) + .value == LogicalResult::Failure) { + return failure(); + } + } + for (auto v : + ::llvm::SmallVector<::mlir::Value, 4>{add_op_2.getODSResults(0)}) { + replace_values.push_back(v); + } + for (auto v : ::llvm::SmallVector<::mlir::Value, 4>{x}) { + replace_values.push_back(v); + } + for (auto v : ::llvm::SmallVector<::mlir::Value, 4>{x}) { + replace_values.push_back(v); + } + for (auto v : ::llvm::SmallVector<::mlir::Value, 4>{x}) { + replace_values.push_back(v); + } + for (auto v : ::llvm::SmallVector<::mlir::Value, 4>{x}) { + replace_values.push_back(v); + } + for (auto v : ::llvm::SmallVector<::mlir::Value, 4>{x}) { + replace_values.push_back(v); + } + rewriter.replaceOp(fused_batch_norm, replace_values); + return success(); + }; +}; + #include "tensorflow/compiler/mlir/lite/transforms/generated_prepare_tf.inc" // Returns success if all the operations in the `op`'s regions including `op` @@ -758,10 +1037,13 @@ LogicalResult ConvertTf2XlaOps(FuncOp func, MLIRContext *context) { target.addLegalOp(); target.addLegalOp(); target.addIllegalOp(); + target.addIllegalOp(); OwningRewritePatternList patterns; mhlo::PopulateLegalizeTfWithTf2XlaPatterns("XLA_CPU_JIT", patterns); + mhlo::PopulateLegalizeTfPatterns(context, &patterns); TF::PopulateLegalizeHloToTfPatterns(&patterns, context); + mhlo::GatherOp::getCanonicalizationPatterns(patterns, context); return applyPartialConversion(func, target, patterns); } @@ -892,9 +1174,10 @@ void PrepareTFPass::runOnFunction() { // This pattern will try to identify and optimize for dilated convolution. // e.g. Patterns like "SpaceToBatchND -> Conv2D -> BatchToSpaceND" will be // replaced with a single Conv op with dilation parameter. - patterns.insert, + patterns.insert, FusedBatchNormV3Pat, ConvertTFDilatedConvOp>(ctx); - TFL::populateWithGenerated(ctx, &patterns); + + TFL::populateWithGenerated(ctx, patterns); // TODO(karimnosseir): Split to separate pass probably after // deciding on long term plan for this optimization. // This will allow optimizing any TF_Mul->TF_Conv in the graph @@ -905,7 +1188,7 @@ void PrepareTFPass::runOnFunction() { // Load the generated pattern again, so new quantization pass-through // will be applied. patterns.clear(); - TFL::populateWithGenerated(ctx, &patterns); + TFL::populateWithGenerated(ctx, patterns); if (unfold_batch_matmul_) { patterns.insert, TF::ConvertTFBatchMatMulOp>(ctx); diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index ba25b5c897c..529e57780c3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -84,7 +84,7 @@ void QuantizePass::runOnFunction() { OwningRewritePatternList patterns; auto func = getFunction(); auto* ctx = func.getContext(); - TFL::populateWithGenerated(ctx, &patterns); + TFL::populateWithGenerated(ctx, patterns); patterns.insert( ctx, enable_numeric_verify, error_tolerance, enable_single_layer_verify); applyPatternsAndFoldGreedily(func, patterns); diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.cc b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc new file mode 100644 index 00000000000..e462d4f38b0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.cc @@ -0,0 +1,174 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/nms_utils.h" + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { + +namespace { + +// TODO(b/162842801): Consolidate all util definitions of kTFImplements. +constexpr char kTFImplements[] = "tf._implements"; +constexpr char kCustomSSDPostprocessing[] = "TFLite_Detection_PostProcess"; +constexpr char kTfNMSPadded[] = "non_max_suppression_padded_v2"; + +inline OpaqueElementsAttr CustomOption(OpBuilder* builder, + const std::string& content) { + ShapedType type = RankedTensorType::get( + {static_cast(content.size())}, builder->getIntegerType(8)); + return OpaqueElementsAttr::get(builder->getContext()->getLoadedDialect("tfl"), + type, + StringRef(content.data(), content.size())); +} + +} // namespace + +void ConvertNMSPaddedFunc::RewriteFunc() { + func_.setAttr(kTFImplements, + StringAttr::get(kTfNMSPadded, func_.getContext())); + Value boxes = func_.getArgument(0); + Value scores = func_.getArgument(1); + Value max_output_size = func_.getArgument(2); + Value iou_threshold = func_.getArgument(3); + Value score_threshold = func_.getArgument(4); + auto output_type0 = func_.getType().getResult(0); + auto output_type1 = func_.getType().getResult(1); + + OpBuilder builder(func_.getBody()); + auto op = builder.create( + func_.getLoc(), output_type0, output_type1, boxes, scores, + max_output_size, iou_threshold, score_threshold); + + builder.create(func_.getLoc(), op.getResults()); +} + +LogicalResult ConvertNMSPaddedFunc::VerifySignature() { + // Verify high-level function signature. + // Relevant argument characteristics are checked by the TFL op definition. + if (func_.getNumArguments() < 5) { + return func_.emitError() + << "Invalid number of arguments to " + "non_max_suppression_padded_v2 (need at least 5): " + << func_.getNumArguments(); + } + if (func_.getType().getNumResults() != 2) { + return func_.emitError() << "Invalid number of results from " + "non_max_suppression_padded_v2 (need 2): " + << func_.getType().getNumResults(); + } + // The TFLite fused op does not support batching yet. + // TODO(b/158709815): Add support for batches with padded NMS.
+ auto boxes_type = func_.getArgument(0).getType().dyn_cast(); + if (!boxes_type.hasRank() || boxes_type.getRank() != 2) { + return func_.emitError() << "TFLite does not support batched input for " + "non_max_suppression_padded"; + } + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::RewriteFunc() { + func_.eraseBody(); + func_.addEntryBlock(); + func_.setAttr(kTFImplements, + StringAttr::get(kCustomSSDPostprocessing, func_.getContext())); + + OpBuilder builder(func_.getBody()); + std::string custom_option_buffer; + if (failed(CreateNMSCustomOptions(func_, attr_.GetAttrs(), + custom_option_buffer))) { + return failure(); + } + auto op = builder.create( + func_.getLoc(), func_.getType().getResults(), func_.getArguments(), + kCustomSSDPostprocessing, CustomOption(&builder, custom_option_buffer)); + builder.create(func_.getLoc(), op.getResults()); + + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::CreateNMSCustomOptions( + FuncOp func, DictionaryAttr attrs, std::string& custom_option_buffer) { + flexbuffers::Builder fbb; + size_t start_map = fbb.StartMap(); + + if (failed(AddIntAttr(func, attrs, "max_detections", &fbb)) || + failed(AddIntAttr(func, attrs, "max_classes_per_detection", &fbb)) || + failed(AddIntAttr(func, attrs, "num_classes", &fbb)) || + failed(AddFloatAttr(func, attrs, "nms_score_threshold", &fbb)) || + failed(AddFloatAttr(func, attrs, "nms_iou_threshold", &fbb)) || + failed(AddFloatAttr(func, attrs, "y_scale", &fbb)) || + failed(AddFloatAttr(func, attrs, "x_scale", &fbb)) || + failed(AddFloatAttr(func, attrs, "h_scale", &fbb)) || + failed(AddFloatAttr(func, attrs, "w_scale", &fbb))) + return failure(); + auto use_regular_nms = + attrs.get("use_regular_nms").dyn_cast_or_null(); + if (!use_regular_nms) { + return func.emitError() + << "use_regular_nms attribute is not set or not a bool"; + } + fbb.Int("use_regular_nms", use_regular_nms.getValue()); + + fbb.EndMap(start_map); + fbb.Finish(); + custom_option_buffer.assign(fbb.GetBuffer().begin(), fbb.GetBuffer().end()); + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::AddIntAttr( + FuncOp func, DictionaryAttr attrs, const std::string& attribute, + flexbuffers::Builder* builder) { + auto int_attr = attrs.get(attribute).dyn_cast_or_null(); + if (!int_attr) { + return func.emitError() + << attribute.c_str() << " attribute is not set or not an integer"; + } + builder->Int(attribute.c_str(), int_attr.getInt()); + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::AddFloatAttr( + FuncOp func, DictionaryAttr attrs, const std::string& attribute, + flexbuffers::Builder* builder) { + auto float_attr = attrs.get(attribute).dyn_cast_or_null(); + if (!float_attr) { + return func.emitError() + << attribute.c_str() << " attribute is not set or not a float"; + } + builder->Float(attribute.c_str(), float_attr.getValue().convertToFloat()); + return success(); +} + +LogicalResult ConvertSSDPostProcessFunc::VerifySignature() { + // Verify high-level function signature. 
+ if (func_.getNumArguments() != 3) { + return func_.emitError() + << "Invalid number of arguments to " << kCustomSSDPostprocessing + << ": " << func_.getNumArguments(); + } + if (func_.getType().getNumResults() != 4) { + return func_.emitError() + << "Invalid number of results from " << kCustomSSDPostprocessing + << ": " << func_.getType().getNumResults(); + } + return success(); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/nms_utils.h b/tensorflow/compiler/mlir/lite/utils/nms_utils.h new file mode 100644 index 00000000000..6a9035e0c81 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/nms_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with NMS ops in TFLite. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ + +#include + +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" + +namespace mlir { +namespace TFL { + +// Abstracts the conversion of the padded NMS composite function. +class ConvertNMSPaddedFunc { + public: + explicit ConvertNMSPaddedFunc(FuncOp func) : func_(func) {} + + void RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + FuncOp func_; +}; + +// Abstracts the conversion of the SSD post-processing composite function to +// TFLite. +class ConvertSSDPostProcessFunc { + public: + explicit ConvertSSDPostProcessFunc(FuncOp func, mlir::TF::FuncAttr attr) + : func_(func), attr_(attr) {} + + LogicalResult RewriteFunc(); + + LogicalResult VerifySignature(); + + private: + LogicalResult CreateNMSCustomOptions(FuncOp func, DictionaryAttr attrs, + std::string& custom_option_buffer); + + LogicalResult AddIntAttr(FuncOp func, DictionaryAttr attrs, + const std::string& attribute, + flexbuffers::Builder* builder); + + LogicalResult AddFloatAttr(FuncOp func, DictionaryAttr attrs, + const std::string& attribute, + flexbuffers::Builder* builder); + + FuncOp func_; + mlir::TF::FuncAttr attr_; +}; + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_NMS_UTILS_H_ diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index bce0ed4a33d..6b605741355 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -28,6 +28,7 @@ limitations under the License.
#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "tensorflow/compiler/mlir/utils/name_utils.h" static inline absl::string_view StringRefToView(llvm::StringRef ref) { return absl::string_view(ref.data(), ref.size()); @@ -103,62 +104,16 @@ int OpOrArgNameMapper::InitOpName(OpOrVal op_or_val, llvm::StringRef name) { bool OpOrArgNameMapper::IsUnique(llvm::StringRef name) { return true; } -namespace { -// Derives name from location. -std::string GetNameFromLoc(mlir::Location loc) { - llvm::SmallVector loc_names; - llvm::SmallVector locs; - locs.push_back(loc); - bool names_is_nonempty = false; - - while (!locs.empty()) { - mlir::Location curr_loc = locs.pop_back_val(); - - if (auto name_loc = curr_loc.dyn_cast()) { - // Add name in NameLoc. For NameLoc we also account for names due to ops - // in functions where the op's name is first. - auto name = name_loc.getName().strref().split('@').first; - loc_names.push_back(name); - if (!name.empty()) names_is_nonempty = true; - continue; - } else if (auto call_loc = curr_loc.dyn_cast()) { - // Add name if CallSiteLoc's callee has a NameLoc (as should be the - // case if imported with DebugInfo). - if (auto name_loc = call_loc.getCallee().dyn_cast()) { - auto name = name_loc.getName().strref().split('@').first; - loc_names.push_back(name); - if (!name.empty()) names_is_nonempty = true; - continue; - } - } else if (auto fused_loc = curr_loc.dyn_cast()) { - // Push all locations in FusedLoc in reverse order, so locations are - // visited based on order in FusedLoc. - auto reversed_fused_locs = llvm::reverse(fused_loc.getLocations()); - locs.append(reversed_fused_locs.begin(), reversed_fused_locs.end()); - continue; - } - - // Location is not a supported, so an empty StringRef is added. - loc_names.push_back(llvm::StringRef()); - } - - if (names_is_nonempty) - return llvm::join(loc_names.begin(), loc_names.end(), ";"); - - return ""; -} -} // anonymous namespace - std::string OpOrArgLocNameMapper::GetName(OpOrVal op_or_val) { if (auto* op = op_or_val.dyn_cast()) { - auto name_from_loc = GetNameFromLoc(op->getLoc()); + auto name_from_loc = mlir::GetNameFromLoc(op->getLoc()); if (!name_from_loc.empty()) return name_from_loc; // If the location is none of the expected types, then simply use name // generated using the op type. return std::string(op->getName().getStringRef()); } auto val = op_or_val.dyn_cast(); - auto name_from_loc = GetNameFromLoc(val.getLoc()); + auto name_from_loc = mlir::GetNameFromLoc(val.getLoc()); if (!name_from_loc.empty()) return name_from_loc; // If the location is none of the expected types, then simply use name // generated using the op type. 
Follow TF convention and append the result diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 5bbfba773a3..502695acd40 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -1,3 +1,6 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -10,6 +13,7 @@ cc_library( deps = [ "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_helper", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", @@ -35,6 +39,9 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index f1f6c43d3b3..066726593a7 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -16,19 +16,53 @@ limitations under the License. #include #include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Parser.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/op.h" namespace tensorflow { +namespace { + +// Runs pass pipeline `pass_pipeline` on `module` if `pass_pipeline` is not +// empty. 
+std::string RunPassPipelineOnModule(mlir::ModuleOp module, + const std::string &pass_pipeline, + TF_Status *status) { + if (!pass_pipeline.empty()) { + mlir::PassManager pm(module.getContext()); + std::string error; + llvm::raw_string_ostream error_stream(error); + if (failed(mlir::parsePassPipeline(pass_pipeline, pm, error_stream))) { + TF_SetStatus(status, TF_INVALID_ARGUMENT, + ("Invalid pass_pipeline: " + error_stream.str()).c_str()); + return "// error"; + } + + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext()); + if (failed(pm.run(module))) { + Set_TF_Status_from_Status(status, statusHandler.ConsumeStatus()); + return "// error"; + } + } + return MlirModuleToString(module); +} + +} // anonymous namespace + std::string ImportGraphDef(const std::string &proto, const std::string &pass_pipeline, TF_Status *status) { @@ -41,31 +75,49 @@ std::string ImportGraphDef(const std::string &proto, GraphDebugInfo debug_info; GraphImportConfig specs; mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); auto module = ConvertGraphdefToMlir(graphdef, debug_info, specs, &context); if (!module.ok()) { Set_TF_Status_from_Status(status, module.status()); return "// error"; } - // Run the pass_pipeline on the module if not empty. - if (!pass_pipeline.empty()) { - mlir::PassManager pm(&context); - std::string error; - llvm::raw_string_ostream error_stream(error); - if (failed(mlir::parsePassPipeline(pass_pipeline, pm, error_stream))) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - ("Invalid pass_pipeline: " + error_stream.str()).c_str()); - return "// error"; - } + return RunPassPipelineOnModule(module->get(), pass_pipeline, status); +} - mlir::StatusScopedDiagnosticHandler statusHandler(&context); - if (failed(pm.run(*module.ValueOrDie()))) { - Set_TF_Status_from_Status(status, statusHandler.ConsumeStatus()); - return "// error"; - } +std::string ImportFunction(const std::string &functiondef_proto, + const std::string &functiondef_library_proto, + const std::string &pass_pipeline, + TF_Status *status) { + FunctionDef functiondef; + auto s = tensorflow::LoadProtoFromBuffer(functiondef_proto, &functiondef); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; } - return MlirModuleToString(*module.ConsumeValueOrDie()); + + FunctionDefLibrary fdef_lib; + s = tensorflow::LoadProtoFromBuffer(functiondef_library_proto, &fdef_lib); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; + } + + FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib); + s = flib_def.AddFunctionDef(functiondef); + if (!s.ok()) { + Set_TF_Status_from_Status(status, s); + return "// error"; + } + + const std::string &function_name = functiondef.signature().name(); + mlir::MLIRContext context; + auto module = ConvertFunctionToMlir(function_name, flib_def, &context); + if (!module.ok()) { + Set_TF_Status_from_Status(status, module.status()); + return "// error"; + } + + return RunPassPipelineOnModule(module->get(), pass_pipeline, status); } std::string ExperimentalConvertSavedModelToMlir( @@ -86,7 +138,6 @@ std::string ExperimentalConvertSavedModelToMlir( std::vector exported_names = absl::StrSplit(exported_names_str, ',', absl::SkipEmpty()); mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); auto module_or = ConvertSavedModelToMlir( &bundle, &context, absl::Span(exported_names)); if (!module_or.status().ok()) { @@ -117,7 +168,6 @@ std::string ExperimentalConvertSavedModelV1ToMlir( // Convert the SavedModelBundle to 
an MLIR module. mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); auto module_or = ConvertSavedModelV1ToMlir(bundle, {}, &context, upgrade_legacy); if (!module_or.status().ok()) { @@ -153,6 +203,7 @@ std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, bool show_debug_info, TF_Status *status) { mlir::MLIRContext context; + mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); mlir::OwningModuleRef module; { mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context); @@ -167,6 +218,7 @@ std::string ExperimentalRunPassPipeline(const std::string &mlir_txt, mlir::PassManager pm(&context); std::string error; llvm::raw_string_ostream error_stream(error); + mlir::registerAllPasses(); if (failed(mlir::parsePassPipeline(pass_pipeline, pm, error_stream))) { TF_SetStatus(status, TF_INVALID_ARGUMENT, ("Invalid pass_pipeline: " + error_stream.str()).c_str()); diff --git a/tensorflow/compiler/mlir/python/mlir.h b/tensorflow/compiler/mlir/python/mlir.h index e68ac28124b..6133068a5e8 100644 --- a/tensorflow/compiler/mlir/python/mlir.h +++ b/tensorflow/compiler/mlir/python/mlir.h @@ -25,13 +25,23 @@ limitations under the License. namespace tensorflow { // Simple wrapper to support tf.mlir.experimental.convert_graph_def. -// Load a .pbptx, convert to MLIR, and (optionally) optimize the module before -// returning it as a string. +// Load a GraphDef (binary or textual proto format), convert to MLIR, and +// (optionally) optimize the module before returning it as a string. // This is an early experimental API, ideally we should return a wrapper object // around a Python binding to the MLIR module. std::string ImportGraphDef(const std::string &proto, const std::string &pass_pipeline, TF_Status *status); +// Simple wrapper to support tf.mlir.experimental.convert_function. +// Load FunctionDef and FunctionDefLibrary (binary or textual proto format), +// convert to MLIR, and (optionally) optimize the module before returning it as +// a string. +// This is an early experimental API, ideally we should return a wrapper object +// around a Python binding to the MLIR module. +std::string ImportFunction(const std::string &functiondef_proto, + const std::string &functiondef_library_proto, + const std::string &pass_pipeline, TF_Status *status); + // Load a SavedModel and return a textual MLIR string corresponding to it. 
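The wrappers declared above report failures through the `TF_Status` out-parameter and return the module as a string (`// error` on failure). A minimal sketch of a C++ caller of the new `ImportFunction` entry point; the helper name is an illustrative assumption, and the empty pass pipeline simply returns the module exactly as imported:

```cpp
#include <string>

#include "tensorflow/c/tf_status.h"
#include "tensorflow/compiler/mlir/python/mlir.h"

// Convert a serialized FunctionDef (plus its FunctionDefLibrary) to textual
// MLIR. An empty pass_pipeline skips the optional optimization step.
std::string FunctionDefToMlir(const std::string &functiondef_proto,
                              const std::string &fdef_library_proto) {
  TF_Status *status = TF_NewStatus();
  std::string mlir_txt = tensorflow::ImportFunction(
      functiondef_proto, fdef_library_proto, /*pass_pipeline=*/"", status);
  if (TF_GetCode(status) != TF_OK) mlir_txt = TF_Message(status);
  TF_DeleteStatus(status);
  return mlir_txt;
}
```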
// // Args: diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD index 5e21dddd444..47bff366311 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD @@ -20,6 +20,7 @@ tf_python_pybind_extension( "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/python:pybind11_lib", "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:FileCheckLib", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", @@ -36,6 +37,7 @@ tf_python_pybind_extension( deps = [ "//tensorflow/python:pybind11_lib", "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:FileCheckLib", "@llvm-project//llvm:Support", "@pybind11", ], diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc index 25adb44fe1d..5ae638851f4 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/basic_classes.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Support/FileCheck.h" +#include "llvm/FileCheck/FileCheck.h" #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc index 8a841856b72..051952ebaba 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/filecheck_wrapper.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Support/FileCheck.h" +#include "llvm/FileCheck/FileCheck.h" #include "llvm/Support/SourceMgr.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc index 4152b576e71..6cd49cf368d 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/mlir_wrapper.cc @@ -22,23 +22,25 @@ limitations under the License. 
#include "mlir/Parser.h" // from @llvm-project #include "pybind11/pybind11.h" #include "pybind11/stl.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/python/lib/core/pybind11_lib.h" #include "tensorflow/python/lib/core/pybind11_status.h" PYBIND11_MODULE(mlir_wrapper, m) { - m.def("registerDialects", []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); + m.def("preloadTensorFlowDialects", [](mlir::MLIRContext &context) { + mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); + context.getDialectRegistry().loadAll(&context); }); + m.def("verify", [](std::string input) { llvm::SourceMgr SM = llvm::SourceMgr(); SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), llvm::SMLoc()); mlir::MLIRContext ctx; - ctx.loadAllGloballyRegisteredDialects(); + mlir::RegisterAllTensorFlowDialects(ctx.getDialectRegistry()); + ctx.getDialectRegistry().loadAll(&ctx); auto module = mlir::parseSourceFile(SM, &ctx); if (!module) { return false; diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index f9870183b88..17410b4e5b2 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -73,8 +73,9 @@ tool_names = [ 'mlir-opt', 'mlir-hlo-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile', - 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt', 'hlo_to_llvm_ir', - 'kernel-gen-opt', 'xla-thunks-opt', 'tfjs-opt' + 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-mlir-gpu-opt', 'xla-opt', + 'hlo_to_llvm_ir', 'kernel-gen-opt', 'tf_to_kernel', 'tf_to_gpu_binary', + 'xla-thunks-opt', 'tfjs-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index f9cdc40a901..1c740731acd 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1,5 +1,11 @@ +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//third_party/mlir:tblgen.bzl", "gentbl") -load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_gen_op_wrapper_py", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_gen_op_wrapper_py") package( default_visibility = [":friends"], @@ -13,6 +19,7 @@ package_group( "//learning/brain/experimental/dtensor/...", "//learning/brain/experimental/tfrt/...", "//learning/pathways/data_parallel/tf2xla/...", + "//platforms/xla/sparse_core/...", "//tensorflow/compiler/...", "//tensorflow/lite/experimental/tf_runtime/...", "//tensorflow/python/...", @@ -33,6 +40,7 @@ filegroup( "ir/tf_op_base.td", "ir/tf_op_interfaces.td", "ir/tf_ops.td", + "ir/tfrt_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", @@ -43,6 +51,7 @@ filegroup( gentbl( name = "tensorflow_op_interfaces_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( 
"-gen-op-interface-decls", @@ -63,6 +72,7 @@ gentbl( gentbl( name = "tensorflow_struct_doc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-dialect-doc", @@ -100,6 +110,8 @@ cc_library( deps = [ ":tensorflow_op_interfaces_inc_gen", ":tensorflow_structs", + "//tensorflow/core:framework", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", ], @@ -107,6 +119,7 @@ cc_library( gentbl( name = "tensorflow_all_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls", @@ -124,6 +137,26 @@ gentbl( ], ) +gentbl( + name = "tensorflow_tfrt_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [ + ( + "-gen-op-decls", + "ir/tfrt_ops.h.inc", + ), + ( + "-gen-op-defs", + "ir/tfrt_ops.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/tfrt_ops.td", + td_srcs = [ + ":tensorflow_ops_td_files", + ], +) + # We only shard tf_op on name for build performance reasons. tf_ops_category_list = [ { @@ -139,6 +172,7 @@ tf_ops_category_list = [ [[ gentbl( name = "tensorflow_" + target["name"] + "_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls -op-include-regex='" + target["include"] + "'", @@ -159,6 +193,7 @@ tf_ops_category_list = [ gentbl( name = "tensorflow_remaining_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls -op-exclude-regex='" + "|".join([target["include"] for target in tf_ops_category_list]) + "' ", @@ -178,6 +213,7 @@ gentbl( gentbl( name = "tf_saved_model_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls", @@ -204,6 +240,7 @@ gentbl( gentbl( name = "tensorflow_executor_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls", @@ -230,6 +267,7 @@ gentbl( gentbl( name = "tensorflow_device_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls ", @@ -255,6 +293,7 @@ gentbl( gentbl( name = "tensorflow_canonicalize_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -270,6 +309,7 @@ gentbl( gentbl( name = "hlo_legalize_tf_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ("-gen-rewriters", "transforms/generated_legalize_hlo.inc"), ], @@ -343,6 +383,7 @@ cc_library( name = "tensorflow_" + target["name"], srcs = [ "ir/tf_ops.h", + "ir/tfrt_ops.h", "ir/tf_remaining_ops.h", "ir/tf_" + target["name"] + ".cc", "ir/tf_" + target["name"] + ".cc.inc", @@ -352,6 +393,7 @@ cc_library( textual_hdrs = [ "ir/tf_all_ops.h.inc", "ir/tf_ops_helpers.inc", + "ir/tfrt_ops.h.inc", "ir/tf_remaining_ops.h.inc", ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ @@ -386,6 +428,7 @@ cc_library( "ir/tf_ops.h", "ir/tf_remaining_ops.h", "ir/tf_remaining_ops.cc", + "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], hdrs = [ ], @@ -393,6 +436,49 @@ cc_library( "ir/tf_all_ops.h.inc", "ir/tf_ops_helpers.inc", "ir/tf_remaining_ops.h.inc", + "ir/tfrt_ops.h.inc", + ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], + deps = [ + ":tensorflow_attributes", + ":tensorflow_canonicalize_inc_gen", + ":tensorflow_op_interfaces", + ":tensorflow_op_interfaces_inc_gen", + ":tensorflow_remaining_ops_inc_gen", + ":tensorflow_side_effects", + ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", + ":tensorflow_traits", + ":tensorflow_types", + 
"//tensorflow/core:framework", + "//tensorflow/core:lib", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:LoopLikeInterface", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "tensorflow_tfrt_ops", + srcs = [ + "ir/tf_ops.h", + "ir/tfrt_ops.h", + "ir/tfrt_ops.cc", + "ir/tf_remaining_ops.h", + ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], + hdrs = [ + ], + textual_hdrs = [ + "ir/tf_all_ops.h.inc", + "ir/tf_ops_helpers.inc", + "ir/tfrt_ops.h.inc", + "ir/tf_remaining_ops.h.inc", ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ ":tensorflow_attributes", @@ -402,6 +488,7 @@ cc_library( ":tensorflow_remaining_ops_inc_gen", ":tensorflow_side_effects", ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_traits", ":tensorflow_types", "//tensorflow/core:framework", @@ -428,9 +515,11 @@ cc_library( textual_hdrs = [ "ir/tf_all_ops.h.inc", "ir/tf_remaining_ops.h", + "ir/tfrt_ops.h", ] + ["ir/tf_" + target["name"] + ".h" for target in tf_ops_category_list], deps = [ ":tensorflow_all_ops_inc_gen", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_remaining_ops_inc_gen", ":tensorflow_attributes", ":tensorflow_canonicalize_inc_gen", @@ -441,6 +530,7 @@ cc_library( ":tensorflow_traits", ":tensorflow_types", ":tensorflow_remaining_ops", + ":tensorflow_tfrt_ops", "@llvm-project//llvm:Support", "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:Dialect", @@ -538,6 +628,7 @@ cc_library( ":tensorflow_ops", ":tensorflow_side_effects", ":tensorflow_structs", + ":tensorflow_tfrt_ops_inc_gen", ":tensorflow_traits", ":tensorflow_types", ":tf_saved_model_inc_gen", @@ -567,6 +658,7 @@ cc_library( gentbl( name = "decompose_resource_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -599,6 +691,7 @@ cc_library( gentbl( name = "tf_data_optimization_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -719,14 +812,13 @@ cc_library( ], deps = [ ":tensorflow", + ":tensorflow_op_interfaces", ":tensorflow_types", - "//tensorflow/compiler/tf2xla:resource_operation_table", - "//tensorflow/core:framework", - "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffectInterfaces", "@llvm-project//mlir:Support", ], ) @@ -741,6 +833,7 @@ cc_library( "transforms/cluster_formation.cc", "transforms/cluster_outlining.cc", "transforms/collection_ops_util.cc", + "transforms/contraction_fusion.cc", "transforms/decompose_resource_ops_pass.cc", "transforms/device_index_selector.cc", "transforms/einsum.cc", @@ -772,6 +865,8 @@ cc_library( "transforms/replicate_to_island.cc", "transforms/resource_device_inference.cc", "transforms/resource_op_lifting.cc", + "transforms/resource_op_lifting_cleanup.cc", + "transforms/resource_op_lifting_cleanup.h", "transforms/rewrite_tpu_embedding_ops.cc", "transforms/shape_inference.cc", "transforms/shape_inference_pass.cc", @@ -785,7 +880,9 @@ cc_library( "transforms/test_visitor_util.cc", "transforms/tf_data_optimization_pass.cc", "transforms/tf_device_assignment.cc", + 
"transforms/tpu_cluster_cleanup_attributes.cc", "transforms/tpu_cluster_formation.cc", + "transforms/tpu_colocate_composite_resource_ops.cc", "transforms/tpu_dynamic_layout_pass.cc", "transforms/tpu_dynamic_padding_mapper.cc", "transforms/tpu_extract_head_tail_outside_compilation.cc", @@ -794,6 +891,8 @@ cc_library( "transforms/tpu_identity_pruning.cc", "transforms/tpu_merge_variables_with_execute.cc", "transforms/tpu_outside_compilation_cluster.cc", + "transforms/tpu_parallel_execute_sink_resource_write.cc", + "transforms/tpu_resource_read_for_write.cc", "transforms/tpu_rewrite_pass.cc", "transforms/tpu_sharding_identification_pass.cc", "transforms/tpu_space_to_depth_pass.cc", @@ -804,7 +903,6 @@ cc_library( "translate/tf_functional_to_executor.cc", ], hdrs = [ - "transforms/batchmatmul_to_einsum.h", "transforms/bridge.h", "transforms/collection_ops_util.h", "transforms/einsum.h", @@ -812,6 +910,9 @@ cc_library( "transforms/shape_inference.h", ], includes = ["include"], + textual_hdrs = [ + "ir/tf_ops_helpers.inc", + ], deps = [ ":attribute_utils", ":bridge_logger", @@ -820,10 +921,13 @@ cc_library( ":decompose_resource_ops", ":decompose_resource_ops_inc_gen", ":device_util", + ":dump_mlir_util", ":error_util", ":export_tf_dialect_op", ":lower_tf_lib", ":mangling_util", + ":serialize_mlir_module_utils", + ":shape_inference_utils", ":tensorflow", ":tensorflow_analysis", ":tensorflow_optimize_inc_gen", @@ -854,6 +958,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:StandardOps", @@ -898,6 +1003,7 @@ cc_library( srcs = ["transforms/graph_optimization_pass.cc"], hdrs = ["transforms/graph_optimization_pass.h"], deps = [ + ":dump_mlir_util", ":error_util", ":tensorflow_passes", "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", @@ -920,15 +1026,14 @@ cc_library( alwayslink = 1, ) -# Library with TensorFlow dialect static initialization. 
cc_library( - name = "tensorflow_dialect_registration", - srcs = ["ir/dialect_registration.cc"], + name = "upgrade_graph", + srcs = ["translate/upgrade_graph.cc"], + hdrs = ["translate/upgrade_graph.h"], deps = [ - ":tensorflow", - "@llvm-project//mlir:Shape", + "//tensorflow/core:framework", + "//tensorflow/core:graph", ], - alwayslink = 1, ) cc_library( @@ -942,8 +1047,10 @@ cc_library( "translate/import_model.h", ], deps = [ + ":convert_attr", ":convert_tensor", ":convert_type", + ":dump_mlir_util", ":error_util", ":export_tf_dialect_op", ":export_utils", @@ -955,11 +1062,13 @@ cc_library( ":tensorflow_types", ":tf_saved_model_passes", ":translate_utils", + ":upgrade_graph", "//tensorflow/cc/saved_model:bundle_v2", "//tensorflow/cc/saved_model:constants", "//tensorflow/cc/saved_model:loader_lite", "//tensorflow/cc/saved_model:loader_util", "//tensorflow/compiler/jit:shape_inference_helpers", + "//tensorflow/compiler/mlir:name_utils", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/tf2xla:functionalize_control_flow", "//tensorflow/compiler/xla:status_macros", @@ -1064,7 +1173,6 @@ cc_library( cc_library( name = "export_tf_dialect_op", srcs = [ - "translate/derived_attr_populator.inc", "translate/export_tf_dialect_op.cc", ], hdrs = [ @@ -1074,13 +1182,16 @@ cc_library( ":convert_type", ":export_utils", ":tensorflow", + "//tensorflow/compiler/mlir:string_container_utils", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", "@llvm-project//mlir:IR", ], ) @@ -1154,6 +1265,24 @@ cc_library( ], ) +cc_library( + name = "convert_attr", + srcs = ["utils/convert_attr.cc"], + hdrs = ["utils/convert_attr.h"], + visibility = [ + "//visibility:public", + ], + deps = [ + ":convert_tensor", + ":convert_type", + ":tensorflow_attributes", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor/lib", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "convert_type", srcs = ["utils/convert_type.cc"], @@ -1286,6 +1415,7 @@ cc_library( ":decode_constant_pass", ":eval_util", ":tensorflow", + ":tensorflow_traits", ":tensorflow_types", "//tensorflow/c:tf_status", "//tensorflow/c/eager:c_api", @@ -1304,9 +1434,8 @@ cc_library( cc_library( name = "tf_dialect_lib", deps = [ - ":tensorflow_dialect_registration", ":tf_dialect_passes", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", ], ) @@ -1317,6 +1446,7 @@ cc_library( deps = [ ":convert_graphdef", ":mlir_roundtrip_flags", + ":tensorflow", "//tensorflow/compiler/tf2xla:functionalize_control_flow_pass_registration", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -1437,37 +1567,6 @@ tf_cc_test( ], ) -tf_native_cc_binary( - name = "derived_attr_populator_gen", - srcs = [ - "translate/derived_attr_populator_gen.cc", - ], - deps = [ - "@llvm-project//llvm:Support", - "@llvm-project//llvm:TableGen", - "@llvm-project//mlir:TableGen", - ], -) - -gentbl( - name = "derived_attr_populator_inc", - tbl_outs = [ - ("", "translate/derived_attr_populator.inc"), - ], - tblgen = ":derived_attr_populator_gen", - td_file = "ir/tf_ops.td", - td_srcs = [ - "@llvm-project//mlir:include/mlir/IR/OpBase.td", - 
"@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", - "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", - "ir/tf_generated_ops.td", - "ir/tf_op_base.td", - "ir/tf_op_interfaces.td", - ], -) - filegroup( name = "tensorflow_optimize_td_files", srcs = [ @@ -1477,6 +1576,7 @@ filegroup( gentbl( name = "tensorflow_optimize_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -1494,19 +1594,20 @@ gentbl( COMPILE_MLIR_UTIL_DEPS = [ ":bridge_logger", ":convert_graphdef", + ":convert_tensor", ":convert_type", ":dump_mlir_util", ":error_util", ":mlir_roundtrip_flags", + ":serialize_mlir_module_utils", ":tensorflow", - ":tensorflow_dialect_registration", ":tensorflow_types", ":tensorflow_passes", ":translate_utils", "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:variant", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", @@ -1528,9 +1629,9 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor/lib", + "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", - ":convert_tensor", ] # Prefer to link 'compile_mlir_util' library that also links necessary @@ -1557,27 +1658,61 @@ cc_library( ], ) -tf_cc_test( - name = "compile_mlir_util_test", - size = "small", - srcs = ["utils/compile_mlir_util_test.cc"], +cc_library( + name = "compile_mlir_util_pass", + srcs = ["utils/compile_mlir_util_pass.cc"], deps = [ ":compile_mlir_util", - "//tensorflow/cc:function_ops", - "//tensorflow/cc:resource_variable_ops", - "//tensorflow/cc:scope", - "//tensorflow/compiler/jit", - "//tensorflow/compiler/tf2xla:common", - "//tensorflow/compiler/tf2xla:xla_compiler", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - "//tensorflow/stream_executor/lib", + "@llvm-project//mlir:Pass", ], + alwayslink = 1, +) + +cc_library( + name = "serialize_mlir_module_utils", + srcs = ["utils/serialize_mlir_module_utils.cc"], + hdrs = ["utils/serialize_mlir_module_utils.h"], + deps = [ + ":error_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + ], +) + +cc_library( + name = "tf_xla_mlir_translate", + srcs = ["utils/tf_xla_mlir_translate.cc"], + deps = [ + ":compile_mlir_util", + ":mlir_roundtrip_flags", + ":serialize_mlir_module_utils", + ":tensorflow", + ":translate_cl_options", + "//tensorflow/compiler/mlir:string_container_utils", + "//tensorflow/compiler/mlir/xla:translate_cl_options", + "//tensorflow/compiler/tf2xla:xla_argument", + "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_proto_cc", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + 
"//tensorflow/core/platform:status", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], + alwayslink = 1, ) cc_library( @@ -1607,6 +1742,7 @@ cc_library( tf_gen_op_wrapper_py( name = "gen_mlir_passthrough_op_py", out = "gen_mlir_passthrough_op.py", + compatible_with = [], deps = [":mlir_passthrough_op"], ) @@ -1616,6 +1752,7 @@ tf_gen_op_wrapper_py( # without linking any of the other tensorflow passes. gentbl( name = "lower_tf_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -1728,6 +1865,7 @@ cc_library( "//tensorflow/core/platform:logging", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", ], ) @@ -1754,14 +1892,13 @@ cc_library( ":convert_graphdef", ":error_util", ":tensorflow", - ":tensorflow_dialect_registration", ":tensorflow_passes", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core/platform:logging", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -1777,6 +1914,7 @@ tf_cc_test( "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", + "//tensorflow/core:ops", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/platform:test", @@ -1837,3 +1975,28 @@ cc_library( "@llvm-project//mlir:IR", ], ) + +cc_library( + name = "shape_inference_utils", + srcs = ["utils/shape_inference_utils.cc"], + hdrs = ["utils/shape_inference_utils.h"], + deps = [ + ":convert_tensor", + ":convert_type", + ":export_tf_dialect_op", + ":export_utils", + ":tensorflow", + ":tensorflow_attributes", + ":tensorflow_types", + "//tensorflow/compiler/mlir:array_container_utils", + "//tensorflow/core:framework", + "//tensorflow/core:ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:types", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:DerivedAttributeOpInterface", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc index 8ec7513f81f..cdc9e33e368 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc @@ -18,20 +18,17 @@ limitations under the License. 
#include #include -#include "absl/strings/str_cat.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/Debug.h" #include "mlir/Analysis/CallGraph.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project -#include "mlir/IR/Builders.h" // from @llvm-project -#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project @@ -42,9 +39,8 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/tf2xla/resource_operation_table.h" -#include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { @@ -231,48 +227,16 @@ BacktrackAnalysisInfo::BacktrackAnalysisInfo( backtracked_values_.push_back(backtrack_analysis.BacktrackValue(result)); } -namespace { - -//===----------------------------------------------------------------------===// -// ResourceAliasAnalysisInfo helper functions. -//===----------------------------------------------------------------------===// - -constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; - -// Returns if a VarHandleOp is anonymous, which means it always creates a new -// variable. -bool IsResourceHandleAnonymous(VarHandleOp handle) { - return handle.shared_name() == tensorflow::ResourceHandle::ANONYMOUS_NAME; -} - -// Returns a string unique identifier for a non-anonymous VarHandleOp. -std::string GetVarHandleStringId(VarHandleOp handle) { - auto device = handle.getAttrOfType("device"); - return absl::StrCat(handle.container().str(), "/", handle.shared_name().str(), - "/", device ? device.getValue().str() : std::string("")); -} - -// Finds a unique ID for a VarHandleOp's output. If it is anonymous, always -// creates a new ID; otherwise, tries to reuse the existing ID for the -// referenced variable if it exists, or creates a new one if not. -int64_t GetOrCreateIdForVarHandle(VarHandleOp handle, int64_t* next_id, - llvm::StringMap* name_id_map) { - // Always create a new ID for anonymous handle. - if (IsResourceHandleAnonymous(handle)) return (*next_id)++; - - auto name = GetVarHandleStringId(handle); - auto emplace_res = name_id_map->try_emplace(name, *next_id); - // New ID created, increment next_id. - if (emplace_res.second) ++(*next_id); - return emplace_res.first->second; -} - -} // namespace - //===----------------------------------------------------------------------===// // ResourceAliasAnalysisInfo //===----------------------------------------------------------------------===// +namespace { + +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; + +} // namespace + constexpr int64_t ResourceAliasAnalysisInfo::kUnknownResourceId; // Constructs the analysis info by analyzing the given function. 
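The helper removed above assigned each `VarHandleOp` a resource id by treating anonymous handles as always-fresh and keying named handles on container, shared name, and device; that bookkeeping now sits behind `GetResourceHandleValueAndId` in the hunk that follows. A simplified, self-contained restatement of the id rule, with the MLIR types stripped out and hypothetical names:

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>

// Anonymous handles always receive a fresh id; named handles are keyed on
// "container/shared_name/device" and reuse the id of the first handle seen
// with the same key, so aliasing handles end up with equal ids.
int64_t GetOrCreateResourceId(
    const std::string &key, bool is_anonymous, int64_t *next_id,
    std::unordered_map<std::string, int64_t> *key_to_id) {
  if (is_anonymous) return (*next_id)++;
  auto emplace_res = key_to_id->try_emplace(key, *next_id);
  if (emplace_res.second) ++(*next_id);  // first occurrence of this key
  return emplace_res.first->second;
}
```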
@@ -338,60 +302,33 @@ ResourceAliasAnalysisInfo::ResourceAliasAnalysisInfo( } }); - llvm::StringMap var_handle_name_id_map; + llvm::SmallDenseMap resource_handle_id_map; func_op.walk([&](Operation* op) { - if (auto var_handle = dyn_cast(op)) { - AddValueUniqueIDMapping( - var_handle.resource(), - GetOrCreateIdForVarHandle(var_handle, &next_unique_id, - &var_handle_name_id_map)); + if (auto resource_alloc = dyn_cast(op)) { + ResourceHandleValueAndId resource = + resource_alloc.GetResourceHandleValueAndId(resource_handle_id_map, + next_unique_id); + AddValueUniqueIDMapping(resource.value, resource.id); } else if (llvm::isa(op)) { for (auto result : filter_resources(op->getResults())) PropagateInputToOutput(op->getOperand(result.getResultNumber()), result); } else if (auto while_op = dyn_cast(op)) { AnalyzeWhileLoop(while_op, backtrack_analysis.GetAnalysisForFunc( - while_op.body_func())); + while_op.body_function())); } else if (auto while_region = dyn_cast(op)) { AnalyzeWhileLoop(while_region, backtrack_analysis.GetAnalysisForRegion( while_region.body())); + } else if (auto case_op = dyn_cast(op)) { + llvm::SmallVector functions; + case_op.get_branch_functions(functions); + AnalyzeFunctionalCaseOrIfOp(case_op, functions, backtrack_analysis); } else if (auto if_op = dyn_cast(op)) { - const auto& then_info = - backtrack_analysis.GetAnalysisForFunc(if_op.then_func()); - const auto& else_info = - backtrack_analysis.GetAnalysisForFunc(if_op.else_func()); - // If a result is a passthrough of both branches' inputs, merge the - // resource IDs of corresponding operands for the two inputs. - for (auto result : filter_resources(if_op.getResults())) { - auto passthrough_then_arg = then_info.GetArg(result.getResultNumber()); - auto passthrough_else_arg = else_info.GetArg(result.getResultNumber()); - if (passthrough_then_arg && passthrough_else_arg) { - Value then_operand = if_op.input()[passthrough_then_arg.getValue()]; - Value else_operand = if_op.input()[passthrough_else_arg.getValue()]; - PropagateInputToOutput(then_operand, result); - PropagateInputToOutput(else_operand, result); - } else { - AddValueUniqueIDMapping(result, kUnknownResourceId); - } - } - } else if (auto if_region = dyn_cast(op)) { - const auto& then_info = - backtrack_analysis.GetAnalysisForRegion(if_region.then_branch()); - const auto& else_info = - backtrack_analysis.GetAnalysisForRegion(if_region.else_branch()); - for (auto result : filter_resources(if_region.getResults())) { - Value then_result = then_info.GetValue(result.getResultNumber()); - Value else_result = else_info.GetValue(result.getResultNumber()); - // For IfRegion, the walk would have visited the else and then regions - // before visiting the IfRegion op. Backtracking of the then and else - // results will either give a value computed within these regions, - // or a region capture. If its a region capture, computed before this - // IfRegion, it will have been visited earlier and a mapping would - // exist for that value. If its computed within the region, then again - // a mapping would exist. 
- PropagateInputToOutput(then_result, result); - PropagateInputToOutput(else_result, result); - } + AnalyzeFunctionalCaseOrIfOp( + if_op, {if_op.then_function(), if_op.else_function()}, + backtrack_analysis); + } else if (llvm::isa(op)) { + AnalyzeRegionCaseOrIfOp(op, backtrack_analysis); } else if (auto call = dyn_cast(op)) { FuncOp func = dyn_cast(call.resolveCallable()); if (!func) { @@ -501,6 +438,59 @@ void ResourceAliasAnalysisInfo::AnalyzeWhileLoop( } } +template +void ResourceAliasAnalysisInfo::AnalyzeFunctionalCaseOrIfOp( + CaseOrIfOp case_or_if_op, llvm::ArrayRef functions, + const BacktrackAnalysis& backtrack_analysis) { + llvm::SmallVector infos; + infos.reserve(functions.size()); + for (FuncOp func : functions) + infos.push_back(&backtrack_analysis.GetAnalysisForFunc(func)); + + // If a result is a passthrough of all branches' inputs, merge the resource + // IDs of corresponding operands for all the inputs. + for (auto result : filter_resources(case_or_if_op.getResults())) { + llvm::SmallVector, 2> passthrough_args; + passthrough_args.reserve(functions.size()); + for (const auto* info : infos) + passthrough_args.emplace_back(info->GetArg(result.getResultNumber())); + + const bool all_passthrough_args_known = llvm::all_of( + passthrough_args, [](const llvm::Optional& passthrough_arg) { + return passthrough_arg.hasValue(); + }); + if (all_passthrough_args_known) { + for (const auto& passthrough_arg : passthrough_args) { + Value operand = case_or_if_op.input()[passthrough_arg.getValue()]; + PropagateInputToOutput(operand, result); + } + } else { + AddValueUniqueIDMapping(result, kUnknownResourceId); + } + } +} + +void ResourceAliasAnalysisInfo::AnalyzeRegionCaseOrIfOp( + Operation* case_or_if_op, const BacktrackAnalysis& backtrack_analysis) { + llvm::SmallVector infos; + infos.reserve(case_or_if_op->getNumRegions()); + for (Region& region : case_or_if_op->getRegions()) + infos.push_back(&backtrack_analysis.GetAnalysisForRegion(region)); + + // For region Case/If, the walk would have visited all branch regions before + // visiting the Case/If op. Backtracking of each region results will either + // give a value computed within these regions, or a region capture. If it is a + // region capture computed before this Case/If, it will have been visited + // earlier and a mapping would exist for that value. If it is computed within + // the region, then again a mapping would exist. + for (auto result : filter_resources(case_or_if_op->getResults())) { + for (const auto* info : infos) { + Value region_result = info->GetValue(result.getResultNumber()); + PropagateInputToOutput(region_result, result); + } + } +} + bool ResourceAliasAnalysisInfo::IsUnknownResource(Value resource) const { auto it = resource_value_to_ids_.find(resource); assert(it != resource_value_to_ids_.end() && !it->getSecond().empty()); diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h index 46bb57c942d..5575767dcc4 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" @@ -77,6 +78,16 @@ class ResourceAliasAnalysisInfo { void AnalyzeWhileLoop(Operation* while_op, const BacktrackAnalysisInfo& body_info); + // Analyzes tf.Case/tf.If ops to compute resourceID's. + template + void AnalyzeFunctionalCaseOrIfOp(CaseOrIfOp case_or_if_op, + llvm::ArrayRef functions, + const BacktrackAnalysis& backtrack_analysis); + + // Analyzes tf.CaseRegion/tf.IfRegion ops to compute resourceID's. + void AnalyzeRegionCaseOrIfOp(Operation* case_or_if_op, + const BacktrackAnalysis& backtrack_analysis); + // Maps each resource-type value to a set of unique IDs that it could alias. llvm::SmallDenseMap, 8> resource_value_to_ids_; diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index c78a7e403c4..4d2c237e9a0 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -18,30 +18,31 @@ limitations under the License. #include #include -#include "absl/strings/str_cat.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/tf2xla/resource_operation_table.h" -#include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { @@ -80,38 +81,125 @@ llvm::SmallDenseSet FindAccessedResources( return resources; } -// Returns an XlaResourceOpInfo (or nullptr if it does not exist) that specifies -// the resource access type of the op. It tells whether the op is read only, -// etc. -// -// TODO(yuanzx): Define this information in a different place. Currently we use -// tensorflow/compiler/tf2xla/resource_operation_table.h. -const tensorflow::XlaResourceOpInfo* GetResourceInfoForOp(Operation* op) { - if (op->getName().getDialect() != - TF::TensorFlowDialect::getDialectNamespace()) { - return nullptr; +// Helper struct defining what memory effects are present for a resource. 
+struct SideEffects { + bool alloc = false; + bool free = false; + bool read = false; + bool write = false; + + bool IsAllocOnly() const { return alloc && !free && !read && !write; } + bool IsReadOnly() const { return !alloc && !free && read && !write; } +}; + +using ResourceSideEffectsByValue = llvm::SmallDenseMap; + +// Collects memory side effects for an operation by value (operands and +// results). +ResourceSideEffectsByValue GetResourceInfoForOp(Operation* op) { + ResourceSideEffectsByValue resource_info; + + auto interface = dyn_cast(op); + if (!interface) return resource_info; + + llvm::SmallVector effects; + interface.getEffects(effects); + + for (auto& effect : effects) { + // TODO(lyandy): Support effects with no value defined. + if (!effect.getValue()) return ResourceSideEffectsByValue(); + auto it = resource_info.try_emplace(effect.getValue()); + auto& side_effect = it.first->getSecond(); + auto* resource_effect = effect.getEffect(); + if (isa(resource_effect)) { + side_effect.alloc = true; + } else if (isa(resource_effect)) { + side_effect.free = true; + } else if (isa(resource_effect)) { + side_effect.read = true; + } else if (isa(resource_effect)) { + side_effect.write = true; + } else { + return ResourceSideEffectsByValue(); + } } - return tensorflow::GetResourceOpInfoForOp( - op->getName().getStringRef().split('.').second.str()); + + return resource_info; } -// Returns whether `op` accesses resources and it is known to be read-only. -bool OpIsReadOnly(Operation* op) { - auto resource_op_info = GetResourceInfoForOp(op); - return resource_op_info && - resource_op_info->kind() == tensorflow::XlaResourceOpKind::kRead; +// Checks if a value is a result of `op`. +bool IsOperationResult(Operation* op, Value value) { + return value.getDefiningOp() == op; +} + +// Checks if an operation's resource operands are read only. Operation results +// are ignored. +bool IsResourceOpReadOnly(Operation* op, + const ResourceSideEffectsByValue& resource_op_info) { + if (resource_op_info.empty()) return false; + + for (const auto& resource_info : resource_op_info) { + Value value = resource_info.getFirst(); + if (IsOperationResult(op, value)) continue; + const SideEffects& side_effects = resource_info.getSecond(); + if (!side_effects.IsReadOnly()) return false; + } + + return true; +} + +// Checks if an operation's resource results are alloc only and no side effects +// are present for its operands. +bool IsResourceOpAllocOnly(Operation* op, + const ResourceSideEffectsByValue& resource_op_info) { + if (resource_op_info.empty()) return false; + + for (const auto& resource_info : resource_op_info) { + // Operand with side effect. + Value value = resource_info.getFirst(); + if (!IsOperationResult(op, value)) return false; + const SideEffects& side_effects = resource_info.getSecond(); + if (!side_effects.IsAllocOnly()) return false; + } + + return true; } // Returns if `op` is a resource declaration. bool OpIsDeclaration(Operation* op, const ResourceAliasAnalysis::Info& alias_analysis) { - // TODO(yuanzx): Add other types of resources. - return llvm::isa(op) || - (llvm::isa(op) && - !FindAccessedResources(op, alias_analysis).empty()); + return llvm::isa(op) && + !FindAccessedResources(op, alias_analysis).empty(); } -// Returns if `op` is know to not have any side effect. +// A vector of resource variable id's with their associated resource value. +using ResourceIdsByValue = + llvm::SmallVector*>, 4>; + +// Collects resource id's by resource value. 
If operation resource side effects +// are unknown or a resource is unknown, an empty optional is returned. +llvm::Optional GetResourceIdsByValue( + Operation* op, const ResourceAliasAnalysis::Info& alias_analysis, + const ResourceSideEffectsByValue& resource_op_info) { + ResourceIdsByValue resource_ids_by_value; + if (resource_op_info.empty()) return llvm::None; + + auto collect_ids = [&](ValueRange values) { + for (auto value : filter_resources(values)) { + if (alias_analysis.IsUnknownResource(value)) return false; + const auto& ids = alias_analysis.GetResourceUniqueIds(value); + resource_ids_by_value.push_back({value, &ids}); + } + return true; + }; + + if (collect_ids(op->getOperands()) && collect_ids(op->getResults())) + return resource_ids_by_value; + else + return llvm::None; +} + +// Returns true if `op` is known to not have any side effect. bool OpIsKnownToHaveNoSideEffect(Operation* op) { // Note: Identity op is really side-effect free, but it is not marked as such // in the TF dialect (see comments in definition of Identity op in tf_ops.td) @@ -253,17 +341,17 @@ void SideEffectAnalysisInfo::AnalyzeRegion( if (OpIsDeclaration(&op, alias_analysis)) continue; auto resource_op_info = GetResourceInfoForOp(&op); - if (!resource_op_info && OpIsKnownToHaveNoSideEffect(&op)) continue; + if (resource_op_info.empty() && OpIsKnownToHaveNoSideEffect(&op)) + continue; - llvm::SmallDenseSet resources = - resource_op_info ? FindAccessedResources(&op, alias_analysis) - : UnknownResourceSet(); - assert(!resources.empty()); - const bool is_unknown = resources.count(kUnknownResourceId) > 0; - const bool read_only = OpIsReadOnly(&op); + if (IsResourceOpAllocOnly(&op, resource_op_info)) continue; + + auto resource_ids_by_value = + GetResourceIdsByValue(&op, alias_analysis, resource_op_info); + const bool read_only = IsResourceOpReadOnly(&op, resource_op_info); bool indirectly_tracked_unknown_access = false; // First add edges from known resources. - if (is_unknown) { + if (!resource_ids_by_value.hasValue()) { for (auto& entry : per_resource_access_info_) { if (entry.getFirst() == kUnknownResourceId) continue; AddPredecessorsForAccess(entry.getFirst(), &op, read_only); @@ -272,20 +360,43 @@ void SideEffectAnalysisInfo::AnalyzeRegion( read_only); } } else { - for (int64_t resource : resources) { - AddPredecessorsForAccess(resource, &op, read_only); + // Collect all resource id's and whether their side effect is read only. 
+ llvm::SmallDenseMap read_only_by_resource_id; + for (const auto& resource_ids : *resource_ids_by_value) { + const bool is_result = resource_ids.first.getDefiningOp() == &op; + auto value_resource_info = resource_op_info.find(resource_ids.first); + bool resource_read_only = false; + if (value_resource_info != resource_op_info.end()) { + if (is_result && value_resource_info->getSecond().IsAllocOnly()) + continue; + resource_read_only = value_resource_info->getSecond().IsReadOnly(); + } + + for (const auto& id : *resource_ids.second) { + auto it = + read_only_by_resource_id.try_emplace(id, resource_read_only); + if (!it.second && !resource_read_only) + it.first->getSecond() = resource_read_only; + } + } + + for (const auto& resource : read_only_by_resource_id) { + const auto& resource_id = resource.getFirst(); + const auto& resource_read_only = resource.getSecond(); + AddPredecessorsForAccess(resource_id, &op, resource_read_only); indirectly_tracked_unknown_access |= - unknown_access_indirectly_tracked_by_resource(resource, - read_only); + unknown_access_indirectly_tracked_by_resource(resource_id, + resource_read_only); // Update access info for known resources. - TrackAccess(resource, &op, read_only); + TrackAccess(resource_id, &op, resource_read_only); } } + // If not indirectly tracked, add edges from the unknown resource. if (!indirectly_tracked_unknown_access) { AddPredecessorsForAccess(kUnknownResourceId, &op, read_only); } - if (is_unknown) { + if (!resource_ids_by_value.hasValue()) { // Update access info for unknown resource. TrackAccess(kUnknownResourceId, &op, read_only); } diff --git a/tensorflow/compiler/mlir/tensorflow/c/BUILD b/tensorflow/compiler/mlir/tensorflow/c/BUILD index 243f4b5139f..64c56cf8aa9 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/c/BUILD @@ -1,8 +1,8 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_copts", "tf_cuda_library", - "tfe_xla_copts", ) package( @@ -20,7 +20,7 @@ tf_cuda_library( srcs = [ "c_api_unified_experimental_mlir.cc", ], - copts = tf_copts() + tfe_xla_copts(), + copts = tf_copts(), deps = [ "//tensorflow/c:c_api", "//tensorflow/c:tensor_interface", @@ -35,6 +35,7 @@ tf_cuda_library( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc index c62d62a2d3d..32c51f2e2bd 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -50,6 +51,7 @@ limitations under the License. 
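The side_effect_analysis hunks above drop the tf2xla resource-op table and derive alloc/free/read/write information from MLIR's `MemoryEffectOpInterface` instead. A standalone sketch of that query pattern, simplified to a single read-only check rather than the per-value classification the pass performs:

```cpp
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/Casting.h"
#include "mlir/IR/Operation.h"  // from @llvm-project
#include "mlir/Interfaces/SideEffectInterfaces.h"  // from @llvm-project

// Returns true if every memory effect declared by `op` is a read; ops that do
// not implement the interface are treated conservatively.
bool OpOnlyReadsMemory(mlir::Operation *op) {
  auto interface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op);
  if (!interface) return false;
  llvm::SmallVector<mlir::MemoryEffects::EffectInstance, 4> effects;
  interface.getEffects(effects);
  if (effects.empty()) return false;
  for (auto &effect : effects)
    if (!llvm::isa<mlir::MemoryEffects::Read>(effect.getEffect()))
      return false;
  return true;
}
```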
#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/types.pb.h" @@ -74,15 +76,9 @@ using tensorflow::tracing::TracingTensorHandle; namespace { -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - return true; - }(); - (void)init_once; +void RegisterDialects(mlir::MLIRContext& ctx) { + mlir::RegisterAllTensorFlowDialects(ctx.getDialectRegistry()); + ctx.getDialectRegistry().loadAll(&ctx); } Status ConvertDataTypeToTensor(tensorflow::DataType dtype, Builder builder, @@ -239,6 +235,7 @@ class MlirFunctionContext : public TracingContext { : TracingContext(kMlir), context_(std::make_unique()), builder_(context_.get()) { + RegisterDialects(*context_); // TODO(aminim) figure out the location story here module_ = ModuleOp::create(builder_.getUnknownLoc()); func_ = FuncOp::create(builder_.getUnknownLoc(), name, @@ -456,7 +453,8 @@ Status MlirAbstractOp::SetAttrFloat(const char* attr_name, float value) { return Unimplemented("SetAttrFloat has not been implemented yet."); } Status MlirAbstractOp::SetAttrBool(const char* attr_name, bool value) { - return Unimplemented("SetAttrBool has not been implemented yet."); + attrs_[attr_name] = BoolAttr::get(value, context_); + return Status::OK(); } Status MlirAbstractOp::SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) { @@ -514,6 +512,7 @@ Status MlirFunction::GetFunctionDef(tensorflow::FunctionDef** f) { return Status::OK(); } PassManager pm(func_.getContext()); + ::tensorflow::applyTensorflowAndCLOptions(pm); pm.addNestedPass(CreateFunctionalToExecutorDialectConversionPass()); pm.addPass(CreateBreakUpIslandsPass()); @@ -656,9 +655,8 @@ Status MlirFunctionContext::Finalize(OutputList* outputs, } builder_.create(func_.getLoc(), ret_operands); - auto arg_types = llvm::to_vector<8>(body.getArgumentTypes()); - auto result_types = - llvm::to_vector<8>(body.getTerminator()->getOperandTypes()); + auto arg_types = body.getArgumentTypes(); + auto result_types = body.getTerminator()->getOperandTypes(); func_.setType(FunctionType::get(arg_types, result_types, func_.getContext())); *f = new MlirFunction(std::move(context_), std::move(module_), func_); return Status::OK(); @@ -666,7 +664,6 @@ Status MlirFunctionContext::Finalize(OutputList* outputs, extern "C" { TracingContext* MlirTracingFactory(const char* fn_name, TF_Status* s) { - RegisterDialects(); return new MlirFunctionContext(fn_name); } } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc deleted file mode 100644 index 45985cea583..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" - -namespace mlir { - -// Static initialization for TF dialect registration. -static DialectRegistration tf_ops; -static DialectRegistration - tf_executor_dialect; -static DialectRegistration - tf_device_dialect; -static DialectRegistration - tf_saved_model_dialect; -static DialectRegistration shape_dialect; - -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc index 40cc2c99c27..746b34a018a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.cc @@ -15,8 +15,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" -#include "mlir/IR/Attributes.h" // from @llvm-project - namespace mlir { namespace TF { @@ -79,6 +77,14 @@ ShapeAttr ShapeAttr::get(mlir::MLIRContext* context, return Base::get(context, ArrayRef(), /*unranked=*/true); } +// Get or create a shape attribute. +ShapeAttr ShapeAttr::get(mlir::MLIRContext* context, ShapedType shaped_type) { + if (shaped_type.hasRank()) + return Base::get(context, shaped_type.getShape(), /*unranked=*/false); + + return Base::get(context, ArrayRef(), /*unranked=*/true); +} + llvm::Optional> ShapeAttr::getValue() const { if (hasRank()) return getShape(); return llvm::None; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h index 5a18b77ab5c..0927aefff68 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h @@ -20,6 +20,8 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project namespace mlir { namespace TF { @@ -43,6 +45,9 @@ class ShapeAttr : public Attribute::AttrBase> shape); + // Get or create a shape attribute from a ShapedType type. 
+ static ShapeAttr get(mlir::MLIRContext* context, ShapedType shaped_type); + llvm::Optional> getValue() const; bool hasRank() const; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index 5345000b4bd..3a2e8095139 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -269,8 +269,6 @@ ParseResult SetReplicateOpOperands( replicated_inputs, llvm::ArrayRef packed_inputs, llvm::ArrayRef region_arg_types, int32_t* n) { - if (replicated_inputs.empty() && packed_inputs.empty()) return success(); - for (const auto& attr : state->attributes) if (attr.first.strref() == "n") if (auto n_attr = attr.second.dyn_cast()) @@ -279,6 +277,8 @@ ParseResult SetReplicateOpOperands( if (*n < 2) return parser->emitError(loc) << "expects 'n' to be at least 2, got " << *n; + if (replicated_inputs.empty() && packed_inputs.empty()) return success(); + for (auto replicated_input_and_idx : llvm::enumerate(replicated_inputs)) { const int32_t idx = replicated_input_and_idx.index(); const auto& replicated_input = replicated_input_and_idx.value(); @@ -369,7 +369,7 @@ void Print(ReplicateOp op, OpAsmPrinter* p) { // [%a, ...] as %block_arg0: type // packed_input // %b as %block_arg1: type - const int32_t n = op.n().getSExtValue(); + const int32_t n = op.n(); const int32_t num_replicated_inputs = (*op.operand_segment_sizes().int_value_begin()).getSExtValue(); const int32_t num_replicated_block_args = num_replicated_inputs / n; @@ -413,7 +413,7 @@ LogicalResult VerifyCompatibleTypes(Type a, Type b) { } LogicalResult Verify(ReplicateOp op) { - int32_t n = op.n().getSExtValue(); + int32_t n = op.n(); // Check number of devices, if set, matches `n`. 
if (op.devices().hasValue()) { @@ -504,13 +504,12 @@ LogicalResult Verify(ReplicateOp op) { return success(); } -template void BuildReplicateOp( Builder* builder, OperationState* state, int n, const llvm::SmallDenseMap>& devices, - llvm::ArrayRef> replicated_inputs, - llvm::ArrayRef packed_inputs, ResultsTy replica_output_types) { + llvm::ArrayRef> replicated_inputs, + ValueRange packed_inputs, TypeRange replica_output_types) { DCHECK_GE(n, 2); state->addAttribute("n", builder->getI32IntegerAttr(n)); @@ -538,7 +537,7 @@ void BuildReplicateOp( block.addArgument(replicated_input.second); } - for (auto& packed_input : packed_inputs) { + for (auto packed_input : packed_inputs) { state->addOperands(packed_input); block.addArgument(packed_input.getType()); } @@ -560,20 +559,8 @@ void ReplicateOp::build( OpBuilder& builder, OperationState& state, int n, const llvm::SmallDenseMap>& devices, - llvm::ArrayRef, Type>> replicated_inputs, - llvm::ArrayRef packed_inputs, - llvm::ArrayRef replica_output_types) { - BuildReplicateOp(&builder, &state, n, devices, replicated_inputs, - packed_inputs, replica_output_types); -} - -void ReplicateOp::build( - OpBuilder& builder, OperationState& state, int n, - const llvm::SmallDenseMap>& - devices, - llvm::ArrayRef> replicated_inputs, - llvm::ArrayRef packed_inputs, - Operation::result_type_range replica_output_types) { + llvm::ArrayRef> replicated_inputs, + ValueRange packed_inputs, TypeRange replica_output_types) { BuildReplicateOp(&builder, &state, n, devices, replicated_inputs, packed_inputs, replica_output_types); } @@ -670,12 +657,12 @@ void LaunchOp::getCanonicalizationPatterns(OwningRewritePatternList& results, results.insert(context); } +} // namespace tf_device +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc.inc" - -} // namespace tf_device -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h index 688c8ca5715..5b1d9711875 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.h @@ -41,11 +41,11 @@ class TensorFlowDeviceDialect : public Dialect { explicit TensorFlowDeviceDialect(MLIRContext* context); }; +} // namespace tf_device +} // namespace mlir + // Declares the operations for this dialect using the generated header. #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h.inc" -} // namespace tf_device -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DEVICE_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td index d94a37d9b02..65de4ea306f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td @@ -36,7 +36,7 @@ def TfDevice_Dialect : Dialect { XlaRun. 
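[Editor's note, not part of the patch] The change above collapses the two `ReplicateOp::build` overloads into a single builder taking `ValueRange` packed inputs and `TypeRange` result types. A hedged caller sketch, under the assumption that the replicated-input element type is `std::pair<mlir::ValueRange, mlir::Type>` and the devices map is `llvm::SmallDenseMap<StringRef, llvm::SmallVector<StringRef, 4>>` (the template arguments are not fully legible in this copy of the patch); the function and value names are hypothetical.

```c++
#include <utility>

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/Builders.h"
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"

// Creates a 2-replica tf_device.replicate with one replicated input, no packed
// inputs, and no explicit device assignment (sketch only).
mlir::tf_device::ReplicateOp CreateReplicate(mlir::OpBuilder& builder,
                                             mlir::Location loc,
                                             mlir::Value replica0,
                                             mlir::Value replica1) {
  llvm::SmallVector<mlir::Value, 2> replica_values{replica0, replica1};
  std::pair<mlir::ValueRange, mlir::Type> replicated_input(
      replica_values, replica0.getType());
  llvm::SmallVector<mlir::Type, 1> output_types{replica0.getType()};
  llvm::SmallDenseMap<llvm::StringRef, llvm::SmallVector<llvm::StringRef, 4>>
      devices;  // empty: no per-replica device list
  return builder.create<mlir::tf_device::ReplicateOp>(
      loc, /*n=*/2, devices, llvm::makeArrayRef(replicated_input),
      /*packed_inputs=*/mlir::ValueRange(), output_types);
}
```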
}]; - let cppNamespace = "tf_device"; + let cppNamespace = "::mlir::tf_device"; } //===----------------------------------------------------------------------===// @@ -295,14 +295,8 @@ For example: let builders = [ OpBuilder<"OpBuilder& builder, OperationState& state, int n, " "const llvm::SmallDenseMap>& devices, " - "llvm::ArrayRef, Type>> replicated_inputs, " - "llvm::ArrayRef packed_inputs, " - "llvm::ArrayRef replica_output_types">, - OpBuilder<"OpBuilder& builder, OperationState& state, int n, " - "const llvm::SmallDenseMap>& devices, " - "llvm::ArrayRef> replicated_inputs, " - "llvm::ArrayRef packed_inputs, " - "Operation::result_type_range replica_output_types"> + "llvm::ArrayRef> replicated_inputs, " + "ValueRange packed_inputs, TypeRange replica_output_types">, ]; let parser = [{ return Parse$cppClass(&parser, &result); }]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index ea9ae5d9477..f2d0a548420 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -250,33 +250,6 @@ ParseResult ParseGraphOp(OpAsmParser &parser, OperationState &result) { // tf_executor.fetch //===----------------------------------------------------------------------===// -namespace { - -void Print(FetchOp fetch, OpAsmPrinter &p) { - p << fetch.getOperationName(); - if (fetch.getNumOperands() > 0) { - p << ' '; - p.printOperands(fetch.operand_begin(), fetch.operand_end()); - p << " : "; - interleaveComma(fetch.getOperandTypes(), p); - } - p.printOptionalAttrDict(fetch.getAttrs()); -} - -ParseResult ParseFetchOp(OpAsmParser &parser, OperationState &result) { - SmallVector opInfo; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(opInfo) || - (!opInfo.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(opInfo, types, loc, result.operands) || - parser.parseOptionalAttrDict(result.attributes) - - ); -} - -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_executor.island //===----------------------------------------------------------------------===// @@ -411,31 +384,6 @@ ParseResult ParseIslandOp(OpAsmParser &parser, OperationState &result) { // tf_executor.yield //===----------------------------------------------------------------------===// -namespace { - -void Print(YieldOp yield, OpAsmPrinter &p) { - p << yield.getOperationName(); - if (yield.getNumOperands() > 0) { - p << ' '; - p.printOperands(yield.operand_begin(), yield.operand_end()); - p << " : "; - interleaveComma(yield.getOperandTypes(), p); - } - p.printOptionalAttrDict(yield.getAttrs()); -} - -ParseResult ParseYieldOp(OpAsmParser &parser, OperationState &result) { - SmallVector op_info; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - return failure(parser.parseOperandList(op_info) || - (!op_info.empty() && parser.parseColonTypeList(types)) || - parser.resolveOperands(op_info, types, loc, result.operands) || - parser.parseOptionalAttrDict(result.attributes)); -} - -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_executor.Switch //===----------------------------------------------------------------------===// @@ -848,23 +796,6 @@ LogicalResult Verify(NextIterationSourceOp source) { return success(); } -void Print(NextIterationSourceOp next_iteration, OpAsmPrinter &p) { - p 
<< next_iteration.getOperationName() << " : " << next_iteration.getType(0); - p.printOptionalAttrDict(next_iteration.getAttrs()); -} - -ParseResult ParseNextIterationSourceOp(OpAsmParser &parser, - OperationState &result) { - SmallVector types; - if (parser.parseColonTypeList(types)) return failure(); - - MLIRContext *context = parser.getBuilder().getContext(); - Type token_type = TokenType::get(context); - Type control_type = ControlType::get(context); - result.addTypes({types.front(), token_type, control_type}); - return parser.parseOptionalAttrDict(result.attributes); -} - } // anonymous namespace //===----------------------------------------------------------------------===// @@ -891,36 +822,6 @@ LogicalResult Verify(NextIterationSinkOp sink) { return success(); } -void Print(NextIterationSinkOp next_iteration, OpAsmPrinter &p) { - p << next_iteration.getOperationName() << " ["; - p.printOperand(next_iteration.getOperand(0)); - p << "] "; - p.printOperands(llvm::drop_begin(next_iteration.getOperands(), 1)); - p << " : " << next_iteration.getOperand(1).getType(); - p.printOptionalAttrDict(next_iteration.getAttrs()); -} - -ParseResult ParseNextIterationSinkOp(OpAsmParser &parser, - OperationState &result) { - SmallVector op_infos; - llvm::SMLoc loc = parser.getCurrentLocation(); - - // First type is always the token consumed from the NextIteration.source - Type token_type = TokenType::get(parser.getBuilder().getContext()); - SmallVector types = {token_type}; - - if (parser.parseOperandList(op_infos, 1, OpAsmParser::Delimiter::Square) || - parser.parseOperandList(op_infos) || parser.parseColonTypeList(types)) - return failure(); - - Type control_type = ControlType::get(parser.getBuilder().getContext()); - types.append(op_infos.size() - 2, control_type); - if (parser.resolveOperands(op_infos, types, loc, result.operands)) - return failure(); - - return parser.parseOptionalAttrDict(result.attributes); -} - } // anonymous namespace //===----------------------------------------------------------------------===// @@ -959,32 +860,6 @@ ParseResult ParseExitOp(OpAsmParser &parser, OperationState &result) { // tf_executor.ControlTrigger //===----------------------------------------------------------------------===// -namespace { - -void Print(ControlTriggerOp trigger, OpAsmPrinter &p) { - p << trigger.getOperationName() << ' '; - p.printOperands(trigger.getOperands()); - p.printOptionalAttrDict(trigger.getAttrs()); -} - -ParseResult ParseControlTriggerOp(OpAsmParser &parser, OperationState &result) { - SmallVector op_infos; - SmallVector types; - llvm::SMLoc loc = parser.getCurrentLocation(); - - if (parser.parseOperandList(op_infos)) return failure(); - Type control_type = ControlType::get(parser.getBuilder().getContext()); - types.append(op_infos.size(), control_type); - if (parser.resolveOperands(op_infos, types, loc, result.operands)) - return failure(); - - // Single control as the only output - result.types.push_back(control_type); - return parser.parseOptionalAttrDict(result.attributes); -} - -} // anonymous namespace - //===----------------------------------------------------------------------===// // tf_executor.LoopCond //===----------------------------------------------------------------------===// @@ -1246,12 +1121,12 @@ LogicalResult IslandOp::fold(llvm::ArrayRef operands, return success(); } +} // namespace tf_executor +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions 
//===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc.inc" - -} // namespace tf_executor -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h index 60036ddc9f8..2bc13556b4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h @@ -57,11 +57,11 @@ class TokenType : public Type::TypeBase { using Base::Base; }; +} // namespace tf_executor +} // namespace mlir + // Declares the operations for this dialect using the generated header. #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h.inc" -} // namespace tf_executor -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_EXECUTOR_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 3081018b8da..713ddc44cba 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -43,14 +43,16 @@ def TfExecutor_Dialect : Dialect { value). }]; - let cppNamespace = "tf_executor"; + let cppNamespace = "::mlir::tf_executor"; } // Control type. -def TfeControlType : Type()">, "control">; +def TfeControlType : Type()">, "control">, + BuildableType<"$_builder.getType()">; // Token type. -def TfeTokenType : Type()">, "token">; +def TfeTokenType : Type()">, "token">, + BuildableType<"$_builder.getType()">; // TODO(hinsu): Define and use TensorType instead of AnyType for data operands // and results. For example, MergeOp output type. @@ -148,7 +150,11 @@ def TfExecutor_FetchOp : TfExecutor_Op<"fetch", }]> ]; + let assemblyFormat = "($fetches^ `:` type($fetches))? attr-dict"; + let verifier = ?; + let printer = ?; + let parser = ?; } def TfExecutor_IslandOp : TfExecutor_Op<"island", @@ -229,7 +235,11 @@ def TfExecutor_YieldOp : TfExecutor_Op<"yield", }]> ]; + let assemblyFormat = "($fetches^ `:` type($fetches))? attr-dict"; + let verifier = ?; + let printer = ?; + let parser = ?; } def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", @@ -466,6 +476,10 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", } }]; + let assemblyFormat = "`:` type($output) attr-dict"; + + let printer = ?; + let parser = ?; } @@ -527,6 +541,11 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink", result.attributes.append(attributes.begin(), attributes.end()); }]> ]; + + let assemblyFormat = " `[` $token `]` $input (`,` $controlInputs^)? `:` type($input) attr-dict"; + + let printer = ?; + let parser = ?; } def TfExecutor_ExitOp : TfExecutor_Op<"Exit", @@ -552,7 +571,7 @@ def TfExecutor_ExitOp : TfExecutor_Op<"Exit", .Attr("T: type") For example: - %1:2 = tf_executor.Exit %0#0 {T: "tfdtype$DT_INT32"} : tensor<*xi32> + %1:2 = tf_executor.Exit %0#0 : tensor<*xi32> {T: "tfdtype$DT_INT32"} Note: Additional result corresponds to the control output. 
}]; @@ -607,6 +626,11 @@ def TfExecutor_ControlTriggerOp : TfExecutor_Op<"ControlTrigger", result.attributes.append(attributes.begin(), attributes.end()); }]> ]; + + let assemblyFormat = "$controlInputs attr-dict"; + + let printer = ?; + let parser = ?; } def TfExecutor_LoopCondOp : TfExecutor_Op<"LoopCond", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 283e3326029..aa1b7bb81a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -16,11 +16,12 @@ limitations under the License. // This is the operation definition file for TensorFlow. // // This file contains TensorFlow ops whose definitions are programmatically -// generated from the TensorFlow codebase. The generated fields for an op -// includes name, summary, description, traits, arguments, results, derived -// attributes. Therefore, modifications to these fields will **not** be -// respected upon subsequent refreshes. However, additional fields after those -// fields will be retained. +// generated from the api-def-files in the following folder: +// tensorflow/core/api_def/base_api +// The generated fields for an op include name, summary, description, traits, +// arguments, results, derived attributes. Therefore, modifications to these +// fields will **not** be respected upon subsequent refreshes. However, +// additional fields after those fields will be retained. // // If you absolutely need to modify the generated fields of an op, move the // definition to `tf_ops.td` and perform the modification there. @@ -28,6 +29,7 @@ limitations under the License. // Ops in this file are sorted alphabetically. include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" +include "mlir/Interfaces/InferTypeOpInterface.td" def TF_AbsOp : TF_Op<"Abs", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the absolute value of a tensor."; @@ -39,11 +41,11 @@ an output element, this operation computes \\(y = |x|\\). }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8]>:$x + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8]>:$y + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -52,12 +54,18 @@ an output element, this operation computes \\(y = |x|\\). def TF_AcosOp : TF_Op<"Acos", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes acos of x element-wise."; + let description = [{ +Provided an input tensor, the `tf.math.acos` operation returns the inverse cosine of each element of the tensor. If `y = tf.math.cos(x)` then, `x = tf.math.acos(y)`. + + Input range is `[-1, 1]` and the output has a range of `[0, pi]`. 
+ }]; + let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -87,29 +95,6 @@ tf.math.acosh(x) ==> [nan nan 0. 0.62236255 5.9914584 9.903487 inf] TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns x + y element-wise."; - - let description = [{ -*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TF_NumberOrStrTensor:$x, - TF_NumberOrStrTensor:$y - ); - - let results = (outs - TF_NumberOrStrTensor:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let hasCanonicalizer = 1; -} - def TF_AddNOp : TF_Op<"AddN", [Commutative, NoSideEffect]> { let summary = "Add all input tensors element wise."; @@ -123,11 +108,11 @@ Inputs must be of same size and shape. }]; let arguments = (ins - Variadic>:$inputs + Variadic>:$inputs ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8, TF_Variant]>:$sum + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8, TF_Variant]>:$sum ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -136,31 +121,6 @@ Inputs must be of same size and shape. let hasFolder = 1; } -def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns x + y element-wise."; - - let description = [{ -*NOTE*: `Add` supports broadcasting. `AddN` does not. 
More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$y - ); - - let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint8]>:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let hasCanonicalizer = 1; - - let hasFolder = 1; -} - def TF_AdjustContrastv2Op : TF_Op<"AdjustContrastv2", [NoSideEffect]> { let summary = "Adjust the contrast of one or more images."; @@ -177,12 +137,12 @@ channel and then adjusts each component of each pixel to }]; let arguments = (ins - TensorOf<[F16, F32]>:$images, - F32Tensor:$contrast_factor + TensorOf<[TF_Float16, TF_Float32]>:$images, + TF_Float32Tensor:$contrast_factor ); let results = (outs - TensorOf<[F16, F32]>:$output + TensorOf<[TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -201,12 +161,12 @@ and then remapped back to RGB colorspace. }]; let arguments = (ins - TensorOf<[F16, F32]>:$images, - F32Tensor:$delta + TensorOf<[TF_Float16, TF_Float32]>:$images, + TF_Float32Tensor:$delta ); let results = (outs - TensorOf<[F16, F32]>:$output + TensorOf<[TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -225,12 +185,12 @@ values, and then remapped back to RGB colorspace. }]; let arguments = (ins - TensorOf<[F16, F32]>:$images, - F32Tensor:$scale + TensorOf<[TF_Float16, TF_Float32]>:$images, + TF_Float32Tensor:$scale ); let results = (outs - TensorOf<[F16, F32]>:$output + TensorOf<[TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -249,14 +209,14 @@ retained with length 1. }]; let arguments = (ins - I1Tensor:$input, + TF_BoolTensor:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - I1Tensor:$output + TF_BoolTensor:$output ); TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; @@ -264,7 +224,7 @@ retained with length 1. 
let verifier = [{ return Verify(*this); }]; } -def TF_AllToAllOp : TF_Op<"AllToAll", [NoSideEffect]> { +def TF_AllToAllOp : TF_Op<"AllToAll", [NoSideEffect, TF_NoConstantFold]> { let summary = "An Op to exchange data across TPU replicas."; let description = [{ @@ -287,8 +247,8 @@ replica 1's output: `[[B], [D]]` }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, - I32Tensor:$group_assignment, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TF_Int32Tensor:$group_assignment, I64Attr:$concat_dimension, I64Attr:$split_dimension, @@ -296,7 +256,7 @@ replica 1's output: `[[B], [D]]` ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -337,6 +297,88 @@ Equivalent to np.angle. TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } +def TF_AnonymousIteratorOp : TF_Op<"AnonymousIterator", []> { + let summary = "A container for an iterator resource."; + + let arguments = (ins + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + Res:$handle + ); +} + +def TF_AnonymousIteratorV2Op : TF_Op<"AnonymousIteratorV2", []> { + let summary = "A container for an iterator resource."; + + let arguments = (ins + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + Res:$handle, + TF_VariantTensor:$deleter + ); +} + +def TF_AnonymousMemoryCacheOp : TF_Op<"AnonymousMemoryCache", []> { + let summary = ""; + + let arguments = (ins); + + let results = (outs + Res:$handle, + TF_VariantTensor:$deleter + ); +} + +def TF_AnonymousMultiDeviceIteratorOp : TF_Op<"AnonymousMultiDeviceIterator", []> { + let summary = "A container for a multi device iterator resource."; + + let arguments = (ins + Confined]>:$devices, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + Res:$handle, + TF_VariantTensor:$deleter + ); +} + +def TF_AnonymousRandomSeedGeneratorOp : TF_Op<"AnonymousRandomSeedGenerator", []> { + let summary = ""; + + let arguments = (ins + TF_Int64Tensor:$seed, + TF_Int64Tensor:$seed2 + ); + + let results = (outs + Res:$handle, + TF_VariantTensor:$deleter + ); +} + +def TF_AnonymousSeedGeneratorOp : TF_Op<"AnonymousSeedGenerator", []> { + let summary = ""; + + let arguments = (ins + TF_Int64Tensor:$seed, + TF_Int64Tensor:$seed2, + TF_BoolTensor:$reshuffle + ); + + let results = (outs + Res:$handle, + TF_VariantTensor:$deleter + ); +} + def TF_AnyOp : TF_Op<"Any", [NoSideEffect]> { let summary = [{ Computes the "logical or" of elements across dimensions of a tensor. @@ -350,14 +392,14 @@ retained with length 1. 
}]; let arguments = (ins - I1Tensor:$input, + TF_BoolTensor:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - I1Tensor:$output + TF_BoolTensor:$output ); TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; @@ -369,14 +411,14 @@ def TF_ApproximateEqualOp : TF_Op<"ApproximateEqual", [Commutative, NoSideEffect let summary = "Returns the truth value of abs(x-y) < tolerance element-wise."; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, DefaultValuedAttr:$tolerance ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -402,7 +444,7 @@ Usage: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$dimension ); @@ -435,7 +477,7 @@ Usage: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$dimension ); @@ -467,7 +509,7 @@ array([b'3.14', b'2.72'], dtype=object) }]; let arguments = (ins - TensorOf<[F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$input, + TensorOf<[TF_Bool, TF_Complex128, TF_Complex64, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$input, DefaultValuedAttr:$precision, DefaultValuedAttr:$scientific, @@ -505,11 +547,11 @@ tf.math.asin(y) # [1.047, 0.785] = x }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -549,7 +591,7 @@ If `condition` evaluates to false, print the list of tensors in `data`. }]; let arguments = (ins - I1Tensor:$condition, + TF_BoolTensor:$condition, Variadic:$data, DefaultValuedAttr:$summarize @@ -571,7 +613,7 @@ see the incremented value or a subsequent newer one. 
}]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_Tensor:$value ); @@ -589,7 +631,7 @@ see the decremented value or a subsequent newer one. }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_Tensor:$value ); @@ -607,7 +649,7 @@ this value or a subsequent newer value of the variable. }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_Tensor:$value ); @@ -638,11 +680,11 @@ tf.math.atan(y) # [1.047, 0.785] = x }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -663,12 +705,12 @@ where \(r = \sqrt(x^2 + y^2) \). }]; let arguments = (ins - TF_FpTensor:$y, - TF_FpTensor:$x + TF_FloatTensor:$y, + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$z + TF_FloatTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -710,7 +752,7 @@ window in `value`. }]; let arguments = (ins - TF_FpTensor:$value, + TF_FloatTensor:$value, Confined]>:$ksize, Confined]>:$strides, @@ -719,7 +761,7 @@ window in `value`. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -734,7 +776,7 @@ Each entry in `output` is the mean of the corresponding size `ksize` window in }]; let arguments = (ins - TF_FpTensor:$input, + TF_FloatTensor:$input, Confined]>:$ksize, Confined]>:$strides, @@ -743,7 +785,7 @@ Each entry in `output` is the mean of the corresponding size `ksize` window in ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -753,8 +795,8 @@ def TF_AvgPool3DGradOp : TF_Op<"AvgPool3DGrad", [NoSideEffect]> { let summary = "Computes gradients of average pooling function."; let arguments = (ins - I32Tensor:$orig_input_shape, - TF_FpTensor:$grad, + TF_Int32Tensor:$orig_input_shape, + TF_FloatTensor:$grad, Confined]>:$ksize, Confined]>:$strides, @@ -763,7 +805,7 @@ def TF_AvgPool3DGradOp : TF_Op<"AvgPool3DGrad", [NoSideEffect]> { ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; @@ -773,8 +815,8 @@ def TF_AvgPoolGradOp : TF_Op<"AvgPoolGrad", [NoSideEffect]> { let summary = "Computes gradients of the average pooling function."; let arguments = (ins - I32Tensor:$orig_input_shape, - TF_FpTensor:$grad, + TF_Int32Tensor:$orig_input_shape, + TF_FloatTensor:$grad, Confined]>:$ksize, Confined]>:$strides, @@ -783,7 +825,7 @@ def TF_AvgPoolGradOp : TF_Op<"AvgPoolGrad", [NoSideEffect]> { ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; @@ -814,15 +856,15 @@ It is computed as: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, 
TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$y, DefaultValuedAttr:$adj_x, DefaultValuedAttr:$adj_y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -859,15 +901,15 @@ about broadcasting }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$y, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64]>:$y, DefaultValuedAttr:$adj_x, DefaultValuedAttr:$adj_y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -887,18 +929,18 @@ This op is deprecated. Prefer `tf.nn.batch_normalization`. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$t, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$m, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$gamma, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$t, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$m, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$gamma, F32Attr:$variance_epsilon, BoolAttr:$scale_after_normalization ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$result + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, 
TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$result ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -999,7 +1041,7 @@ beta function. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect]> { +def TF_BiasAddOp : TF_Op<"BiasAdd", [NoSideEffect, TF_ContractionFusableInterface]> { let summary = "Adds `bias` to `value`."; let description = [{ @@ -1008,18 +1050,23 @@ Broadcasting is supported, so `value` may have any number of dimensions. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$value, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$bias, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$value, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$bias, DefaultValuedAttr:$data_format ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + let extraClassDeclaration = [{ + // TF_ContractionFusableInterface: + Optional GetContractionFusion(); + }]; + let verifier = [{ return Verify(*this); }]; @@ -1037,13 +1084,13 @@ the feature dimension is the third-to-last. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out_backprop, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out_backprop, DefaultValuedAttr:$data_format ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1064,12 +1111,12 @@ Broadcasting is supported, so `value` may have any number of dimensions. 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$value, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$bias + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$value, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$bias ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1373,13 +1420,13 @@ then the output will be }]; let arguments = (ins - TensorOf<[F32, F64, I32, I64]>:$input, + TensorOf<[TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$input, F32ArrayAttr:$boundaries ); let results = (outs - I32Tensor:$output + TF_Int32Tensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1408,11 +1455,11 @@ def TF_CeilOp : TF_Op<"Ceil", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns element-wise smallest integer not less than x."; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1427,13 +1474,13 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. }]; let arguments = (ins - TF_FpTensor:$tensor, + TF_FloatTensor:$tensor, StrAttr:$message ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1461,11 +1508,11 @@ case it might be faster to use the CPU. }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$input ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1482,13 +1529,13 @@ greater than `clip_value_max` are set to `clip_value_max`. 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$t, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$clip_value_min, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$clip_value_max + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$t, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$clip_value_min, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$clip_value_max ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1507,7 +1554,7 @@ def TF_CollectiveBcastRecvOp : TF_Op<"CollectiveBcastRecv", []> { ); let results = (outs - TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + TensorOf<[TF_Bool, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$data ); TF_DerivedResultTypeAttr T = TF_DerivedResultTypeAttr<0>; @@ -1517,7 +1564,7 @@ def TF_CollectiveBcastSendOp : TF_Op<"CollectiveBcastSend", []> { let summary = "Broadcasts a tensor value to one or more other devices."; let arguments = (ins - TensorOf<[F16, F32, F64, I1, I32, I64]>:$input, + TensorOf<[TF_Bool, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$input, I64Attr:$group_size, I64Attr:$group_key, @@ -1528,7 +1575,7 @@ def TF_CollectiveBcastSendOp : TF_Op<"CollectiveBcastSend", []> { ); let results = (outs - TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + TensorOf<[TF_Bool, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$data ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1540,7 +1587,7 @@ Mutually accumulates multiple tensors of identical type and shape. }]; let arguments = (ins - TensorOf<[F16, F32, F64, I32, I64]>:$input, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$input, I64Attr:$group_size, I64Attr:$group_key, @@ -1551,7 +1598,7 @@ Mutually accumulates multiple tensors of identical type and shape. ); let results = (outs - TensorOf<[F16, F32, F64, I32, I64]>:$data + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$data ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1563,7 +1610,7 @@ Mutually reduces multiple tensors of identical type and shape. }]; let arguments = (ins - TensorOf<[F16, F32, F64, I32, I64]>:$input, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$input, I64Attr:$group_size, I64Attr:$group_key, @@ -1577,7 +1624,7 @@ Mutually reduces multiple tensors of identical type and shape. 
); let results = (outs - TensorOf<[F16, F32, F64, I32, I64]>:$data + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$data ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1589,10 +1636,10 @@ Mutually reduces multiple tensors of identical type and shape. }]; let arguments = (ins - TensorOf<[F16, F32, F64, I32, I64]>:$input, - I32Tensor:$group_size, - I32Tensor:$group_key, - I32Tensor:$instance_key, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$input, + TF_Int32Tensor:$group_size, + TF_Int32Tensor:$group_key, + TF_Int32Tensor:$instance_key, TF_AnyStrAttrOf<["Min", "Max", "Mul", "Add"]>:$merge_op, TF_AnyStrAttrOf<["Id", "Div"]>:$final_op, @@ -1600,7 +1647,7 @@ Mutually reduces multiple tensors of identical type and shape. ); let results = (outs - TensorOf<[F16, F32, F64, I32, I64]>:$data + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$data ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1665,7 +1712,7 @@ def TF_ConcatOp : TF_Op<"Concat", [NoSideEffect]> { let summary = "Concatenates tensors along one dimension."; let arguments = (ins - I32Tensor:$concat_dim, + TF_Int32Tensor:$concat_dim, Variadic:$values ); @@ -1700,12 +1747,12 @@ This is typically used by gradient computations for a concat operation. }]; let arguments = (ins - I32Tensor:$concat_dim, - Variadic:$shape + TF_Int32Tensor:$concat_dim, + Variadic:$shape ); let results = (outs - Variadic:$offset + Variadic:$offset ); TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<1>; @@ -1740,6 +1787,34 @@ def TF_ConcatV2Op : TF_Op<"ConcatV2", [NoSideEffect]> { let hasCanonicalizer = 1; } +def TF_ConfigureDistributedTPUOp : TF_Op<"ConfigureDistributedTPU", []> { + let summary = [{ +Sets up the centralized structures for a distributed TPU system. + }]; + + let arguments = (ins + StrAttr:$embedding_config, + StrAttr:$tpu_embedding_config, + DefaultValuedAttr:$is_global_init, + DefaultValuedAttr:$enable_whole_mesh_compilations, + DefaultValuedAttr:$compilation_failure_closes_chips + ); + + let results = (outs + TF_StrTensor:$topology + ); +} + +def TF_ConfigureTPUEmbeddingOp : TF_Op<"ConfigureTPUEmbedding", []> { + let summary = "Sets up TPUEmbedding in a distributed TPU system."; + + let arguments = (ins + StrAttr:$config + ); + + let results = (outs); +} + def TF_ConjOp : TF_Op<"Conj", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns the complex conjugate of a complex number."; @@ -1826,8 +1901,8 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32]>:$input, - TensorOf<[BF16, F16, F32, F64, I32]>:$filter, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32]>:$filter, I64ArrayAttr:$strides, DefaultValuedAttr:$use_cudnn_on_gpu, @@ -1838,7 +1913,7 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1862,9 +1937,9 @@ Computes the gradients of convolution with respect to the filter. 
}]; let arguments = (ins - TF_FpTensor:$input, - I32Tensor:$filter_sizes, - TF_FpTensor:$out_backprop, + TF_FloatTensor:$input, + TF_Int32Tensor:$filter_sizes, + TF_FloatTensor:$out_backprop, I64ArrayAttr:$strides, DefaultValuedAttr:$use_cudnn_on_gpu, @@ -1875,7 +1950,7 @@ Computes the gradients of convolution with respect to the filter. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1895,9 +1970,9 @@ Computes the gradients of convolution with respect to the input. }]; let arguments = (ins - I32Tensor:$input_sizes, - TensorOf<[BF16, F16, F32, F64, I32]>:$filter, - TensorOf<[BF16, F16, F32, F64, I32]>:$out_backprop, + TF_Int32Tensor:$input_sizes, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32]>:$filter, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32]>:$out_backprop, I64ArrayAttr:$strides, DefaultValuedAttr:$use_cudnn_on_gpu, @@ -1908,7 +1983,7 @@ Computes the gradients of convolution with respect to the input. ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; @@ -1940,8 +2015,8 @@ Our Conv3D implements a form of cross-correlation. }]; let arguments = (ins - TF_FpTensor:$input, - TF_FpTensor:$filter, + TF_FloatTensor:$input, + TF_FloatTensor:$filter, Confined]>:$strides, TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, @@ -1950,7 +2025,7 @@ Our Conv3D implements a form of cross-correlation. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1966,9 +2041,9 @@ Computes the gradients of 3-D convolution with respect to the filter. }]; let arguments = (ins - TF_FpTensor:$input, - I32Tensor:$filter_sizes, - TF_FpTensor:$out_backprop, + TF_FloatTensor:$input, + TF_Int32Tensor:$filter_sizes, + TF_FloatTensor:$out_backprop, Confined]>:$strides, TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, @@ -1977,7 +2052,7 @@ Computes the gradients of 3-D convolution with respect to the filter. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1990,8 +2065,8 @@ Computes the gradients of 3-D convolution with respect to the input. let arguments = (ins TF_I32OrI64Tensor:$input_sizes, - TF_FpTensor:$filter, - TF_FpTensor:$out_backprop, + TF_FloatTensor:$filter, + TF_FloatTensor:$out_backprop, Confined]>:$strides, TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, @@ -2000,7 +2075,7 @@ Computes the gradients of 3-D convolution with respect to the input. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; @@ -2079,7 +2154,7 @@ of corresponding 3-element vectors is cross-multiplied independently. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { +def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>, TF_NoConstantFold]> { let summary = "An Op to sum inputs across replicated TPU instances."; let description = [{ @@ -2092,12 +2167,12 @@ and `B, D, F, H` as group 1. 
Thus we get the outputs: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, I32, TF_Uint32]>:$input, - I32Tensor:$group_assignment + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Int32, TF_Uint32]>:$input, + TF_Int32Tensor:$group_assignment ); let results = (outs - TensorOf<[BF16, F16, F32, I32, TF_Uint32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Int32, TF_Uint32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2140,7 +2215,7 @@ tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, TF_I32OrI64Tensor:$axis, DefaultValuedAttr:$exclusive, @@ -2148,7 +2223,7 @@ tf.cumprod([a, b, c], exclusive=True, reverse=True) # => [b * c, c, 1] ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2194,7 +2269,7 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, TF_I32OrI64Tensor:$axis, DefaultValuedAttr:$exclusive, @@ -2202,7 +2277,7 @@ tf.cumsum([a, b, c], exclusive=True, reverse=True) # => [b + c, c, 0] ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$out ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2339,7 +2414,7 @@ decoding partial jpeg image. let arguments = (ins TF_StrTensor:$contents, - I32Tensor:$crop_window, + TF_Int32Tensor:$crop_window, DefaultValuedAttr:$channels, DefaultValuedAttr:$ratio, @@ -2452,6 +2527,64 @@ is the same, though it is cleaner to use `tf.io.decode_image`. 
TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } +def TF_DeleteIteratorOp : TF_Op<"DeleteIterator", []> { + let summary = "A container for an iterator resource."; + + let arguments = (ins + Arg:$handle, + TF_VariantTensor:$deleter + ); + + let results = (outs); +} + +def TF_DeleteMemoryCacheOp : TF_Op<"DeleteMemoryCache", []> { + let summary = ""; + + let arguments = (ins + Arg:$handle, + TF_VariantTensor:$deleter + ); + + let results = (outs); +} + +def TF_DeleteMultiDeviceIteratorOp : TF_Op<"DeleteMultiDeviceIterator", []> { + let summary = "A container for an iterator resource."; + + let arguments = (ins + Arg:$multi_device_iterator, + Arg, "", [TF_DatasetIteratorRead]>:$iterators, + TF_VariantTensor:$deleter + ); + + let results = (outs); + + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<1>; +} + +def TF_DeleteRandomSeedGeneratorOp : TF_Op<"DeleteRandomSeedGenerator", []> { + let summary = ""; + + let arguments = (ins + Arg:$handle, + TF_VariantTensor:$deleter + ); + + let results = (outs); +} + +def TF_DeleteSeedGeneratorOp : TF_Op<"DeleteSeedGenerator", []> { + let summary = ""; + + let arguments = (ins + Arg:$handle, + TF_VariantTensor:$deleter + ); + + let results = (outs); +} + def TF_DepthToSpaceOp : TF_Op<"DepthToSpace", [NoSideEffect]> { let summary = "DepthToSpace for tensors of type T."; @@ -2588,8 +2721,8 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. }]; let arguments = (ins - TF_FpTensor:$input, - TF_FpTensor:$filter, + TF_FloatTensor:$input, + TF_FloatTensor:$filter, I64ArrayAttr:$strides, TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, @@ -2599,7 +2732,7 @@ horizontal and vertices strides, `strides = [1, stride, stride, 1]`. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2611,9 +2744,9 @@ Computes the gradients of depthwise convolution with respect to the filter. }]; let arguments = (ins - TF_FpTensor:$input, - I32Tensor:$filter_sizes, - TF_FpTensor:$out_backprop, + TF_FloatTensor:$input, + TF_Int32Tensor:$filter_sizes, + TF_FloatTensor:$out_backprop, I64ArrayAttr:$strides, TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, @@ -2623,7 +2756,7 @@ Computes the gradients of depthwise convolution with respect to the filter. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2635,9 +2768,9 @@ Computes the gradients of depthwise convolution with respect to the input. }]; let arguments = (ins - I32Tensor:$input_sizes, - TF_FpTensor:$filter, - TF_FpTensor:$out_backprop, + TF_Int32Tensor:$input_sizes, + TF_FloatTensor:$filter, + TF_FloatTensor:$out_backprop, I64ArrayAttr:$strides, TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, @@ -2647,12 +2780,42 @@ Computes the gradients of depthwise convolution with respect to the input. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; } +def TF_DeserializeIteratorOp : TF_Op<"DeserializeIterator", []> { + let summary = [{ +Converts the given variant tensor to an iterator and stores it in the given resource. 
+ }]; + + let arguments = (ins + Arg:$resource_handle, + TF_VariantTensor:$serialized + ); + + let results = (outs); +} + +def TF_DestroyResourceOp : TF_Op<"DestroyResourceOp", []> { + let summary = "Deletes the resource specified by the handle."; + + let description = [{ +All subsequent operations using the resource will result in a NotFound +error status. + }]; + + let arguments = (ins + TF_ResourceTensor:$resource, + + DefaultValuedAttr:$ignore_lookup_error + ); + + let results = (outs); +} + def TF_DeviceIndexOp : TF_Op<"DeviceIndex", [NoSideEffect]> { let summary = "Return the index of device the op runs."; @@ -2668,7 +2831,7 @@ this op runs. The length of the list is returned in two cases: ); let results = (outs - I32Tensor:$index + TF_Int32Tensor:$index ); } @@ -2696,11 +2859,11 @@ tf.diag(diagonal) ==> [[1, 0, 0, 0] }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$diagonal + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$diagonal ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2731,11 +2894,11 @@ tf.diag_part(input) ==> [1, 2, 3, 4] }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$input + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$input ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$diagonal + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$diagonal ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2751,11 +2914,11 @@ Computes Psi, the derivative of Lgamma (the log of the absolute value of }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2771,12 +2934,12 @@ def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape, TF_SameOpe }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -2786,25 +2949,24 @@ def TF_DivOp : TF_Op<"Div", [NoSideEffect, ResultsBroadcastableShape, TF_SameOpe let hasFolder = 1; } -def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns 0 if the denominator is zero."; 
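As an illustrative aside (assuming TensorFlow 2.x), the Diag/DiagPart pair above round-trips a vector through a diagonal matrix; `tf.linalg.diag` lowers to the newer MatrixDiag family rather than the legacy Diag kernel, but for a rank-1 input the result matches the 4x4 example in the description.

```python
import tensorflow as tf

d = tf.constant([1, 2, 3, 4])
m = tf.linalg.diag(d)          # 4x4 matrix with [1, 2, 3, 4] on the main diagonal
print(m)
print(tf.linalg.diag_part(m))  # recovers [1, 2, 3, 4]
```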
+def TF_DummyMemoryCacheOp : TF_Op<"DummyMemoryCache", []> { + let summary = ""; - let description = [{ -*NOTE*: `DivNoNan` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y - ); + let arguments = (ins); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$z + Res:$handle ); +} - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +def TF_DummySeedGeneratorOp : TF_Op<"DummySeedGenerator", []> { + let summary = ""; + + let arguments = (ins); + + let results = (outs + Res:$handle + ); } def TF_DynamicStitchOp : TF_Op<"DynamicStitch", [NoSideEffect, SameVariadicOperandSize]> { @@ -2878,7 +3040,7 @@ as illustrated on the following example: }]; let arguments = (ins - Variadic:$indices, + Variadic:$indices, Variadic:$data ); @@ -3006,11 +3168,11 @@ See [Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) }]; let arguments = (ins - TF_FpTensor:$features + TF_FloatTensor:$features ); let results = (outs - TF_FpTensor:$activations + TF_FloatTensor:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3022,12 +3184,12 @@ Computes gradients for the exponential linear (Elu) operation. }]; let arguments = (ins - TF_FpTensor:$gradients, - TF_FpTensor:$outputs + TF_FloatTensor:$gradients, + TF_FloatTensor:$outputs ); let results = (outs - TF_FpTensor:$backprops + TF_FloatTensor:$backprops ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3041,7 +3203,7 @@ This operation creates a tensor of `shape` and `dtype`. }]; let arguments = (ins - I32Tensor:$shape, + TF_Int32Tensor:$shape, DefaultValuedAttr:$init ); @@ -3165,21 +3327,20 @@ tf.math.equal(x, y) ==> array([True, True]) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Quint16, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Quint16, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, DefaultValuedAttr:$incompatible_shape_error ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value x, " - "Value y, BoolAttr incompatible_shape_error"> + OpBuilder<"Value x, Value y, BoolAttr incompatible_shape_error"> ]; let verifier = [{ @@ -3191,11 +3352,11 @@ def TF_ErfOp : TF_Op<"Erf", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the Gauss error function of `x` element-wise."; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3207,11 +3368,11 @@ Computes the 
complementary error function of `x` element-wise. }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3221,11 +3382,11 @@ def TF_ErfinvOp : TF_Op<"Erfinv", [NoSideEffect]> { let summary = ""; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3325,8 +3486,7 @@ size 1. TF_DerivedOperandTypeAttr Tdim = TF_DerivedOperandTypeAttr<1>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value condition, " - "Value dim"> + OpBuilder<"Value condition, Value dim"> ]; } @@ -3366,7 +3526,7 @@ Extract `patches` from `images` and put them in the "depth" output dimension. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$images, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$images, Confined]>:$ksizes, Confined]>:$strides, @@ -3375,7 +3535,7 @@ Extract `patches` from `images` and put them in the "depth" output dimension. ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$patches + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$patches ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3486,6 +3646,28 @@ Quantization is called fake since the output is still in floating point. }]; let arguments = (ins + TF_Float32Tensor:$inputs, + + DefaultValuedAttr:$min, + DefaultValuedAttr:$max, + DefaultValuedAttr:$num_bits, + DefaultValuedAttr:$narrow_range + ); + + let results = (outs + TF_Float32Tensor:$outputs + ); + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_FakeQuantWithMinMaxArgsGradientOp : TF_Op<"FakeQuantWithMinMaxArgsGradient", [NoSideEffect, SameOperandsAndResultType]> { + let summary = "Compute gradients for a FakeQuantWithMinMaxArgs operation."; + + let arguments = (ins + F32Tensor:$gradients, F32Tensor:$inputs, DefaultValuedAttr:$min, @@ -3495,12 +3677,8 @@ Quantization is called fake since the output is still in floating point. ); let results = (outs - F32Tensor:$outputs + F32Tensor:$backprops ); - - let verifier = [{ - return Verify(*this); - }]; } def TF_FakeQuantWithMinMaxVarsOp : TF_Op<"FakeQuantWithMinMaxVars", [NoSideEffect]> { @@ -3536,6 +3714,28 @@ values. }]; let arguments = (ins + TF_Float32Tensor:$inputs, + TF_Float32Tensor:$min, + TF_Float32Tensor:$max, + + DefaultValuedAttr:$num_bits, + DefaultValuedAttr:$narrow_range + ); + + let results = (outs + TF_Float32Tensor:$outputs + ); + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_FakeQuantWithMinMaxVarsGradientOp : TF_Op<"FakeQuantWithMinMaxVarsGradient", [NoSideEffect]> { + let summary = "Compute gradients for a FakeQuantWithMinMaxVars operation."; + + let arguments = (ins + F32Tensor:$gradients, F32Tensor:$inputs, F32Tensor:$min, F32Tensor:$max, @@ -3545,12 +3745,10 @@ values. 
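A short sketch of the FakeQuantWithMinMaxArgs op defined above, assuming a stock TensorFlow 2.x install: inputs are clamped to `[min, max]` and snapped to the `num_bits` quantization grid while staying in floating point.

```python
import tensorflow as tf

x = tf.constant([-6.5, -0.1, 0.0, 0.1, 5.9, 6.5])
y = tf.quantization.fake_quant_with_min_max_args(
    x, min=-6.0, max=6.0, num_bits=8, narrow_range=False)
print(y)  # clamped to [-6, 6] and rounded to the nearest 8-bit step
```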
); let results = (outs - F32Tensor:$outputs + F32Tensor:$backprops_wrt_input, + F32Tensor:$backprop_wrt_min, + F32Tensor:$backprop_wrt_max ); - - let verifier = [{ - return Verify(*this); - }]; } def TF_FakeQuantWithMinMaxVarsPerChannelOp : TF_Op<"FakeQuantWithMinMaxVarsPerChannel", [NoSideEffect]> { @@ -3587,16 +3785,16 @@ values. }]; let arguments = (ins - F32Tensor:$inputs, - F32Tensor:$min, - F32Tensor:$max, + TF_Float32Tensor:$inputs, + TF_Float32Tensor:$min, + TF_Float32Tensor:$max, DefaultValuedAttr:$num_bits, DefaultValuedAttr:$narrow_range ); let results = (outs - F32Tensor:$outputs + TF_Float32Tensor:$outputs ); let verifier = [{ @@ -3647,20 +3845,20 @@ fill([2, 3], 9) ==> [[9, 9, 9] let hasFolder = 1; - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value dims, Value value" - >]; + let builders = [ + OpBuilder<"Value dims, Value value"> + ]; } def TF_FloorOp : TF_Op<"Floor", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Returns element-wise largest integer not greater than x."; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3676,12 +3874,12 @@ def TF_FloorDivOp : TF_Op<"FloorDiv", [NoSideEffect, ResultsBroadcastableShape]> }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3702,12 +3900,12 @@ with a flooring divide. E.g. `floor(x / y) * y + mod(x, y) = x`. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Uint64]>:$x, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Uint64]>:$y + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64, TF_Uint64]>:$x, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64, TF_Uint64]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Uint64]>:$z + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64, TF_Uint64]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3722,11 +3920,11 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. }]; let arguments = (ins - F32Tensor:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, + TF_Float32Tensor:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$offset, + TF_Float32Tensor:$mean, + TF_Float32Tensor:$variance, DefaultValuedAttr:$epsilon, DefaultValuedAttr:$exponential_avg_factor, @@ -3735,15 +3933,17 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. 
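The FloorDiv/FloorMod descriptions above rely on the identity `floor(x / y) * y + mod(x, y) = x`; a small check under a stock TensorFlow 2.x runtime:

```python
import tensorflow as tf

x = tf.constant([7, -7, 7, -7])
y = tf.constant([3, 3, -3, -3])

q = tf.math.floordiv(x, y)  # [ 2, -3, -3,  2]
r = tf.math.floormod(x, y)  # [ 1,  2, -2, -1]
print(q * y + r)            # reconstructs x: [ 7, -7,  7, -7]
```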
); let results = (outs - F32Tensor:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2 + TF_Float32Tensor:$y, + TF_Float32Tensor:$batch_mean, + TF_Float32Tensor:$batch_variance, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2 ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + let hasCanonicalizer = 1; + let verifier = [{ return Verify(*this); }]; @@ -3758,11 +3958,11 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. }]; let arguments = (ins - F32Tensor:$y_backprop, - F32Tensor:$x, - F32Tensor:$scale, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, + TF_Float32Tensor:$y_backprop, + TF_Float32Tensor:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2, DefaultValuedAttr:$epsilon, DefaultValuedAttr:$data_format, @@ -3770,11 +3970,11 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. ); let results = (outs - F32Tensor:$x_backprop, - F32Tensor:$scale_backprop, - F32Tensor:$offset_backprop, - F32Tensor:$reserve_space_3, - F32Tensor:$reserve_space_4 + TF_Float32Tensor:$x_backprop, + TF_Float32Tensor:$scale_backprop, + TF_Float32Tensor:$offset_backprop, + TF_Float32Tensor:$reserve_space_3, + TF_Float32Tensor:$reserve_space_4 ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3789,11 +3989,11 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. }]; let arguments = (ins - TensorOf<[BF16, F16, F32]>:$y_backprop, - TensorOf<[BF16, F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$y_backprop, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2, DefaultValuedAttr:$epsilon, DefaultValuedAttr:$data_format, @@ -3801,11 +4001,11 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. ); let results = (outs - TensorOf<[BF16, F16, F32]>:$x_backprop, - F32Tensor:$scale_backprop, - F32Tensor:$offset_backprop, - F32Tensor:$reserve_space_3, - F32Tensor:$reserve_space_4 + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$x_backprop, + TF_Float32Tensor:$scale_backprop, + TF_Float32Tensor:$offset_backprop, + TF_Float32Tensor:$reserve_space_3, + TF_Float32Tensor:$reserve_space_4 ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3821,24 +4021,24 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. 
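For orientation (assuming TensorFlow 2.x), the arithmetic behind the FusedBatchNorm family above can be written with the unfused primitive; every 1-D operand has length C, matching the NHWC channel dimension as the descriptions state.

```python
import tensorflow as tf

x = tf.random.normal([2, 4, 4, 3])  # NHWC, C = 3
scale = tf.ones([3])
offset = tf.zeros([3])
mean, variance = tf.nn.moments(x, axes=[0, 1, 2])

y = tf.nn.batch_normalization(x, mean, variance, offset, scale, variance_epsilon=1e-3)
print(y.shape)  # (2, 4, 4, 3)
```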
}]; let arguments = (ins - TensorOf<[BF16, F16, F32]>:$y_backprop, - TensorOf<[BF16, F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$y_backprop, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2, + TF_Float32Tensor:$reserve_space_3, DefaultValuedAttr:$epsilon, - DefaultValuedAttr:$data_format, + DefaultValuedAttr, "NHWC">:$data_format, DefaultValuedAttr:$is_training ); let results = (outs - TensorOf<[BF16, F16, F32]>:$x_backprop, - F32Tensor:$scale_backprop, - F32Tensor:$offset_backprop, - F32Tensor:$reserve_space_4, - F32Tensor:$reserve_space_5 + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$x_backprop, + TF_Float32Tensor:$scale_backprop, + TF_Float32Tensor:$offset_backprop, + TF_Float32Tensor:$reserve_space_4, + TF_Float32Tensor:$reserve_space_5 ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -3853,6 +4053,95 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors. }]; } +def TF_FusedBatchNormV2Op : TF_Op<"FusedBatchNormV2", [NoSideEffect, TF_FoldOperandsTransposeInterface, TF_LayoutSensitiveInterface]> { + let summary = "Batch normalization."; + + let description = [{ +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. + }]; + + let arguments = (ins + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$offset, + TF_Float32Tensor:$mean, + TF_Float32Tensor:$variance, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$y, + TF_Float32Tensor:$batch_mean, + TF_Float32Tensor:$batch_variance, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + + // TF_LayoutSensitiveInterface: + StringRef GetOptimalLayout(const RuntimeDevices& devices); + LogicalResult UpdateDataFormat(StringRef data_format); + }]; +} + +def TF_FusedBatchNormV3Op : TF_Op<"FusedBatchNormV3", [NoSideEffect, TF_FoldOperandsTransposeInterface, TF_LayoutSensitiveInterface]> { + let summary = "Batch normalization."; + + let description = [{ +Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". +The size of 1D Tensors matches the dimension C of the 4D Tensors. 
+ }]; + + let arguments = (ins + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$offset, + TF_Float32Tensor:$mean, + TF_Float32Tensor:$variance, + + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$exponential_avg_factor, + DefaultValuedAttr, "NHWC">:$data_format, + DefaultValuedAttr:$is_training + ); + + let results = (outs + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$y, + TF_Float32Tensor:$batch_mean, + TF_Float32Tensor:$batch_variance, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2, + TF_Float32Tensor:$reserve_space_3 + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; + + let extraClassDeclaration = [{ + // TF_FoldOperandsTransposeInterface: + SmallVector GetLayoutDependentArgs() { return {0}; } + SmallVector GetLayoutDependentResults() { return {0}; } + LogicalResult FoldOperandsPermutation(ArrayRef permutation); + + // TF_LayoutSensitiveInterface: + StringRef GetOptimalLayout(const RuntimeDevices& devices); + LogicalResult UpdateDataFormat(StringRef data_format); + }]; +} + def TF_GatherOp : TF_Op<"Gather", [NoSideEffect]> { let summary = "Gather slices from `params` according to `indices`."; @@ -4107,7 +4396,7 @@ tf.math.greater(x, y) ==> [False, False, True] ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4140,7 +4429,7 @@ tf.math.greater_equal(x, y) ==> [True, False, True, True] ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4158,11 +4447,11 @@ See `rgb_to_hsv` for a description of the HSV encoding. }]; let arguments = (ins - TF_FpTensor:$images + TF_FloatTensor:$images ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4186,7 +4475,7 @@ table will be immutable. ); let results = (outs - TF_ResourceTensor:$table_handle + Res:$table_handle ); } @@ -4268,7 +4557,7 @@ larger, the dimension is padded with zeros. let arguments = (ins TensorOf<[TF_Complex128, TF_Complex64]>:$input, - I32Tensor:$fft_length + TF_Int32Tensor:$fft_length ); let results = (outs @@ -4301,7 +4590,7 @@ the dimension is padded with zeros. let arguments = (ins TensorOf<[TF_Complex128, TF_Complex64]>:$input, - I32Tensor:$fft_length + TF_Int32Tensor:$fft_length ); let results = (outs @@ -4334,7 +4623,7 @@ the dimension is padded with zeros. let arguments = (ins TensorOf<[TF_Complex128, TF_Complex64]>:$input, - I32Tensor:$fft_length + TF_Int32Tensor:$fft_length ); let results = (outs @@ -4491,6 +4780,39 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } +def TF_InTopKV2Op : TF_Op<"InTopKV2", [NoSideEffect]> { + let summary = "Says whether the targets are in the top `K` predictions."; + + let description = [{ +This outputs a `batch_size` bool array, an entry `out[i]` is `true` if the +prediction for the target class is among the top `k` predictions among +all predictions for example `i`. Note that the behavior of `InTopK` differs +from the `TopK` op in its handling of ties; if multiple classes have the +same prediction value and straddle the top-`k` boundary, all of those +classes are considered to be in the top `k`. 
+ +More formally, let + + \\(predictions_i\\) be the predictions for all classes for example `i`, + \\(targets_i\\) be the target class for example `i`, + \\(out_i\\) be the output for example `i`, + +$$out_i = predictions_{i, targets_i} \in TopKIncludingTies(predictions_i)$$ + }]; + + let arguments = (ins + F32Tensor:$predictions, + TF_I32OrI64Tensor:$targets, + TF_I32OrI64Tensor:$k + ); + + let results = (outs + I1Tensor:$precision + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + def TF_InfeedDequeueOp : TF_Op<"InfeedDequeue", []> { let summary = [{ A placeholder op for a value that will be fed into the computation. @@ -4507,33 +4829,21 @@ A placeholder op for a value that will be fed into the computation. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } -def TF_InitializeTableFromTextFileV2Op : TF_Op<"InitializeTableFromTextFileV2", []> { - let summary = "Initializes a table from a text file."; - - let description = [{ -It inserts one key-value pair into the table for each line of the file. -The key and value is extracted from the whole line content, elements from the -split line based on `delimiter` or the line number (starting from zero). -Where to extract the key and value from a line is specified by `key_index` and -`value_index`. - -- A value of -1 means use the line number(starting from zero), expects `int64`. -- A value of -2 means use the whole line content, expects `string`. -- A value >= 0 means use the index (starting at zero) of the split line based - on `delimiter`. +def TF_InitializeTableV2Op : TF_Op<"InitializeTableV2", []> { + let summary = [{ +Table initializer that takes two tensors for keys and values respectively. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, - TF_StrTensor:$filename, - - Confined]>:$key_index, - Confined]>:$value_index, - Confined, [IntMinValue<-1>]>:$vocab_size, - DefaultValuedAttr:$delimiter + Arg:$table_handle, + TF_Tensor:$keys, + TF_Tensor:$values ); let results = (outs); + + TF_DerivedOperandTypeAttr Tval = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr Tkey = TF_DerivedOperandTypeAttr<1>; } def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { @@ -4548,7 +4858,7 @@ operation create / operate on a copy of `x`. let arguments = (ins TF_Tensor:$x, - I32Tensor:$i, + TF_Int32Tensor:$i, TF_Tensor:$v ); @@ -4567,11 +4877,11 @@ I.e., \\(y = 1 / x\\). 
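A minimal sketch of the InTopKV2 tie handling described above, under a stock TensorFlow 2.x runtime:

```python
import tensorflow as tf

predictions = tf.constant([[0.1, 0.9, 0.9],   # classes 1 and 2 tie at the top
                           [0.8, 0.1, 0.1]])
targets = tf.constant([2, 1])

# Ties that straddle the top-k boundary all count as "in the top k".
print(tf.math.in_top_k(targets, predictions, k=1))  # [True, False]
```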
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4690,11 +5000,11 @@ tf.math.is_finite(x) ==> [True, True, True, False, False] }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - I1Tensor:$y + TF_BoolTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4717,11 +5027,11 @@ tf.math.is_inf(x) ==> [False, True, False, True] }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - I1Tensor:$y + TF_BoolTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4744,21 +5054,68 @@ tf.math.is_nan(x) ==> [False, True, False, True, False] }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - I1Tensor:$y + TF_BoolTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_IteratorOp : TF_Op<"Iterator", []> { + let summary = "A container for an iterator resource."; + + let arguments = (ins + StrAttr:$shared_name, + StrAttr:$container, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + Res:$handle + ); +} + +def TF_IteratorFromStringHandleOp : TF_Op<"IteratorFromStringHandle", []> { + let summary = [{ +Converts the given string representing a handle to an iterator to a resource. + }]; + + let arguments = (ins + TF_StrTensor:$string_handle, + + DefaultValuedAttr:$output_types, + DefaultValuedAttr:$output_shapes + ); + + let results = (outs + Res:$resource_handle + ); +} + +def TF_IteratorFromStringHandleV2Op : TF_Op<"IteratorFromStringHandleV2", []> { + let summary = ""; + + let arguments = (ins + TF_StrTensor:$string_handle, + + DefaultValuedAttr:$output_types, + DefaultValuedAttr:$output_shapes + ); + + let results = (outs + Res:$resource_handle + ); +} + def TF_IteratorGetNextOp : TF_Op<"IteratorGetNext", []> { let summary = "Gets the next output from the given iterator ."; let arguments = (ins - TF_ResourceTensor:$iterator + Arg:$iterator ); let results = (outs @@ -4769,6 +5126,74 @@ def TF_IteratorGetNextOp : TF_Op<"IteratorGetNext", []> { TF_DerivedResultTypeListAttr output_types = TF_DerivedResultTypeListAttr<0>; } +def TF_IteratorGetNextAsOptionalOp : TF_Op<"IteratorGetNextAsOptional", []> { + let summary = [{ +Gets the next output from the given iterator as an Optional variant. + }]; + + let arguments = (ins + Arg:$iterator, + + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$optional + ); +} + +def TF_IteratorGetNextSyncOp : TF_Op<"IteratorGetNextSync", []> { + let summary = "Gets the next output from the given iterator."; + + let description = [{ +This operation is a synchronous version IteratorGetNext. It should only be used +in situations where the iterator does not block the calling thread, or where +the calling thread is not a member of the thread pool used to execute parallel +operations (e.g. in eager mode). 
+ }]; + + let arguments = (ins + Arg:$iterator + ); + + let results = (outs + Variadic:$components + ); + + TF_DerivedResultShapeListAttr output_shapes = TF_DerivedResultShapeListAttr<0>; + TF_DerivedResultTypeListAttr output_types = TF_DerivedResultTypeListAttr<0>; +} + +def TF_IteratorToStringHandleOp : TF_Op<"IteratorToStringHandle", []> { + let summary = [{ +Converts the given `resource_handle` representing an iterator to a string. + }]; + + let arguments = (ins + Arg:$resource_handle + ); + + let results = (outs + TF_StrTensor:$string_handle + ); +} + +def TF_IteratorV2Op : TF_Op<"IteratorV2", []> { + let summary = ""; + + let arguments = (ins + StrAttr:$shared_name, + StrAttr:$container, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + Res:$handle + ); +} + def TF_L2LossOp : TF_Op<"L2Loss", [NoSideEffect]> { let summary = "L2 Loss."; @@ -4779,11 +5204,11 @@ Computes half the L2 norm of a tensor without the `sqrt`: }]; let arguments = (ins - TF_FpTensor:$t + TF_FloatTensor:$t ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4807,7 +5232,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag }]; let arguments = (ins - TensorOf<[BF16, F16, F32]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$input, DefaultValuedAttr:$depth_radius, DefaultValuedAttr:$bias, @@ -4816,7 +5241,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag ); let results = (outs - TensorOf<[BF16, F16, F32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4826,9 +5251,9 @@ def TF_LRNGradOp : TF_Op<"LRNGrad", [NoSideEffect]> { let summary = "Gradients for Local Response Normalization."; let arguments = (ins - TensorOf<[BF16, F16, F32]>:$input_grads, - TensorOf<[BF16, F16, F32]>:$input_image, - TensorOf<[BF16, F16, F32]>:$output_image, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$input_grads, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$input_image, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$output_image, DefaultValuedAttr:$depth_radius, DefaultValuedAttr:$bias, @@ -4837,28 +5262,33 @@ def TF_LRNGradOp : TF_Op<"LRNGrad", [NoSideEffect]> { ); let results = (outs - TensorOf<[BF16, F16, F32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType]> { +def TF_LeakyReluOp : TF_Op<"LeakyRelu", [NoSideEffect, SameOperandsAndResultType, TF_ContractionFusableInterface]> { let summary = "Computes rectified linear: `max(features, features * alpha)`."; let arguments = (ins - TF_FpTensor:$features, + TF_FloatTensor:$features, DefaultValuedAttr:$alpha ); let results = (outs - TF_FpTensor:$activations + TF_FloatTensor:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasFolder = 1; + + let extraClassDeclaration = [{ + // TF_ContractionFusableInterface: + Optional GetContractionFusion(); + }]; } def TF_LeakyReluGradOp : TF_Op<"LeakyReluGrad", [NoSideEffect, SameOperandsAndResultType]> { @@ -4867,14 +5297,14 @@ Computes rectified linear gradients for a LeakyRelu operation. 
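The Iterator / MakeIterator / IteratorGetNext resource ops in this stretch of the diff are, roughly, what a plain Python loop over a tf.data pipeline lowers to in TensorFlow 2.x eager mode; a minimal sketch:

```python
import tensorflow as tf

ds = tf.data.Dataset.range(3)
it = iter(ds)            # allocates the iterator resource (MakeIterator)
print(next(it).numpy())  # 0 -- each next() issues an IteratorGetNext
print(next(it).numpy())  # 1
print(next(it).numpy())  # 2
```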
}]; let arguments = (ins - TF_FpTensor:$gradients, - TF_FpTensor:$features, + TF_FloatTensor:$gradients, + TF_FloatTensor:$features, DefaultValuedAttr:$alpha ); let results = (outs - TF_FpTensor:$backprops + TF_FloatTensor:$backprops ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4956,7 +5386,7 @@ tf.math.less(x, y) ==> [False, True, True] ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -4989,7 +5419,7 @@ tf.math.less_equal(x, y) ==> [True, True, True] ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5013,11 +5443,11 @@ tf.math.lgamma(x) ==> [inf, 0.5723649, 0., 2.4537368, inf, -4.6477685] }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5039,13 +5469,13 @@ tf.linspace(10.0, 12.0, 3, name="linspace") => [ 10.0 11.0 12.0] }]; let arguments = (ins - TF_FpTensor:$start, - TF_FpTensor:$stop, + TF_FloatTensor:$start, + TF_FloatTensor:$stop, TF_I32OrI64Tensor:$num ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5157,11 +5587,11 @@ For each batch `i` and class `j` we have }]; let arguments = (ins - TF_FpTensor:$logits + TF_FloatTensor:$logits ); let results = (outs - TF_FpTensor:$logsoftmax + TF_FloatTensor:$logsoftmax ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5177,12 +5607,12 @@ def TF_LogicalAndOp : TF_Op<"LogicalAnd", [Commutative, NoSideEffect, ResultsBro }]; let arguments = (ins - I1Tensor:$x, - I1Tensor:$y + TF_BoolTensor:$x, + TF_BoolTensor:$y ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); } @@ -5190,11 +5620,11 @@ def TF_LogicalNotOp : TF_Op<"LogicalNot", [NoSideEffect, SameOperandsAndResultTy let summary = "Returns the truth value of `NOT x` element-wise."; let arguments = (ins - I1Tensor:$x + TF_BoolTensor:$x ); let results = (outs - I1Tensor:$y + TF_BoolTensor:$y ); let hasCanonicalizer = 1; @@ -5210,15 +5640,31 @@ def TF_LogicalOrOp : TF_Op<"LogicalOr", [Commutative, NoSideEffect, ResultsBroad }]; let arguments = (ins - I1Tensor:$x, - I1Tensor:$y + TF_BoolTensor:$x, + TF_BoolTensor:$y ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); } +def TF_LookupTableExportV2Op : TF_Op<"LookupTableExportV2", []> { + let summary = "Outputs all keys and values in the table."; + + let arguments = (ins + Arg:$table_handle + ); + + let results = (outs + TF_Tensor:$keys, + TF_Tensor:$values + ); + + TF_DerivedResultTypeAttr Tkeys = TF_DerivedResultTypeAttr<0>; + TF_DerivedResultTypeAttr Tvalues = TF_DerivedResultTypeAttr<1>; +} + def TF_LookupTableFindV2Op : TF_Op<"LookupTableFindV2", []> { let summary = "Looks up keys in a table, outputs the corresponding values."; @@ -5231,7 +5677,7 @@ table. It must also be of the same type as the table values. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, + Arg:$table_handle, TF_Tensor:$keys, TF_Tensor:$default_value ); @@ -5255,7 +5701,7 @@ The tensor `values` must be of the type of the table values. }]; let arguments = (ins - TF_ResourceTensor:$table_handle, + Arg:$table_handle, TF_Tensor:$keys, TF_Tensor:$values ); @@ -5266,6 +5712,44 @@ The tensor `values` must be of the type of the table values. 
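As a usage-level companion to the LookupTableFindV2 / LookupTableImportV2 definitions above (assuming TensorFlow 2.x; the exact ops emitted can vary by version), a static lookup table built and queried from Python:

```python
import tensorflow as tf

keys = tf.constant(["apple", "banana"])
values = tf.constant([1, 2], dtype=tf.int64)

table = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(keys, values), default_value=-1)

print(table.lookup(tf.constant(["banana", "cherry"])))  # [2, -1]
print(table.size())                                     # 2
```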
TF_DerivedOperandTypeAttr Tout = TF_DerivedOperandTypeAttr<2>; } +def TF_LookupTableInsertV2Op : TF_Op<"LookupTableInsertV2", []> { + let summary = "Updates the table to associates keys with values."; + + let description = [{ +The tensor `keys` must be of the same type as the keys of the table. +The tensor `values` must be of the type of the table values. + }]; + + let arguments = (ins + Arg:$table_handle, + TF_Tensor:$keys, + TF_Tensor:$values + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tin = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr Tout = TF_DerivedOperandTypeAttr<2>; +} + +def TF_LookupTableRemoveV2Op : TF_Op<"LookupTableRemoveV2", []> { + let summary = "Removes keys and its associated values from a table."; + + let description = [{ +The tensor `keys` must of the same type as the keys of the table. Keys not +already in the table are silently ignored. + }]; + + let arguments = (ins + Arg:$table_handle, + TF_Tensor:$keys + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tin = TF_DerivedOperandTypeAttr<1>; +} + def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { let summary = "Computes the number of elements in the given table."; @@ -5274,7 +5758,7 @@ def TF_LookupTableSizeV2Op : TF_Op<"LookupTableSizeV2", []> { ); let results = (outs - I64Tensor:$size + TF_Int64Tensor:$size ); } @@ -5316,6 +5800,24 @@ A 2-D example: TF_DerivedResultTypeAttr out_type = TF_DerivedResultTypeAttr<0>; } +def TF_MakeIteratorOp : TF_Op<"MakeIterator", []> { + let summary = [{ +Makes a new iterator from the given `dataset` and stores it in `iterator`. + }]; + + let description = [{ +This operation may be executed multiple times. Each execution will reset the +iterator in `iterator` to the first element of `dataset`. + }]; + + let arguments = (ins + TF_VariantTensor:$dataset, + Arg:$iterator + ); + + let results = (outs); +} + def TF_MatMulOp : TF_Op<"MatMul", [NoSideEffect, TF_SameOperandsAndResultElementTypeResolveRef]> { let summary = [{ Multiply the matrix "a" by the matrix "b". @@ -5332,15 +5834,15 @@ cublas. 
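A quick sketch of the MatMul op above with its transpose attributes (assuming TensorFlow 2.x):

```python
import tensorflow as tf

a = tf.random.normal([3, 2])
b = tf.random.normal([3, 4])

# transpose_a=True computes a^T @ b, i.e. a as a 2x3 matrix times b (3x4).
c = tf.linalg.matmul(a, b, transpose_a=True)
print(c.shape)  # (2, 4)
```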
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$a, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$b, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$a, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$b, DefaultValuedAttr:$transpose_a, DefaultValuedAttr:$transpose_b ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$product + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$product ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5561,7 +6063,7 @@ tf.matrix_diag_part(input, k = (1, 3), padding_value = 9) let arguments = (ins TF_Tensor:$input, - I32Tensor:$k, + TF_Int32Tensor:$k, TF_Tensor:$padding_value, DefaultValuedAttr, "RIGHT_LEFT">:$align @@ -5671,9 +6173,9 @@ tf.matrix_diag(diagonal, k = -1, num_rows = 3, padding_value = 9) let arguments = (ins TF_Tensor:$diagonal, - I32Tensor:$k, - I32Tensor:$num_rows, - I32Tensor:$num_cols, + TF_Int32Tensor:$k, + TF_Int32Tensor:$num_rows, + TF_Int32Tensor:$num_cols, TF_Tensor:$padding_value ); @@ -5810,9 +6312,9 @@ tf.matrix_diag(diagonal, k = -1, num_rows = 3, padding_value = 9) let arguments = (ins TF_Tensor:$diagonal, - I32Tensor:$k, - I32Tensor:$num_rows, - I32Tensor:$num_cols, + TF_Int32Tensor:$k, + TF_Int32Tensor:$num_rows, + TF_Int32Tensor:$num_cols, TF_Tensor:$padding_value, DefaultValuedAttr, "RIGHT_LEFT">:$align @@ -5843,13 +6345,13 @@ garbage result. }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$input, DefaultValuedAttr:$adjoint ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -5969,7 +6471,7 @@ tf.matrix_set_diag(diagonals, k = (-1, 0)) let arguments = (ins TF_Tensor:$input, TF_Tensor:$diagonal, - I32Tensor:$k + TF_Int32Tensor:$k ); let results = (outs @@ -6094,7 +6596,7 @@ tf.matrix_set_diag(input, diagonals, k = (-1, 2), align="LEFT_RIGHT") let arguments = (ins TF_Tensor:$input, TF_Tensor:$diagonal, - I32Tensor:$k, + TF_Int32Tensor:$k, DefaultValuedAttr, "RIGHT_LEFT">:$align ); @@ -6119,14 +6621,14 @@ If `adjoint` is `True` then each output matrix satisfies }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$matrix, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$rhs, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$matrix, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$rhs, DefaultValuedAttr:$adjoint ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6186,15 +6688,15 @@ tf.matmul(a, x) }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$matrix, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$rhs, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$matrix, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$rhs, 
DefaultValuedAttr:$lower, DefaultValuedAttr:$adjoint ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$output + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6213,39 +6715,39 @@ retained with length 1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value input, " - "Value reduction_indices, BoolAttr keep_dims" - >]; + let builders = [ + OpBuilder<"Value input, Value reduction_indices, BoolAttr keep_dims"> + ]; } def TF_MaxPoolOp : TF_Op<"MaxPool", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Performs max pooling on the input."; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint8, TF_Uint16, TF_Uint8]>:$input, Confined]>:$ksize, Confined]>:$strides, - TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, DefaultValuedAttr, "NHWC">:$data_format ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint8, TF_Uint16, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6262,7 +6764,7 @@ def TF_MaxPool3DOp : TF_Op<"MaxPool3D", [NoSideEffect]> { let summary = "Performs 3D max pooling on the input."; let arguments = (ins - TensorOf<[BF16, F16, F32]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$input, Confined]>:$ksize, Confined]>:$strides, @@ -6271,7 +6773,7 @@ def TF_MaxPool3DOp : TF_Op<"MaxPool3D", [NoSideEffect]> { ); let results = (outs - TensorOf<[BF16, F16, F32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6281,9 +6783,9 @@ def TF_MaxPool3DGradOp : TF_Op<"MaxPool3DGrad", [NoSideEffect]> { let summary = "Computes gradients of 3D max pooling function."; let arguments = (ins - TensorOf<[BF16, F16, F32]>:$orig_input, - TensorOf<[BF16, F16, F32]>:$orig_output, - TensorOf<[BF16, F16, F32]>:$grad, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$orig_input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$orig_output, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$grad, Confined]>:$ksize, 
Confined]>:$strides, @@ -6292,7 +6794,7 @@ def TF_MaxPool3DGradOp : TF_Op<"MaxPool3DGrad", [NoSideEffect]> { ); let results = (outs - TensorOf<[BF16, F16, F32]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32]>:$output ); TF_DerivedOperandTypeAttr TInput = TF_DerivedOperandTypeAttr<0>; @@ -6309,7 +6811,8 @@ def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { Confined]>:$ksize, Confined]>:$strides, - TF_AnyStrAttrOf<["SAME", "VALID"]>:$padding, + TF_AnyStrAttrOf<["SAME", "VALID", "EXPLICIT"]>:$padding, + DefaultValuedAttr:$explicit_paddings, DefaultValuedAttr:$data_format ); @@ -6324,27 +6827,6 @@ def TF_MaxPoolGradOp : TF_Op<"MaxPoolGrad", [NoSideEffect]> { }]; } -def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns the max of x and y (i.e. x > y ? x : y) element-wise."; - - let description = [{ -*NOTE*: `Maximum` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$y - ); - - let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Computes the mean of elements across dimensions of a tensor."; @@ -6356,14 +6838,14 @@ retained with length 1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6440,14 +6922,14 @@ retained with length 1. 
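The MaxPool/MaxPoolGrad definitions above gain an EXPLICIT padding mode in this diff; the long-standing SAME/VALID modes look like this at the Python level (assuming TensorFlow 2.x):

```python
import tensorflow as tf

x = tf.random.normal([1, 8, 8, 3])  # NHWC
y = tf.nn.max_pool2d(x, ksize=2, strides=2, padding="VALID")
print(y.shape)  # (1, 4, 4, 3)
```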
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Qint32, TF_Qint8, TF_Quint16, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6464,12 +6946,12 @@ def TF_MinimumOp : TF_Op<"Minimum", [NoSideEffect, ResultsBroadcastableShape, TF }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$y + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Uint8]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, TF_Uint8]>:$z + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6522,7 +7004,7 @@ pad(t, paddings) ==> [[2, 1, 1, 2, 3, 3, 2] } def TF_MlirLocalVarOp : TF_Op<"MlirLocalVarOp", []> { - let summary = "Creates a handle to a in-scope variable."; + let summary = "Creates a handle to an in-scope variable."; let description = [{ Used by internal passes for temporary representation of local state, which will @@ -6532,7 +7014,7 @@ be eventually removed. let arguments = (ins); let results = (outs - TF_ResourceTensor:$resource + Res:$resource ); } @@ -6623,12 +7105,12 @@ def TF_MulOp : TF_Op<"Mul", [Commutative, NoSideEffect, ResultsBroadcastableShap }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6659,12 +7141,88 @@ Returns x * y element-wise. 
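The Mean reduction above exposes a `keep_dims` attribute (surfaced as `keepdims` in Python); a minimal contrast, assuming TensorFlow 2.x:

```python
import tensorflow as tf

x = tf.constant([[1.0, 2.0], [3.0, 4.0]])

print(tf.reduce_mean(x, axis=1))                 # [1.5, 3.5] -- axis removed
print(tf.reduce_mean(x, axis=1, keepdims=True))  # [[1.5], [3.5]] -- axis retained with length 1
```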
Returns zero if y is zero, even if x if infinite or TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_MultiDeviceIteratorOp : TF_Op<"MultiDeviceIterator", []> { + let summary = "Creates a MultiDeviceIterator resource."; + + let arguments = (ins + Confined]>:$devices, + StrAttr:$shared_name, + StrAttr:$container, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + Res:$handle + ); +} + +def TF_MultiDeviceIteratorFromStringHandleOp : TF_Op<"MultiDeviceIteratorFromStringHandle", []> { + let summary = [{ +Generates a MultiDeviceIterator resource from its provided string handle. + }]; + + let arguments = (ins + TF_StrTensor:$string_handle, + + DefaultValuedAttr:$output_types, + DefaultValuedAttr:$output_shapes + ); + + let results = (outs + Res:$multi_device_iterator + ); +} + +def TF_MultiDeviceIteratorGetNextFromShardOp : TF_Op<"MultiDeviceIteratorGetNextFromShard", []> { + let summary = "Gets next element for the provided shard number."; + + let arguments = (ins + Arg:$multi_device_iterator, + TF_Int32Tensor:$shard_num, + TF_Int64Tensor:$incarnation_id + ); + + let results = (outs + Variadic:$components + ); + + TF_DerivedResultShapeListAttr output_shapes = TF_DerivedResultShapeListAttr<0>; + TF_DerivedResultTypeListAttr output_types = TF_DerivedResultTypeListAttr<0>; +} + +def TF_MultiDeviceIteratorInitOp : TF_Op<"MultiDeviceIteratorInit", []> { + let summary = "Initializes the multi device iterator with the given dataset."; + + let arguments = (ins + TF_VariantTensor:$dataset, + Arg:$multi_device_iterator, + TF_Int64Tensor:$max_buffer_size + ); + + let results = (outs + TF_Int64Tensor:$incarnation_id + ); +} + +def TF_MultiDeviceIteratorToStringHandleOp : TF_Op<"MultiDeviceIteratorToStringHandle", []> { + let summary = "Produces a string handle for the given MultiDeviceIterator."; + + let arguments = (ins + Arg:$multi_device_iterator + ); + + let results = (outs + TF_StrTensor:$string_handle + ); +} + def TF_MultinomialOp : TF_Op<"Multinomial", [TF_CannotDuplicate]> { let summary = "Draws samples from a multinomial distribution."; let arguments = (ins TF_IntOrFpTensor:$logits, - I32Tensor:$num_samples, + TF_Int32Tensor:$num_samples, DefaultValuedAttr:$seed, DefaultValuedAttr:$seed2 @@ -6678,15 +7236,94 @@ def TF_MultinomialOp : TF_Op<"Multinomial", [TF_CannotDuplicate]> { TF_DerivedResultTypeAttr output_dtype = TF_DerivedResultTypeAttr<0>; } +def TF_MutableDenseHashTableV2Op : TF_Op<"MutableDenseHashTableV2", []> { + let summary = [{ +Creates an empty hash table that uses tensors as the backing store. + }]; + + let description = [{ +It uses "open addressing" with quadratic reprobing to resolve +collisions. + +This op creates a mutable hash table, specifying the type of its keys and +values. Each value must be a scalar. Data can be inserted into the table using +the insert operations. It does not support the initialization operation. 
+ }]; + + let arguments = (ins + TF_Tensor:$empty_key, + TF_Tensor:$deleted_key, + + StrAttr:$container, + StrAttr:$shared_name, + DefaultValuedAttr:$use_node_name_sharing, + TypeAttr:$value_dtype, + DefaultValuedAttr({})">:$value_shape, + DefaultValuedAttr:$initial_num_buckets, + DefaultValuedAttr:$max_load_factor + ); + + let results = (outs + Res:$table_handle + ); + + TF_DerivedOperandTypeAttr key_dtype = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MutableHashTableOfTensorsV2Op : TF_Op<"MutableHashTableOfTensorsV2", []> { + let summary = "Creates an empty hash table."; + + let description = [{ +This op creates a mutable hash table, specifying the type of its keys and +values. Each value must be a vector. Data can be inserted into the table using +the insert operations. It does not support the initialization operation. + }]; + + let arguments = (ins + StrAttr:$container, + StrAttr:$shared_name, + DefaultValuedAttr:$use_node_name_sharing, + TypeAttr:$key_dtype, + TypeAttr:$value_dtype, + DefaultValuedAttr({})">:$value_shape + ); + + let results = (outs + Res:$table_handle + ); +} + +def TF_MutableHashTableV2Op : TF_Op<"MutableHashTableV2", []> { + let summary = "Creates an empty hash table."; + + let description = [{ +This op creates a mutable hash table, specifying the type of its keys and +values. Each value must be a scalar. Data can be inserted into the table using +the insert operations. It does not support the initialization operation. + }]; + + let arguments = (ins + StrAttr:$container, + StrAttr:$shared_name, + DefaultValuedAttr:$use_node_name_sharing, + TypeAttr:$key_dtype, + TypeAttr:$value_dtype + ); + + let results = (outs + Res:$table_handle + ); +} + def TF_NdtriOp : TF_Op<"Ndtri", [NoSideEffect]> { let summary = ""; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6700,11 +7337,11 @@ I.e., \\(y = -x\\). }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6712,6 +7349,34 @@ I.e., \\(y = -x\\). let hasCanonicalizer = 1; } +def TF_NextAfterOp : TF_Op<"NextAfter", [NoSideEffect, ResultsBroadcastableShape]>, + WithBroadcastableBinOpBuilder { + let summary = [{ +Returns the next representable value of `x1` in the direction of `x2`, element-wise. + }]; + + let description = [{ +This operation returns the same result as the C++ std::nextafter function. + +It can also return a subnormal number. + +@compatibility(cpp) +Equivalent to C++ std::nextafter function. +@end_compatibility + }]; + + let arguments = (ins + TF_F32OrF64Tensor:$x1, + TF_F32OrF64Tensor:$x2 + ); + + let results = (outs + TF_F32OrF64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_NoOp : TF_Op<"NoOp", [NoSideEffect]> { let summary = "Does nothing. 
Only useful as a placeholder for control edges."; @@ -6720,6 +7385,49 @@ def TF_NoOp : TF_Op<"NoOp", [NoSideEffect]> { let results = (outs); } +def TF_NonMaxSuppressionV3Op : TF_Op<"NonMaxSuppressionV3", [NoSideEffect]> { + let summary = [{ +Greedily selects a subset of bounding boxes in descending order of score, + }]; + + let description = [{ +pruning away boxes that have high intersection-over-union (IOU) overlap +with previously selected boxes. Bounding boxes with score less than +`score_threshold` are removed. Bounding boxes are supplied as +[y1, x1, y2, x2], where (y1, x1) and (y2, x2) are the coordinates of any +diagonal pair of box corners and the coordinates can be provided as normalized +(i.e., lying in the interval [0, 1]) or absolute. Note that this algorithm +is agnostic to where the origin is in the coordinate system and more +generally is invariant to orthogonal transformations and translations +of the coordinate system; thus translating or reflections of the coordinate +system result in the same boxes being selected by the algorithm. +The output of this operation is a set of integers indexing into the input +collection of bounding boxes representing the selected boxes. The bounding +box coordinates corresponding to the selected indices can then be obtained +using the `tf.gather operation`. For example: + selected_indices = tf.image.non_max_suppression_v2( + boxes, scores, max_output_size, iou_threshold, score_threshold) + selected_boxes = tf.gather(boxes, selected_indices) + }]; + + let arguments = (ins + TensorOf<[TF_Float16, TF_Float32]>:$boxes, + TensorOf<[TF_Float16, TF_Float32]>:$scores, + TF_Int32Tensor:$max_output_size, + TensorOf<[TF_Float16, TF_Float32]>:$iou_threshold, + TensorOf<[TF_Float16, TF_Float32]>:$score_threshold + ); + + let results = (outs + TF_Int32Tensor:$selected_indices + ); + + TF_DerivedOperandTypeAttr T_threshold = TF_DerivedOperandTypeAttr<3>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; +} + def TF_NonMaxSuppressionV4Op : TF_Op<"NonMaxSuppressionV4", [NoSideEffect]> { let summary = [{ Greedily selects a subset of bounding boxes in descending order of score, @@ -6746,18 +7454,18 @@ using the `tf.gather operation`. For example: }]; let arguments = (ins - TensorOf<[F16, F32]>:$boxes, - TensorOf<[F16, F32]>:$scores, - I32Tensor:$max_output_size, - TensorOf<[F16, F32]>:$iou_threshold, - TensorOf<[F16, F32]>:$score_threshold, + TensorOf<[TF_Float16, TF_Float32]>:$boxes, + TensorOf<[TF_Float16, TF_Float32]>:$scores, + TF_Int32Tensor:$max_output_size, + TensorOf<[TF_Float16, TF_Float32]>:$iou_threshold, + TensorOf<[TF_Float16, TF_Float32]>:$score_threshold, DefaultValuedAttr:$pad_to_max_output_size ); let results = (outs - I32Tensor:$selected_indices, - I32Tensor:$valid_outputs + TF_Int32Tensor:$selected_indices, + TF_Int32Tensor:$valid_outputs ); TF_DerivedOperandTypeAttr T_threshold = TF_DerivedOperandTypeAttr<3>; @@ -6795,20 +7503,20 @@ larger than 0. 
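A compact sketch of the NonMaxSuppression flow described above, assuming TensorFlow 2.x: indices are selected greedily by score, then the surviving boxes are gathered.

```python
import tensorflow as tf

boxes = tf.constant([[0.0, 0.0, 1.0, 1.0],
                     [0.0, 0.1, 1.0, 1.1],    # heavy overlap with box 0
                     [0.0, 2.0, 1.0, 3.0]])   # disjoint
scores = tf.constant([0.9, 0.8, 0.7])

selected = tf.image.non_max_suppression(
    boxes, scores, max_output_size=3, iou_threshold=0.5)
print(selected)                    # [0, 2] -- box 1 is suppressed by box 0
print(tf.gather(boxes, selected))  # the surviving boxes
```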
}]; let arguments = (ins - TensorOf<[F16, F32]>:$boxes, - TensorOf<[F16, F32]>:$scores, - I32Tensor:$max_output_size, - TensorOf<[F16, F32]>:$iou_threshold, - TensorOf<[F16, F32]>:$score_threshold, - TensorOf<[F16, F32]>:$soft_nms_sigma, + TensorOf<[TF_Float16, TF_Float32]>:$boxes, + TensorOf<[TF_Float16, TF_Float32]>:$scores, + TF_Int32Tensor:$max_output_size, + TensorOf<[TF_Float16, TF_Float32]>:$iou_threshold, + TensorOf<[TF_Float16, TF_Float32]>:$score_threshold, + TensorOf<[TF_Float16, TF_Float32]>:$soft_nms_sigma, DefaultValuedAttr:$pad_to_max_output_size ); let results = (outs - I32Tensor:$selected_indices, - TensorOf<[F16, F32]>:$selected_scores, - I32Tensor:$valid_outputs + TF_Int32Tensor:$selected_indices, + TensorOf<[TF_Float16, TF_Float32]>:$selected_scores, + TF_Int32Tensor:$valid_outputs ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -6823,21 +7531,20 @@ def TF_NotEqualOp : TF_Op<"NotEqual", [Commutative, NoSideEffect]> { }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Quint16, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint16, TF_Quint16, TF_Qint32, TF_Qint8, TF_Quint8, TF_Str, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$y, DefaultValuedAttr:$incompatible_shape_error ); let results = (outs - I1Tensor:$z + TF_BoolTensor:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value x, " - "Value y, BoolAttr incompatible_shape_error"> + OpBuilder<"Value x, Value y, BoolAttr incompatible_shape_error"> ]; let verifier = [{ @@ -6939,8 +7646,8 @@ output = }]; let arguments = (ins - TensorOf<[I32, I64, TF_Uint8]>:$indices, - I32Tensor:$depth, + TensorOf<[TF_Int32, TF_Int64, TF_Uint8]>:$indices, + TF_Int32Tensor:$depth, TF_Tensor:$on_value, TF_Tensor:$off_value, @@ -6955,8 +7662,7 @@ output = TF_DerivedOperandTypeAttr TI = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value indices, " - "Value depth, Value on_value, Value off_value, " + OpBuilder<"Value indices, Value depth, Value on_value, Value off_value, " "IntegerAttr axis"> ]; @@ -6965,6 +7671,44 @@ output = }]; } +def TF_OneShotIteratorOp : TF_Op<"OneShotIterator", []> { + let summary = [{ +Makes a "one-shot" iterator that can be iterated only once. + }]; + + let description = [{ +A one-shot iterator bundles the logic for defining the dataset and +the state of the iterator in a single op, which allows simple input +pipelines to be defined without an additional initialization +("MakeIterator") step. + +One-shot iterators have the following limitations: + +* They do not support parameterization: all logic for creating the underlying + dataset must be bundled in the `dataset_factory` function. +* They are not resettable. 
Once a one-shot iterator reaches the end of its + underlying dataset, subsequent "IteratorGetNext" operations on that + iterator will always produce an `OutOfRange` error. + +For greater flexibility, use "Iterator" and "MakeIterator" to define +an iterator using an arbitrary subgraph, which may capture tensors +(including fed values) as parameters, and which may be reset multiple +times by rerunning "MakeIterator". + }]; + + let arguments = (ins + SymbolRefAttr:$dataset_factory, + Confined]>:$output_types, + Confined]>:$output_shapes, + StrAttr:$container, + StrAttr:$shared_name + ); + + let results = (outs + Res:$handle + ); +} + def TF_OutfeedEnqueueTupleOp : TF_Op<"OutfeedEnqueueTuple", []> { let summary = "Enqueue multiple Tensor values on the computation outfeed."; @@ -7128,17 +7872,17 @@ stores the parameters for each batch. let arguments = (ins TF_I32OrI64Tensor:$shape, - TF_FpTensor:$means, - TF_FpTensor:$stdevs, - TF_FpTensor:$minvals, - TF_FpTensor:$maxvals, + TF_FloatTensor:$means, + TF_FloatTensor:$stdevs, + TF_FloatTensor:$minvals, + TF_FloatTensor:$maxvals, DefaultValuedAttr:$seed, DefaultValuedAttr:$seed2 ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7161,12 +7905,12 @@ tf.pow(x, y) ==> [[256, 65536], [9, 27]] }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7232,14 +7976,14 @@ retained with length 1. 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7263,14 +8007,14 @@ q_full, r_full = qr(a, full_matrices=True) }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$input, DefaultValuedAttr:$full_matrices ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$q, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$r + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$q, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$r ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7284,7 +8028,7 @@ def TF_QuantizeAndDequantizeOp : TF_Op<"QuantizeAndDequantize", [NoSideEffect, S let summary = "Use QuantizeAndDequantizeV2 instead."; let arguments = (ins - TF_FpTensor:$input, + TF_FloatTensor:$input, DefaultValuedAttr:$signed_input, DefaultValuedAttr:$num_bits, @@ -7294,7 +8038,7 @@ def TF_QuantizeAndDequantizeOp : TF_Op<"QuantizeAndDequantize", [NoSideEffect, S ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7358,9 +8102,9 @@ The above round function rounds the value based on the given round_mode. }]; let arguments = (ins - TF_FpTensor:$input, - TF_FpTensor:$input_min, - TF_FpTensor:$input_max, + TF_FloatTensor:$input, + TF_FloatTensor:$input_min, + TF_FloatTensor:$input_max, DefaultValuedAttr:$signed_input, DefaultValuedAttr:$num_bits, @@ -7371,7 +8115,7 @@ The above round function rounds the value based on the given round_mode. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7386,10 +8130,10 @@ tensor, so its value can change during training. }]; let arguments = (ins - TF_FpTensor:$input, - TF_FpTensor:$input_min, - TF_FpTensor:$input_max, - I32Tensor:$num_bits, + TF_FloatTensor:$input, + TF_FloatTensor:$input_min, + TF_FloatTensor:$input_max, + TF_Int32Tensor:$num_bits, DefaultValuedAttr:$signed_input, DefaultValuedAttr:$range_given, @@ -7398,7 +8142,7 @@ tensor, so its value can change during training. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7422,7 +8166,7 @@ the dimension is padded with zeros. let arguments = (ins TF_F32OrF64Tensor:$input, - I32Tensor:$fft_length + TF_Int32Tensor:$fft_length ); let results = (outs @@ -7452,7 +8196,7 @@ the dimension is padded with zeros. 
let arguments = (ins TF_F32OrF64Tensor:$input, - I32Tensor:$fft_length + TF_Int32Tensor:$fft_length ); let results = (outs @@ -7482,7 +8226,7 @@ the dimension is padded with zeros. let arguments = (ins TF_F32OrF64Tensor:$input, - I32Tensor:$fft_length + TF_Int32Tensor:$fft_length ); let results = (outs @@ -7518,11 +8262,11 @@ array([0.6666667, 1. , 1. ], dtype=float32) }]; let arguments = (ins - TF_FpTensor:$images + TF_FloatTensor:$images ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7541,14 +8285,14 @@ See http://dl.acm.org/citation.cfm?id=358414 let arguments = (ins TF_I32OrI64Tensor:$shape, - TensorOf<[F16, F32, F64]>:$alpha, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$alpha, DefaultValuedAttr:$seed, DefaultValuedAttr:$seed2 ); let results = (outs - TensorOf<[F16, F32, F64]>:$output + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; @@ -7578,14 +8322,14 @@ def TF_RandomPoissonOp : TF_Op<"RandomPoisson", [TF_CannotDuplicate]> { let arguments = (ins TF_I32OrI64Tensor:$shape, - TensorOf<[F16, F32, F64]>:$rate, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$rate, DefaultValuedAttr:$seed, DefaultValuedAttr:$seed2 ); let results = (outs - TensorOf<[F16, F32, F64]>:$output + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; @@ -7611,14 +8355,14 @@ Programming, Volume 2. Addison Wesley let arguments = (ins TF_I32OrI64Tensor:$shape, - TensorOf<[F16, F32, F64, I32, I64]>:$rate, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$rate, DefaultValuedAttr:$seed, DefaultValuedAttr:$seed2 ); let results = (outs - TensorOf<[F16, F32, F64, I32, I64]>:$output + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$output ); TF_DerivedOperandTypeAttr R = TF_DerivedOperandTypeAttr<1>; @@ -7670,7 +8414,7 @@ The generated values will have mean 0 and standard deviation 1. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7693,7 +8437,7 @@ lower bound 0 is included in the range, while the upper bound 1 is excluded. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7764,8 +8508,7 @@ tf.range(start, limit, delta) ==> [3, 6, 9, 12, 15] TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value start, " - "Value limit, Value delta"> + OpBuilder<"Value start, Value limit, Value delta"> ]; } @@ -7775,9 +8518,9 @@ Creates a dataset with a range of values. Corresponds to python's xrange. }]; let arguments = (ins - I64Tensor:$start, - I64Tensor:$stop, - I64Tensor:$step, + TF_Int64Tensor:$start, + TF_Int64Tensor:$stop, + TF_Int64Tensor:$step, Confined]>:$output_types, Confined]>:$output_shapes @@ -7812,13 +8555,13 @@ of the tensor. Rank is also known as "order", "degree", or "ndims." 
); let results = (outs - I32Tensor:$output + TF_Int32Tensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value input"> + OpBuilder<"Value input"> ]; let hasFolder = 1; @@ -7878,33 +8621,6 @@ tf.real(input) ==> [-2.25, 3.25] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_RealDivOp : TF_Op<"RealDiv", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary]>, - WithBroadcastableBinOpBuilder { - let summary = "Returns x / y element-wise for real types."; - - let description = [{ -If `x` and `y` are reals, this will return the floating-point division. - -*NOTE*: `Div` supports broadcasting. More about broadcasting -[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y - ); - - let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - - let hasCanonicalizer = 1; - - let hasFolder = 1; -} - def TF_ReciprocalOp : TF_Op<"Reciprocal", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; @@ -7913,11 +8629,11 @@ I.e., \\(y = 1 / x\\). }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -7962,13 +8678,13 @@ most one RecvTPUEmbeddingActivations op in the TPU graph. 
); let results = (outs - Variadic:$outputs + Variadic:$outputs ); TF_DerivedResultSizeAttr num_outputs = TF_DerivedResultSizeAttr<0>; } -def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_LayoutAgnostic]> { +def TF_ReluOp : TF_Op<"Relu", [NoSideEffect, SameOperandsAndResultType, TF_ContractionFusableInterface, TF_LayoutAgnostic]> { let summary = "Computes rectified linear: `max(features, 0)`."; let description = [{ @@ -7979,14 +8695,19 @@ array([ 0., 0., -0., 3.], dtype=float32) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$features + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$features ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Qint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$activations + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let extraClassDeclaration = [{ + // TF_ContractionFusableInterface: + Optional GetContractionFusion(); + }]; } def TF_Relu6Op : TF_Op<"Relu6", [NoSideEffect, SameOperandsAndResultType]> { @@ -8111,8 +8832,7 @@ reshape(t, []) ==> 7 TF_DerivedOperandTypeAttr Tshape = TF_DerivedOperandTypeAttr<1>; let builders = [ - OpBuilder< - "OpBuilder& builder, OperationState& result, Value tensor, Value shape"> + OpBuilder<"Value tensor, Value shape"> ]; let verifier = [{ @@ -8131,15 +8851,15 @@ Input images can be of different types but output images are always float. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, - I32Tensor:$size, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$images, + TF_Int32Tensor:$size, DefaultValuedAttr:$align_corners, DefaultValuedAttr:$half_pixel_centers ); let results = (outs - F32Tensor:$resized_images + TF_Float32Tensor:$resized_images ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -8149,15 +8869,15 @@ def TF_ResizeBilinearGradOp : TF_Op<"ResizeBilinearGrad", [NoSideEffect]> { let summary = "Computes the gradient of bilinear interpolation."; let arguments = (ins - F32Tensor:$grads, - TF_FpTensor:$original_image, + TF_Float32Tensor:$grads, + TF_FloatTensor:$original_image, DefaultValuedAttr:$align_corners, DefaultValuedAttr:$half_pixel_centers ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; @@ -8169,20 +8889,137 @@ Resize `images` to `size` using nearest neighbor interpolation. 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$images, - I32Tensor:$size, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$images, + TF_Int32Tensor:$size, DefaultValuedAttr:$align_corners, DefaultValuedAttr:$half_pixel_centers ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Uint16, TF_Uint8]>:$resized_images + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$resized_images ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_ResizeNearestNeighborGradOp : TF_Op<"ResizeNearestNeighborGrad", [NoSideEffect]> { + let summary = "Computes the gradient of nearest neighbor interpolation."; + + let arguments = (ins + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int8, TF_Uint8]>:$grads, + TF_Int32Tensor:$size, + + DefaultValuedAttr:$align_corners, + DefaultValuedAttr:$half_pixel_centers + ); + + let results = (outs + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int8, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_ResourceApplyAdaMaxOp : TF_Op<"ResourceApplyAdaMax", []> { + let summary = "Update '*var' according to the AdaMax algorithm."; + + let description = [{ +m_t <- beta1 * m_{t-1} + (1 - beta1) * g +v_t <- max(beta2 * v_{t-1}, abs(g)) +variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon) + }]; + + let arguments = (ins + Arg:$var, + Arg:$m, + Arg:$v, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1_power, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_ResourceApplyAdadeltaOp : TF_Op<"ResourceApplyAdadelta", []> { + let summary = "Update '*var' according to the adadelta scheme."; + + let description = [{ +accum = rho() * accum + (1 - rho()) * grad.square(); +update = (update_accum + epsilon).sqrt() * (accum + epsilon()).rsqrt() * grad; +update_accum = rho() * update_accum + (1 - rho()) * update.square(); +var -= update; + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + Arg:$accum_update, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, 
TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_ResourceApplyAdagradOp : TF_Op<"ResourceApplyAdagrad", []> { + let summary = "Update '*var' according to the adagrad scheme."; + + let description = [{ +accum += grad * grad +var -= lr * grad * (1 / sqrt(accum)) + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking, + DefaultValuedAttr:$update_slots + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceApplyAdagradDAOp : TF_Op<"ResourceApplyAdagradDA", []> { + let summary = "Update '*var' according to the proximal adagrad scheme."; + + let arguments = (ins + Arg:$var, + Arg:$gradient_accumulator, + Arg:$gradient_squared_accumulator, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TF_Int64Tensor:$global_step, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + def TF_ResourceApplyAdagradV2Op : TF_Op<"ResourceApplyAdagradV2", []> { let summary = "Update '*var' according to the adagrad scheme."; @@ -8192,11 +9029,11 @@ var -= lr * grad * (1 / (sqrt(accum) + epsilon)) }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$accum, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, 
I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + Arg:$var, + Arg:$accum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, DefaultValuedAttr:$use_locking, DefaultValuedAttr:$update_slots @@ -8218,16 +9055,16 @@ $$\text{variable} := \text{variable} - \text{lr}_t * m_t / (\sqrt{v_t} + \epsilo }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$m, - TF_ResourceTensor:$v, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1_power, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2_power, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + Arg:$var, + Arg:$m, + Arg:$v, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1_power, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2_power, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta2, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, 
TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, DefaultValuedAttr:$use_locking, DefaultValuedAttr:$use_nesterov @@ -8238,6 +9075,32 @@ $$\text{variable} := \text{variable} - \text{lr}_t * m_t / (\sqrt{v_t} + \epsilo TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; } +def TF_ResourceApplyAddSignOp : TF_Op<"ResourceApplyAddSign", []> { + let summary = "Update '*var' according to the AddSign update."; + + let description = [{ +m_t <- beta1 * m_{t-1} + (1 - beta1) * g +update <- (alpha + sign_decay * sign(g) *sign(m)) * g +variable <- variable - lr_t * update + }]; + + let arguments = (ins + Arg:$var, + Arg:$m, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$sign_decay, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + def TF_ResourceApplyCenteredRMSPropOp : TF_Op<"ResourceApplyCenteredRMSProp", []> { let summary = "Update '*var' according to the centered RMSProp algorithm."; @@ -8263,15 +9126,15 @@ var <- var - mom }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$mg, - TF_ResourceTensor:$ms, - TF_ResourceTensor:$mom, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + Arg:$var, + Arg:$mg, + Arg:$ms, + Arg:$mom, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + 
TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, DefaultValuedAttr:$use_locking ); @@ -8281,13 +9144,76 @@ var <- var - mom TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<4>; } +def TF_ResourceApplyFtrlOp : TF_Op<"ResourceApplyFtrl", []> { + let summary = "Update '*var' according to the Ftrl-proximal scheme."; + + let description = [{ +accum_new = accum + grad * grad +linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + Arg:$linear, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr_power, + + DefaultValuedAttr:$use_locking, + DefaultValuedAttr:$multiply_linear_by_lr + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_ResourceApplyFtrlV2Op : TF_Op<"ResourceApplyFtrlV2", []> { + let summary = "Update '*var' according to the Ftrl-proximal scheme."; + + let description = [{ +grad_with_shrinkage = grad + 2 * l2_shrinkage * var +accum_new = accum + grad_with_shrinkage * grad_with_shrinkage +linear += grad_with_shrinkage + + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var +quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2 +var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0 +accum = accum_new + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + Arg:$linear, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[TF_Bfloat16, 
TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2_shrinkage, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr_power, + + DefaultValuedAttr:$use_locking, + DefaultValuedAttr:$multiply_linear_by_lr + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + def TF_ResourceApplyGradientDescentOp : TF_Op<"ResourceApplyGradientDescent", []> { let summary = "Update '*var' by subtracting 'alpha' * 'delta' from it."; let arguments = (ins - TF_ResourceTensor:$var, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$delta, + Arg:$var, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$delta, DefaultValuedAttr:$use_locking ); @@ -8308,11 +9234,11 @@ var += accum }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$accum, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, + Arg:$var, + Arg:$accum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, DefaultValuedAttr:$use_locking, 
DefaultValuedAttr:$use_nesterov @@ -8334,11 +9260,11 @@ var -= lr * accum }]; let arguments = (ins - TF_ResourceTensor:$var, - TF_ResourceTensor:$accum, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, + Arg:$var, + Arg:$accum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, DefaultValuedAttr:$use_locking, DefaultValuedAttr:$use_nesterov @@ -8349,6 +9275,116 @@ var -= lr * accum TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; } +def TF_ResourceApplyPowerSignOp : TF_Op<"ResourceApplyPowerSign", []> { + let summary = "Update '*var' according to the AddSign update."; + + let description = [{ +m_t <- beta1 * m_{t-1} + (1 - beta1) * g +update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g +variable <- variable - lr_t * update + }]; + + let arguments = (ins + Arg:$var, + Arg:$m, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$logbase, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$sign_decay, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$beta, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceApplyProximalAdagradOp : TF_Op<"ResourceApplyProximalAdagrad", []> { + let summary = [{ +Update '*var' and '*accum' according to FOBOS with Adagrad learning rate. 
+ }]; + + let description = [{ +accum += grad * grad +prox_v = var - lr * grad * (1 / sqrt(accum)) +var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0} + }]; + + let arguments = (ins + Arg:$var, + Arg:$accum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceApplyProximalGradientDescentOp : TF_Op<"ResourceApplyProximalGradientDescent", []> { + let summary = "Update '*var' as FOBOS algorithm with fixed learning rate."; + + let description = [{ +prox_v = var - alpha * delta +var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0} + }]; + + let arguments = (ins + Arg:$var, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$alpha, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l1, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$l2, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$delta, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; +} + +def TF_ResourceApplyRMSPropOp : TF_Op<"ResourceApplyRMSProp", []> { + let summary = "Update '*var' according to the RMSProp algorithm."; + + let description = [{ +Note that in dense implementation of this algorithm, ms and mom will +update even if the grad is zero, but in this sparse implementation, ms +and mom will not update in iterations during which the grad is zero. 
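+
+As an illustration of the dense update spelled out in the equations below, one
+step can be sketched with NumPy (a stand-in with made-up values; names mirror
+the op's operands, and this is not the actual kernel):
+
+```python
+import numpy as np
+var, ms, mom = np.array([1.0, 2.0]), np.zeros(2), np.zeros(2)
+grad = np.array([0.1, -0.2])
+lr, rho, momentum, epsilon = 0.01, 0.9, 0.0, 1e-10
+ms = rho * ms + (1.0 - rho) * grad * grad                  # ms <- rho*ms + (1-rho)*grad^2
+mom = momentum * mom + lr * grad / np.sqrt(ms + epsilon)   # mom <- momentum*mom + lr*grad/sqrt(ms+eps)
+var = var - mom                                            # var <- var - mom
+```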
+ +mean_square = decay * mean_square + (1-decay) * gradient ** 2 +Delta = learning_rate * gradient / sqrt(mean_square + epsilon) + +ms <- rho * ms_{t-1} + (1-rho) * grad * grad +mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +var <- var - mom + }]; + + let arguments = (ins + Arg:$var, + Arg:$ms, + Arg:$mom, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lr, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rho, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$momentum, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$epsilon, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$grad, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + def TF_ResourceGatherOp : TF_Op<"ResourceGather", []> { let summary = [{ Gather slices from the variable pointed to by `resource` according to `indices`. @@ -8371,7 +9407,7 @@ Produces an output tensor with shape `indices.shape + params.shape[1:]` where: }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_I32OrI64Tensor:$indices, DefaultValuedAttr:$batch_dims, @@ -8386,6 +9422,405 @@ Produces an output tensor with shape `indices.shape + params.shape[1:]` where: TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } +def TF_ResourceScatterAddOp : TF_Op<"ResourceScatterAdd", []> { + let summary = "Adds sparse updates to the variable referenced by `resource`."; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] += updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] += updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] += updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions add. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
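+The duplicate-index behaviour can be illustrated with NumPy's unbuffered
+`np.add.at` (a sketch with made-up values, not the op's implementation):
+
+```python
+import numpy as np
+ref = np.array([1., 2., 3., 4.])
+indices = np.array([0, 2, 0])        # index 0 appears twice
+updates = np.array([10., 20., 30.])
+np.add.at(ref, indices, updates)     # duplicate contributions add
+# ref is now [41., 2., 23., 4.]
+```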
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterDivOp : TF_Op<"ResourceScatterDiv", []> { + let summary = [{ +Divides sparse updates into the variable referenced by `resource`. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] /= updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] /= updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] /= updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions multiply. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
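+For example, using NumPy's unbuffered `np.divide.at` as a stand-in (made-up
+values, not the op's implementation):
+
+```python
+import numpy as np
+ref = np.array([100., 10.])
+np.divide.at(ref, [0, 0], [2., 5.])   # divisors at a duplicate index compound
+# ref is now [10., 10.]
+```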
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterMaxOp : TF_Op<"ResourceScatterMax", []> { + let summary = [{ +Reduces sparse updates into the variable referenced by `resource` using the `max` operation. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] = max(ref[indices, ...], updates[...]) + + # Vector indices (for each i) + ref[indices[i], ...] = max(ref[indices[i], ...], updates[i, ...]) + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] = max(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions are combined. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
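+For example, using NumPy's unbuffered `np.maximum.at` as a stand-in (made-up
+values, not the op's implementation):
+
+```python
+import numpy as np
+ref = np.array([1., 5., 3.])
+np.maximum.at(ref, [0, 2, 0], [4., 2., 9.])   # duplicates combine via max
+# ref is now [9., 5., 3.]
+```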
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterMinOp : TF_Op<"ResourceScatterMin", []> { + let summary = [{ +Reduces sparse updates into the variable referenced by `resource` using the `min` operation. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] = min(ref[indices, ...], updates[...]) + + # Vector indices (for each i) + ref[indices[i], ...] = min(ref[indices[i], ...], updates[i, ...]) + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] = min(ref[indices[i, ..., j], ...], updates[i, ..., j, ...]) + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions are combined. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
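+For example, using NumPy's unbuffered `np.minimum.at` as a stand-in (made-up
+values, not the op's implementation):
+
+```python
+import numpy as np
+ref = np.array([4., 5., 3.])
+np.minimum.at(ref, [0, 2, 0], [6., 1., 2.])   # duplicates combine via min
+# ref is now [2., 5., 1.]
+```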
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterMulOp : TF_Op<"ResourceScatterMul", []> { + let summary = [{ +Multiplies sparse updates into the variable referenced by `resource`. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] *= updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] *= updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions multiply. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
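+For example, using NumPy's unbuffered `np.multiply.at` as a stand-in (made-up
+values, not the op's implementation):
+
+```python
+import numpy as np
+ref = np.array([2., 3.])
+np.multiply.at(ref, [0, 0], [4., 5.])   # duplicate contributions multiply
+# ref is now [40., 3.]
+```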
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterNdAddOp : TF_Op<"ResourceScatterNdAdd", []> { + let summary = [{ +Applies sparse addition to individual values or slices in a Variable. + }]; + + let description = [{ +`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. + +`indices` must be integer tensor, containing indices into `ref`. +It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. + +The innermost dimension of `indices` (with length `K`) corresponds to +indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +dimension of `ref`. + +`updates` is `Tensor` of rank `Q-1+P-K` with shape: + +``` +[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] +``` + +For example, say we want to add 4 scattered elements to a rank-1 tensor to +8 elements. In Python, that addition would look like this: + +```python +ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) +indices = tf.constant([[4], [3], [1], [7]]) +updates = tf.constant([9, 10, 11, 12]) +add = tf.scatter_nd_add(ref, indices, updates) +with tf.Session() as sess: + print sess.run(add) +``` + +The resulting update to ref would look like this: + + [1, 13, 3, 14, 14, 6, 7, 20] + +See `tf.scatter_nd` for more details about how to make updates to +slices. + }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterNdSubOp : TF_Op<"ResourceScatterNdSub", []> { + let summary = [{ +Applies sparse subtraction to individual values or slices in a Variable. + }]; + + let description = [{ +`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. + +`indices` must be integer tensor, containing indices into `ref`. +It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. + +The innermost dimension of `indices` (with length `K`) corresponds to +indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +dimension of `ref`. + +`updates` is `Tensor` of rank `Q-1+P-K` with shape: + +``` +[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]] +``` + +For example, say we want to subtract 4 scattered elements from a rank-1 tensor +with 8 elements. In Python, that subtraction would look like this: + +```python +ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8], use_resource=True) +indices = tf.constant([[4], [3], [1], [7]]) +updates = tf.constant([9, 10, 11, 12]) +sub = tf.scatter_nd_sub(ref, indices, updates) +with tf.Session() as sess: + print sess.run(sub) +``` + +The resulting update to ref would look like this: + + [1, -9, 3, -6, -4, 6, 7, -4] + +See `tf.scatter_nd` for more details about how to make updates to +slices. 
+ }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterNdUpdateOp : TF_Op<"ResourceScatterNdUpdate", []> { + let summary = [{ +Applies sparse `updates` to individual values or slices within a given + }]; + + let description = [{ +variable according to `indices`. + +`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`. + +`indices` must be integer tensor, containing indices into `ref`. +It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`. + +The innermost dimension of `indices` (with length `K`) corresponds to +indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th +dimension of `ref`. + +`updates` is `Tensor` of rank `Q-1+P-K` with shape: + +``` +[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]]. +``` + +For example, say we want to update 4 scattered elements to a rank-1 tensor to +8 elements. In Python, that update would look like this: + +```python + ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8]) + indices = tf.constant([[4], [3], [1] ,[7]]) + updates = tf.constant([9, 10, 11, 12]) + update = tf.scatter_nd_update(ref, indices, updates) + with tf.Session() as sess: + print sess.run(update) +``` + +The resulting update to ref would look like this: + + [1, 11, 3, 10, 9, 6, 7, 12] + +See `tf.scatter_nd` for more details about how to make updates to +slices. + }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$indices, + TF_Tensor:$updates, + + DefaultValuedAttr:$use_locking + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_ResourceScatterSubOp : TF_Op<"ResourceScatterSub", []> { + let summary = [{ +Subtracts sparse updates from the variable referenced by `resource`. + }]; + + let description = [{ +This operation computes + + # Scalar indices + ref[indices, ...] -= updates[...] + + # Vector indices (for each i) + ref[indices[i], ...] -= updates[i, ...] + + # High rank indices (for each i, ..., j) + ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...] + +Duplicate entries are handled correctly: if multiple `indices` reference +the same location, their contributions add. + +Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`. + +
+ }]; + + let arguments = (ins + Arg:$resource, + TF_I32OrI64Tensor:$indices, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + def TF_ResourceScatterUpdateOp : TF_Op<"ResourceScatterUpdate", []> { let summary = [{ Assigns sparse updates to the variable referenced by `resource`. @@ -8405,7 +9840,7 @@ This operation computes }]; let arguments = (ins - TF_ResourceTensor:$resource, + Arg:$resource, TF_I32OrI64Tensor:$indices, TF_Tensor:$updates ); @@ -8416,6 +9851,38 @@ This operation computes TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; } +def TF_ResourceStridedSliceAssignOp : TF_Op<"ResourceStridedSliceAssign", []> { + let summary = "Assign `value` to the sliced l-value reference of `ref`."; + + let description = [{ +The values of `value` are assigned to the positions in the variable +`ref` that are selected by the slice parameters. The slice parameters +`begin, `end`, `strides`, etc. work exactly as in `StridedSlice`. + +NOTE this op currently does not support broadcasting and so `value`'s +shape must be exactly the shape produced by the slice of `ref`. + }]; + + let arguments = (ins + Arg:$ref, + TF_I32OrI64Tensor:$begin, + TF_I32OrI64Tensor:$end, + TF_I32OrI64Tensor:$strides, + TF_Tensor:$value, + + DefaultValuedAttr:$begin_mask, + DefaultValuedAttr:$end_mask, + DefaultValuedAttr:$ellipsis_mask, + DefaultValuedAttr:$new_axis_mask, + DefaultValuedAttr:$shrink_axis_mask + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<4>; + TF_DerivedOperandTypeAttr Index = TF_DerivedOperandTypeAttr<1>; +} + def TF_RestoreV2Op : TF_Op<"RestoreV2", []> { let summary = "Restores tensors from a V2 checkpoint."; @@ -8577,12 +10044,12 @@ reverse(t, dims) ==> [[[[8, 9, 10, 11], }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str, TF_Uint16, TF_Uint8]>:$tensor, + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Str, TF_Uint16, TF_Uint8]>:$tensor, TF_I32OrI64Tensor:$axis ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Str, TF_Uint16, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Str, TF_Uint16, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -8657,11 +10124,11 @@ rint([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) ==> [-2., -2., -0., 0., 2., 2., 2.] }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -8719,11 +10186,11 @@ according to the current system rounding mode use std::cint. 
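A quick eager check of the rounds-half-to-even behaviour described here, assuming the public `tf.round` wrapper:

```python
import tensorflow as tf

# Halfway values go to the nearest even integer ("bankers' rounding").
x = tf.constant([-1.5, -0.5, 0.5, 1.5, 2.5])
print(tf.round(x).numpy())  # [-2. -0.  0.  2.  2.]
```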
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -8938,12 +10405,12 @@ tf.segment_mean(c, tf.constant([0, 0, 1])) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, TF_I32OrI64Tensor:$segment_ids ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -9020,12 +10487,12 @@ tf.segment_prod(c, tf.constant([0, 0, 1])) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, TF_I32OrI64Tensor:$segment_ids ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -9061,12 +10528,12 @@ tf.segment_sum(c, tf.constant([0, 0, 1])) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, TF_I32OrI64Tensor:$segment_ids ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -9118,7 +10585,7 @@ select(condition, t, e) ==> [[1, 2], }]; let arguments = (ins - 
I1Tensor:$condition, + TF_BoolTensor:$condition, TF_Tensor:$t, TF_Tensor:$e ); @@ -9140,7 +10607,7 @@ def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect, ResultsBroadcastableShape]> let summary = ""; let arguments = (ins - I1Tensor:$condition, + TF_BoolTensor:$condition, TF_Tensor:$t, TF_Tensor:$e ); @@ -9152,7 +10619,7 @@ def TF_SelectV2Op : TF_Op<"SelectV2", [NoSideEffect, ResultsBroadcastableShape]> TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value condition, Value e, Value t"> + OpBuilder<"Value condition, Value e, Value t"> ]; } @@ -9176,14 +10643,14 @@ e = self_adjoint_eig(a, compute_v=False) }]; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$input, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$input, DefaultValuedAttr:$compute_v ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$e, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$v + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$e, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$v ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9205,11 +10672,11 @@ See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515) }]; let arguments = (ins - TF_FpTensor:$features + TF_FloatTensor:$features ); let results = (outs - TF_FpTensor:$activations + TF_FloatTensor:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9221,17 +10688,33 @@ Computes gradients for the scaled exponential linear (Selu) operation. }]; let arguments = (ins - TF_FpTensor:$gradients, - TF_FpTensor:$outputs + TF_FloatTensor:$gradients, + TF_FloatTensor:$outputs ); let results = (outs - TF_FpTensor:$backprops + TF_FloatTensor:$backprops ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_SerializeIteratorOp : TF_Op<"SerializeIterator", []> { + let summary = [{ +Converts the given `resource_handle` representing an iterator to a variant tensor. + }]; + + let arguments = (ins + Arg:$resource_handle, + + DefaultValuedAttr:$external_state_policy + ); + + let results = (outs + TF_VariantTensor:$serialized + ); +} + def TF_ShapeOp : TF_Op<"Shape", [NoSideEffect]> { let summary = "Returns the shape of a tensor."; @@ -9262,7 +10745,7 @@ shape(t) ==> [2, 2, 3] }]; let builders = [ - OpBuilder<"OpBuilder& builder, OperationState& result, Value input, BoolAttr use32Bit"> + OpBuilder<"Value input, BoolAttr use32Bit"> ]; let hasFolder = 1; @@ -9305,8 +10788,8 @@ Generate a sharded filename. The filename is printf formatted as let arguments = (ins TF_StrTensor:$basename, - I32Tensor:$shard, - I32Tensor:$num_shards + TF_Int32Tensor:$shard, + TF_Int32Tensor:$num_shards ); let results = (outs @@ -9314,6 +10797,76 @@ Generate a sharded filename. 
The filename is printf formatted as ); } +def TF_ShuffleAndRepeatDatasetV2Op : TF_Op<"ShuffleAndRepeatDatasetV2", []> { + let summary = ""; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + TF_Int64Tensor:$buffer_size, + TF_Int64Tensor:$seed, + TF_Int64Tensor:$seed2, + TF_Int64Tensor:$count, + Arg:$seed_generator, + + DefaultValuedAttr:$reshuffle_each_iteration, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); +} + +def TF_ShuffleDatasetV2Op : TF_Op<"ShuffleDatasetV2", []> { + let summary = ""; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + TF_Int64Tensor:$buffer_size, + Arg:$seed_generator, + + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); +} + +def TF_ShuffleDatasetV3Op : TF_Op<"ShuffleDatasetV3", []> { + let summary = ""; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + TF_Int64Tensor:$buffer_size, + TF_Int64Tensor:$seed, + TF_Int64Tensor:$seed2, + Arg:$seed_generator, + + DefaultValuedAttr:$reshuffle_each_iteration, + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); +} + +def TF_ShutdownDistributedTPUOp : TF_Op<"ShutdownDistributedTPU", []> { + let summary = "Shuts down a running distributed TPU system."; + + let description = [{ +The op returns an error if no system is running. + }]; + + let arguments = (ins); + + let results = (outs); +} + def TF_SigmoidOp : TF_Op<"Sigmoid", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes sigmoid of `x` element-wise."; @@ -9366,11 +10919,11 @@ Example usage: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9513,11 +11066,11 @@ For each batch `i` and class `j` we have }]; let arguments = (ins - TF_FpTensor:$logits + TF_FloatTensor:$logits ); let results = (outs - TF_FpTensor:$softmax + TF_FloatTensor:$softmax ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9537,13 +11090,13 @@ Inputs are the logits, not probabilities. 
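For this cross-entropy op, a minimal eager sketch, assuming the public `tf.nn.softmax_cross_entropy_with_logits` wrapper; note that it takes unnormalized logits, as stated above:

```python
import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.1]])  # unnormalized scores, not probabilities
labels = tf.constant([[1.0, 0.0, 0.0]])  # one-hot (or any valid distribution)
loss = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits)
print(loss.numpy())  # ~[0.417] == -log(softmax(logits)[0][0])
```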
}]; let arguments = (ins - TF_FpTensor:$features, - TF_FpTensor:$labels + TF_FloatTensor:$features, + TF_FloatTensor:$labels ); let results = (outs - TF_FpTensor:$loss, - TF_FpTensor:$backprop + TF_FloatTensor:$loss, + TF_FloatTensor:$backprop ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9557,11 +11110,11 @@ def TF_SoftplusOp : TF_Op<"Softplus", [NoSideEffect, SameOperandsAndResultType]> let summary = "Computes softplus: `log(exp(features) + 1)`."; let arguments = (ins - TF_FpTensor:$features + TF_FloatTensor:$features ); let results = (outs - TF_FpTensor:$activations + TF_FloatTensor:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9571,12 +11124,12 @@ def TF_SoftplusGradOp : TF_Op<"SoftplusGrad", [NoSideEffect, SameOperandsAndResu let summary = "Computes softplus gradients for a softplus operation."; let arguments = (ins - TF_FpTensor:$gradients, - TF_FpTensor:$features + TF_FloatTensor:$gradients, + TF_FloatTensor:$features ); let results = (outs - TF_FpTensor:$backprops + TF_FloatTensor:$backprops ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9586,11 +11139,11 @@ def TF_SoftsignOp : TF_Op<"Softsign", [NoSideEffect, SameOperandsAndResultType]> let summary = "Computes softsign: `features / (abs(features) + 1)`."; let arguments = (ins - TF_FpTensor:$features + TF_FloatTensor:$features ); let results = (outs - TF_FpTensor:$activations + TF_FloatTensor:$activations ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9600,12 +11153,12 @@ def TF_SoftsignGradOp : TF_Op<"SoftsignGrad", [NoSideEffect, SameOperandsAndResu let summary = "Computes softsign gradients for a softsign operation."; let arguments = (ins - TF_FpTensor:$gradients, - TF_FpTensor:$features + TF_FloatTensor:$gradients, + TF_FloatTensor:$features ); let results = (outs - TF_FpTensor:$backprops + TF_FloatTensor:$backprops ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9639,7 +11192,7 @@ block size. TF_DerivedOperandTypeAttr Tpaddings = TF_DerivedOperandTypeAttr<1>; } -def TF_SpaceToBatchNDOp : TF_Op<"SpaceToBatchND", [NoSideEffect]> { +def TF_SpaceToBatchNDOp : TF_Op<"SpaceToBatchND", [DeclareOpInterfaceMethods, NoSideEffect]> { let summary = "SpaceToBatch for N-D tensors of type T."; let description = [{ @@ -9666,6 +11219,14 @@ precise description. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tpaddings = TF_DerivedOperandTypeAttr<2>; TF_DerivedOperandTypeAttr Tblock_shape = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ return Verify(*this); }]; + + let extraClassDeclaration = [{ + static bool isCompatibleReturnTypes(ArrayRef l, ArrayRef r) { + return ArraysAreCastCompatible(l, r); + } + }]; } def TF_SpaceToDepthOp : TF_Op<"SpaceToDepth", [NoSideEffect]> { @@ -9816,22 +11377,57 @@ backpropagation, }]; let arguments = (ins - I64Tensor:$indices, + TF_Int64Tensor:$indices, TF_Tensor:$values, - I64Tensor:$dense_shape, + TF_Int64Tensor:$dense_shape, TF_Tensor:$default_value ); let results = (outs - I64Tensor:$output_indices, + TF_Int64Tensor:$output_indices, TF_Tensor:$output_values, - I1Tensor:$empty_row_indicator, - I64Tensor:$reverse_index_map + TF_BoolTensor:$empty_row_indicator, + TF_Int64Tensor:$reverse_index_map ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; } +def TF_SparseMatMulOp : TF_Op<"SparseMatMul", [NoSideEffect]> { + let summary = [{ +Multiply matrix "a" by matrix "b". 
+ }]; + + let description = [{ +The inputs must be two-dimensional matrices and the inner dimension of "a" must +match the outer dimension of "b". Both "a" and "b" must be `Tensor`s not +`SparseTensor`s. This op is optimized for the case where at least one of "a" or +"b" is sparse, in the sense that they have a large proportion of zero values. +The breakeven for using this versus a dense matrix multiply on one platform was +30% zero values in the sparse matrix. + +The gradient computation of this operation will only take advantage of sparsity +in the input gradient when that gradient comes from a Relu. + }]; + + let arguments = (ins + TensorOf<[TF_Bfloat16, TF_Float32]>:$a, + TensorOf<[TF_Bfloat16, TF_Float32]>:$b, + + DefaultValuedAttr:$transpose_a, + DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$a_is_sparse, + DefaultValuedAttr:$b_is_sparse + ); + + let results = (outs + TF_Float32Tensor:$product + ); + + TF_DerivedOperandTypeAttr Ta = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tb = TF_DerivedOperandTypeAttr<1>; +} + def TF_SparseReshapeOp : TF_Op<"SparseReshape", [NoSideEffect]> { let summary = [{ Reshapes a SparseTensor to represent values in a new dense shape. @@ -9856,14 +11452,14 @@ has length `R_out`, then `input_indices` has shape `[N, R_in]`, }]; let arguments = (ins - I64Tensor:$input_indices, - I64Tensor:$input_shape, - I64Tensor:$new_shape + TF_Int64Tensor:$input_indices, + TF_Int64Tensor:$input_shape, + TF_Int64Tensor:$new_shape ); let results = (outs - I64Tensor:$output_indices, - I64Tensor:$output_shape + TF_Int64Tensor:$output_indices, + TF_Int64Tensor:$output_shape ); } @@ -9879,13 +11475,13 @@ See `tf.sparse.segment_sum` for usage examples. }]; let arguments = (ins - TensorOf<[BF16, F32, F64]>:$data, + TensorOf<[TF_Bfloat16, TF_Float32, TF_Float64]>:$data, TF_I32OrI64Tensor:$indices, TF_I32OrI64Tensor:$segment_ids ); let results = (outs - TensorOf<[BF16, F32, F64]>:$output + TensorOf<[TF_Bfloat16, TF_Float32, TF_Float64]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9908,13 +11504,13 @@ Inputs are the logits, not probabilities. }]; let arguments = (ins - TF_FpTensor:$features, + TF_FloatTensor:$features, TF_I32OrI64Tensor:$labels ); let results = (outs - TF_FpTensor:$loss, - TF_FpTensor:$backprop + TF_FloatTensor:$loss, + TF_FloatTensor:$backprop ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -9969,7 +11565,7 @@ def TF_SplitOp : TF_Op<"Split", [NoSideEffect]> { let summary = "Splits a tensor into `num_split` tensors along one dimension."; let arguments = (ins - I32Tensor:$split_dim, + TF_Int32Tensor:$split_dim, TF_Tensor:$value ); @@ -9989,7 +11585,7 @@ def TF_SplitVOp : TF_Op<"SplitV", [NoSideEffect]> { let arguments = (ins TF_Tensor:$value, TF_I32OrI64Tensor:$size_splits, - I32Tensor:$split_dim + TF_Int32Tensor:$split_dim ); let results = (outs @@ -10049,11 +11645,11 @@ I.e., \\(y = x * x = x^2\\). 
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10071,12 +11667,12 @@ def TF_SquaredDifferenceOp : TF_Op<"SquaredDifference", [Commutative, NoSideEffe }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I32, I64, TF_Complex128, TF_Complex64]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10123,7 +11719,7 @@ def TF_StackCloseV2Op : TF_Op<"StackCloseV2", []> { let summary = "Delete the stack from its resource container."; let arguments = (ins - TF_ResourceTensor:$handle + Arg:$handle ); let results = (outs); @@ -10133,7 +11729,7 @@ def TF_StackPopV2Op : TF_Op<"StackPopV2", []> { let summary = "Pop the element at the top of the stack."; let arguments = (ins - TF_ResourceTensor:$handle + Arg:$handle ); let results = (outs @@ -10147,7 +11743,7 @@ def TF_StackPushV2Op : TF_Op<"StackPushV2", []> { let summary = "Push an element onto the stack."; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, TF_Tensor:$elem, DefaultValuedAttr:$swap_memory @@ -10164,23 +11760,23 @@ def TF_StackV2Op : TF_Op<"StackV2", []> { let summary = "A stack that produces elements in first-in last-out order."; let arguments = (ins - I32Tensor:$max_size, + TF_Int32Tensor:$max_size, TypeAttr:$elem_type, StrAttr:$stack_name ); let results = (outs - TF_ResourceTensor:$handle + Res:$handle ); } -def TF_StatelessMultinomialOp : TF_Op<"StatelessMultinomial", [NoSideEffect]> { +def TF_StatelessMultinomialOp : TF_Op<"StatelessMultinomial", [NoSideEffect, TF_NoConstantFold]> { let summary = "Draws samples from a multinomial distribution."; let arguments = (ins TF_IntOrFpTensor:$logits, - I32Tensor:$num_samples, + TF_Int32Tensor:$num_samples, TF_I32OrI64Tensor:$seed ); @@ -10193,7 +11789,82 @@ def TF_StatelessMultinomialOp : TF_Op<"StatelessMultinomial", [NoSideEffect]> { TF_DerivedResultTypeAttr output_dtype = TF_DerivedResultTypeAttr<0>; } -def TF_StatelessRandomNormalOp : TF_Op<"StatelessRandomNormal", [NoSideEffect]> { +def TF_StatelessParameterizedTruncatedNormalOp : TF_Op<"StatelessParameterizedTruncatedNormal", [NoSideEffect, TF_NoConstantFold]> { + let summary = ""; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$means, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$stddevs, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$minvals, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$maxvals + ); + + let results = (outs + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$output + ); + + 
TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_StatelessRandomBinomialOp : TF_Op<"StatelessRandomBinomial", [NoSideEffect, TF_NoConstantFold]> { + let summary = [{ +Outputs deterministic pseudorandom random numbers from a binomial distribution. + }]; + + let description = [{ +Outputs random values from a binomial distribution. + +The outputs are a deterministic function of `shape`, `seed`, `counts`, and `probs`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$counts, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$probs + ); + + let results = (outs + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$output + ); + + TF_DerivedOperandTypeAttr S = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatelessRandomGammaV2Op : TF_Op<"StatelessRandomGammaV2", [NoSideEffect, TF_NoConstantFold]> { + let summary = [{ +Outputs deterministic pseudorandom random numbers from a gamma distribution. + }]; + + let description = [{ +Outputs random values from a gamma distribution. + +The outputs are a deterministic function of `shape`, `seed`, and `alpha`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$alpha + ); + + let results = (outs + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_StatelessRandomNormalOp : TF_Op<"StatelessRandomNormal", [NoSideEffect, TF_NoConstantFold]> { let summary = [{ Outputs deterministic pseudorandom values from a normal distribution. }]; @@ -10210,7 +11881,7 @@ The outputs are a deterministic function of `shape` and `seed`. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10218,7 +11889,34 @@ The outputs are a deterministic function of `shape` and `seed`. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } -def TF_StatelessRandomUniformOp : TF_Op<"StatelessRandomUniform", [NoSideEffect]> { +def TF_StatelessRandomPoissonOp : TF_Op<"StatelessRandomPoisson", [NoSideEffect, TF_NoConstantFold]> { + let summary = [{ +Outputs deterministic pseudorandom random numbers from a Poisson distribution. + }]; + + let description = [{ +Outputs random values from a Poisson distribution. + +The outputs are a deterministic function of `shape`, `seed`, and `lam`. 
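The Stateless* RNG ops in this block share one contract: identical `shape`/`seed` (plus distribution parameters) yield identical outputs. A sketch of that property, assuming the public `tf.random.stateless_normal` wrapper:

```python
import tensorflow as tf

seed = tf.constant([7, 42], dtype=tf.int32)  # stateless RNG ops take a 2-element seed
a = tf.random.stateless_normal(shape=[3], seed=seed)
b = tf.random.stateless_normal(shape=[3], seed=seed)
# Unlike the stateful RNG ops, repeated calls with the same arguments agree exactly.
print(bool(tf.reduce_all(a == b)))  # True
```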
+ }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TF_I32OrI64Tensor:$seed, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$lam + ); + + let results = (outs + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Int32, TF_Int64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; + TF_DerivedOperandTypeAttr Rtype = TF_DerivedOperandTypeAttr<2>; +} + +def TF_StatelessRandomUniformOp : TF_Op<"StatelessRandomUniform", [NoSideEffect, TF_NoConstantFold]> { let summary = [{ Outputs deterministic pseudorandom random values from a uniform distribution. }]; @@ -10236,7 +11934,7 @@ The outputs are a deterministic function of `shape` and `seed`. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10244,7 +11942,32 @@ The outputs are a deterministic function of `shape` and `seed`. TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; } -def TF_StatelessRandomUniformIntOp : TF_Op<"StatelessRandomUniformInt", [NoSideEffect]> { +def TF_StatelessRandomUniformFullIntOp : TF_Op<"StatelessRandomUniformFullInt", [NoSideEffect, TF_NoConstantFold]> { + let summary = [{ +Outputs deterministic pseudorandom random integers from a uniform distribution. + }]; + + let description = [{ +The generated values are uniform integers covering the whole range of `dtype`. + +The outputs are a deterministic function of `shape` and `seed`. + }]; + + let arguments = (ins + TF_I32OrI64Tensor:$shape, + TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$seed + ); + + let results = (outs + TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tseed = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatelessRandomUniformIntOp : TF_Op<"StatelessRandomUniformInt", [NoSideEffect, TF_NoConstantFold]> { let summary = [{ Outputs deterministic pseudorandom random integers from a uniform distribution. }]; @@ -10271,7 +11994,7 @@ The outputs are a deterministic function of `shape`, `seed`, `minval`, and `maxv TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<2>; } -def TF_StatelessTruncatedNormalOp : TF_Op<"StatelessTruncatedNormal", [NoSideEffect]> { +def TF_StatelessTruncatedNormalOp : TF_Op<"StatelessTruncatedNormal", [NoSideEffect, TF_NoConstantFold]> { let summary = [{ Outputs deterministic pseudorandom values from a truncated normal distribution. }]; @@ -10290,7 +12013,7 @@ The outputs are a deterministic function of `shape` and `seed`. 
); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10566,7 +12289,7 @@ array([0, 2, 2]) ); let results = (outs - I64Tensor:$output + TF_Int64Tensor:$output ); } @@ -10580,12 +12303,12 @@ def TF_SubOp : TF_Op<"Sub", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBi }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint8]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint32, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint32, TF_Uint8]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint32, TF_Uint8]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint32, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10606,23 +12329,22 @@ retained with length 1. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, TF_I32OrI64Tensor:$reduction_indices, DefaultValuedAttr:$keep_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; - let builders = [OpBuilder< - "OpBuilder &builder, OperationState &result, Value input, " - "Value reduction_indices, BoolAttr keep_dims" - >]; + let builders = [ + OpBuilder<"Value input, Value reduction_indices, BoolAttr keep_dims"> + ]; } def TF_SymbolicGradientOp : TF_Op<"SymbolicGradient", [NoSideEffect]> { @@ -10687,7 +12409,7 @@ For internal use only. let arguments = (ins TF_Tensor:$input, - I64Tensor:$layout + TF_Int64Tensor:$layout ); let results = (outs @@ -10697,6 +12419,30 @@ For internal use only. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_TPUEmbeddingActivationsOp : TF_Op<"TPUEmbeddingActivations", [NoSideEffect]> { + let summary = "An op enabling differentiation of TPU Embeddings."; + + let description = [{ +This op simply returns its first input, which is assumed to have been sliced +from the Tensors returned by TPUEmbeddingDequeueActivations. The presence of +this op, and its first argument being a trainable Variable, enables automatic +differentiation of graphs containing embeddings via the TPU Embedding Python +libraries. 
+ }]; + + let arguments = (ins + TF_Float32Tensor:$embedding_variable, + TF_Float32Tensor:$sliced_activations, + + Confined]>:$table_id, + Confined]>:$lookup_id + ); + + let results = (outs + TF_Float32Tensor:$output + ); +} + def TF_TPUExecuteOp : TF_Op<"TPUExecute", []> { let summary = "Op that loads and executes a TPU program on a TPU device."; @@ -10765,7 +12511,7 @@ For internal use only. ); let results = (outs - I64Tensor:$layout + TF_Int64Tensor:$layout ); } @@ -10781,7 +12527,7 @@ consumed by TPUPartitionedCall. let arguments = (ins); let results = (outs - I32Tensor:$device_ordinals + TF_Int32Tensor:$device_ordinals ); } @@ -10858,9 +12604,9 @@ variables. }]; let arguments = (ins - Variadic:$vars, + Arg, "", [TF_VariableRead, TF_VariableWrite]>:$vars, TF_StrTensor:$new_format_key, - TF_ResourceTensor:$format_state_var + Arg:$format_state_var ); let results = (outs); @@ -10884,11 +12630,11 @@ Given an input tensor, this function computes tangent of every }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$x + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$x ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -10902,10 +12648,11 @@ Given an input tensor, this function computes hyperbolic tangent of every element in the tensor. Input range is `[-inf, inf]` and output range is `[-1,1]`. - ```python - x = tf.constant([-float("inf"), -5, -0.5, 1, 1.2, 2, 3, float("inf")]) - tf.math.tanh(x) ==> [-1. -0.99990916 -0.46211717 0.7615942 0.8336547 0.9640276 0.9950547 1.] - ``` + >>> x = tf.constant([-float("inf"), -5, -0.5, 1, 1.2, 2, 3, float("inf")]) + >>> tf.math.tanh(x) + }]; let arguments = (ins @@ -10948,7 +12695,7 @@ of a step/run. }]; let arguments = (ins - TF_ResourceTensor:$handle + Arg:$handle ); let results = (outs); @@ -10972,15 +12719,15 @@ All elements must have the same shape (excepting the first dimension). }]; let arguments = (ins - TF_ResourceTensor:$handle, - F32Tensor:$flow_in, + Arg:$handle, + TF_Float32Tensor:$flow_in, DefaultValuedAttr:$element_shape_except0 ); let results = (outs TF_Tensor:$value, - I64Tensor:$lengths + TF_Int64Tensor:$lengths ); TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; @@ -10996,9 +12743,9 @@ All elements selected by `indices` must have the same shape. }]; let arguments = (ins - TF_ResourceTensor:$handle, - I32Tensor:$indices, - F32Tensor:$flow_in, + Arg:$handle, + TF_Int32Tensor:$indices, + TF_Float32Tensor:$flow_in, DefaultValuedAttr:$element_shape ); @@ -11055,15 +12802,15 @@ calculation gets its own TensorArray accumulator. 
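These TensorArrayV3 ops are normally reached through the Python-level `tf.TensorArray` wrapper rather than called directly; a minimal eager sketch (an illustration only, since the exact lowering differs between graph and eager mode):

```python
import tensorflow as tf

ta = tf.TensorArray(dtype=tf.float32, size=3)
ta = ta.write(0, 10.0)   # push elements by index
ta = ta.write(1, 20.0)
ta = ta.write(2, 30.0)
print(ta.read(1).numpy())   # 20.0
print(ta.stack().numpy())   # [10. 20. 30.]
```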
}]; let arguments = (ins - TF_ResourceTensor:$handle, - F32Tensor:$flow_in, + Arg:$handle, + TF_Float32Tensor:$flow_in, StrAttr:$source ); let results = (outs - TF_ResourceTensor:$grad_handle, - F32Tensor:$flow_out + Res:$grad_handle, + TF_Float32Tensor:$flow_out ); } @@ -11071,9 +12818,9 @@ def TF_TensorArrayReadV3Op : TF_Op<"TensorArrayReadV3", []> { let summary = "Read an element from the TensorArray into output `value`."; let arguments = (ins - TF_ResourceTensor:$handle, - I32Tensor:$index, - F32Tensor:$flow_in + Arg:$handle, + TF_Int32Tensor:$index, + TF_Float32Tensor:$flow_in ); let results = (outs @@ -11093,14 +12840,14 @@ Scatter the data from the input value into specific TensorArray elements. }]; let arguments = (ins - TF_ResourceTensor:$handle, - I32Tensor:$indices, + Arg:$handle, + TF_Int32Tensor:$indices, TF_Tensor:$value, - F32Tensor:$flow_in + TF_Float32Tensor:$flow_in ); let results = (outs - F32Tensor:$flow_out + TF_Float32Tensor:$flow_out ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; @@ -11110,12 +12857,12 @@ def TF_TensorArraySizeV3Op : TF_Op<"TensorArraySizeV3", []> { let summary = "Get the current size of the TensorArray."; let arguments = (ins - TF_ResourceTensor:$handle, - F32Tensor:$flow_in + Arg:$handle, + TF_Float32Tensor:$flow_in ); let results = (outs - I32Tensor:$size + TF_Int32Tensor:$size ); } @@ -11145,14 +12892,14 @@ and having size }]; let arguments = (ins - TF_ResourceTensor:$handle, + Arg:$handle, TF_Tensor:$value, - I64Tensor:$lengths, - F32Tensor:$flow_in + TF_Int64Tensor:$lengths, + TF_Float32Tensor:$flow_in ); let results = (outs - F32Tensor:$flow_out + TF_Float32Tensor:$flow_out ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<1>; @@ -11166,7 +12913,7 @@ Write data via Write and read via Read or Pack. }]; let arguments = (ins - I32Tensor:$size, + TF_Int32Tensor:$size, TypeAttr:$dtype, DefaultValuedAttr:$element_shape, @@ -11177,8 +12924,8 @@ Write data via Write and read via Read or Pack. ); let results = (outs - TF_ResourceTensor:$handle, - F32Tensor:$flow + Res:$handle, + TF_Float32Tensor:$flow ); } @@ -11186,14 +12933,14 @@ def TF_TensorArrayWriteV3Op : TF_Op<"TensorArrayWriteV3", []> { let summary = "Push an element onto the tensor_array."; let arguments = (ins - TF_ResourceTensor:$handle, - I32Tensor:$index, + Arg:$handle, + TF_Int32Tensor:$index, TF_Tensor:$value, - F32Tensor:$flow_in + TF_Float32Tensor:$flow_in ); let results = (outs - F32Tensor:$flow_out + TF_Float32Tensor:$flow_out ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; @@ -11219,12 +12966,12 @@ lengths: Output tensor containing sizes of the 0th dimension of tensors in the l let arguments = (ins TF_VariantTensor:$input_handle, TF_I32OrI64Tensor:$element_shape, - I64Tensor:$leading_dims + TF_Int64Tensor:$leading_dims ); let results = (outs TF_Tensor:$tensor, - I64Tensor:$lengths + TF_Int64Tensor:$lengths ); TF_DerivedOperandTypeAttr shape_type = TF_DerivedOperandTypeAttr<1>; @@ -11291,8 +13038,8 @@ values: The tensor. 
let arguments = (ins TF_VariantTensor:$input_handle, - I32Tensor:$indices, - I32Tensor:$element_shape + TF_Int32Tensor:$indices, + TF_Int32Tensor:$element_shape ); let results = (outs @@ -11307,8 +13054,8 @@ def TF_TensorListGetItemOp : TF_Op<"TensorListGetItem", [NoSideEffect]> { let arguments = (ins TF_VariantTensor:$input_handle, - I32Tensor:$index, - I32Tensor:$element_shape + TF_Int32Tensor:$index, + TF_Int32Tensor:$element_shape ); let results = (outs @@ -11331,7 +13078,7 @@ length: the number of tensors in the list ); let results = (outs - I32Tensor:$length + TF_Int32Tensor:$length ); } @@ -11351,7 +13098,7 @@ element_shape: the shape of the output tensor let arguments = (ins TF_VariantTensor:$input_handle, - I32Tensor:$element_shape + TF_Int32Tensor:$element_shape ); let results = (outs @@ -11397,7 +13144,7 @@ size: size of the output list let arguments = (ins TF_VariantTensor:$input_handle, - I32Tensor:$size + TF_Int32Tensor:$size ); let results = (outs @@ -11421,7 +13168,7 @@ output_handle: The TensorList. let arguments = (ins TF_VariantTensor:$input_handle, TF_Tensor:$tensor, - I32Tensor:$indices + TF_Int32Tensor:$indices ); let results = (outs @@ -11436,7 +13183,7 @@ def TF_TensorListSetItemOp : TF_Op<"TensorListSetItem", [NoSideEffect]> { let arguments = (ins TF_VariantTensor:$input_handle, - I32Tensor:$index, + TF_Int32Tensor:$index, TF_Tensor:$item ); @@ -11460,7 +13207,7 @@ num_elements: optional. If not -1, the number of elements in the list. let arguments = (ins TF_VariantTensor:$input_handle, - I32Tensor:$element_shape, + TF_Int32Tensor:$element_shape, DefaultValuedAttr:$num_elements ); @@ -11573,14 +13320,46 @@ On GPU, if an out of bound index is found, the index is ignored. let verifier = [{ return Verify(*this); }]; let builders = [ - OpBuilder< - "OpBuilder& builder, OperationState& result, " - "Value tensor, Value indices, Value updates", - [{build(builder, result, tensor.getType(), tensor, indices, updates);}] + OpBuilder<"Value tensor, Value indices, Value updates", + [{build($_builder, $_state, tensor.getType(), tensor, indices, updates);}] > ]; } +def TF_TensorStridedSliceUpdateOp : TF_Op<"TensorStridedSliceUpdate", [NoSideEffect]> { + let summary = "Assign `value` to the sliced l-value reference of `input`."; + + let description = [{ +The values of `value` are assigned to the positions in the tensor `input` that +are selected by the slice parameters. The slice parameters `begin` `end` +`strides` etc. work exactly as in `StridedSlice`. + +NOTE this op currently does not support broadcasting and so `value`'s shape +must be exactly the shape produced by the slice of `input`. + }]; + + let arguments = (ins + TF_Tensor:$input, + TF_I32OrI64Tensor:$begin, + TF_I32OrI64Tensor:$end, + TF_I32OrI64Tensor:$strides, + TF_Tensor:$value, + + DefaultValuedAttr:$begin_mask, + DefaultValuedAttr:$end_mask, + DefaultValuedAttr:$ellipsis_mask, + DefaultValuedAttr:$new_axis_mask, + DefaultValuedAttr:$shrink_axis_mask + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Index = TF_DerivedOperandTypeAttr<1>; +} + def TF_TileOp : TF_Op<"Tile", [NoSideEffect]> { let summary = "Constructs a tensor by tiling a given tensor."; @@ -11625,9 +13404,9 @@ array([[1, 2, 3, 1, 2, 3], TF_DerivedOperandTypeAttr Tmultiples = TF_DerivedOperandTypeAttr<1>; TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - // TODO(parkers): Add folds for multiples = [1,...]. 
- // TODO(parkers): Add errors for negative multiples and multiples.size() != - // input.rank() + let verifier = [{ return Verify(*this); }]; + + let hasFolder = 1; } def TF_TopKV2Op : TF_Op<"TopKV2", [NoSideEffect]> { @@ -11650,14 +13429,14 @@ If two elements are equal, the lower-index element appears first. let arguments = (ins TF_IntOrFpTensor:$input, - I32Tensor:$k, + TF_Int32Tensor:$k, DefaultValuedAttr:$sorted ); let results = (outs TF_IntOrFpTensor:$values, - I32Tensor:$indices + TF_Int32Tensor:$indices ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -11686,8 +13465,7 @@ The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy: TF_DerivedOperandTypeAttr Tperm = TF_DerivedOperandTypeAttr<1>; let builders = [ - OpBuilder< - "OpBuilder& builder, OperationState& result, Value x, Value perm"> + OpBuilder<"Value x, Value perm"> ]; let verifier = [{ @@ -11712,12 +13490,12 @@ Python Semantics. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$x, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$y + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$y ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Uint16, TF_Uint8]>:$z + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Uint16, TF_Uint8]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -11768,7 +13546,7 @@ deviations from the mean are dropped and re-picked. ); let results = (outs - TF_FpTensor:$output + TF_FloatTensor:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -11986,13 +13764,13 @@ dropped, and will not be included in the result. 
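A small eager check of the drop-negative-ids rule above, assuming the public `tf.math.unsorted_segment_sum` wrapper:

```python
import tensorflow as tf

data = tf.constant([1, 2, 3, 4])
segment_ids = tf.constant([0, -1, 1, 0])  # the -1 entry is dropped, as described above
print(tf.math.unsorted_segment_sum(data, segment_ids, num_segments=2).numpy())  # [5 3]
```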
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, TF_I32OrI64Tensor:$segment_ids, TF_I32OrI64Tensor:$num_segments ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -12035,13 +13813,13 @@ tf.unsorted_segment_sum(c, tf.constant([0, 1, 0]), num_segments=2) }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$data, TF_I32OrI64Tensor:$segment_ids, TF_I32OrI64Tensor:$num_segments ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -12095,11 +13873,11 @@ Checks whether a resource handle-based variable has been initialized. }]; let arguments = (ins - TF_ResourceTensor:$resource + Arg:$resource ); let results = (outs - I1Tensor:$is_initialized + TF_BoolTensor:$is_initialized ); let hasCanonicalizer = 1; @@ -12120,7 +13898,7 @@ shape(t) ==> [2, 2, 3] }]; let arguments = (ins - TF_ResourceTensor:$input + Arg:$input ); let results = (outs @@ -12226,161 +14004,27 @@ where(input) ==> [[0, 0, 0], }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input + TensorOf<[TF_Bfloat16, TF_Bool, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input ); let results = (outs - I64Tensor:$index + TF_Int64Tensor:$index ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_WriteAudioSummaryOp : TF_Op<"WriteAudioSummary", []> { - let summary = "Writes an audio summary."; - - let description = [{ -Writes encoded audio summary `tensor` at `step` with `tag` using summary `writer`. -`sample_rate` is the audio sample rate is Hz. 
- }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_StrTensor:$tag, - F32Tensor:$tensor, - F32Tensor:$sample_rate, - - Confined, [IntMinValue<1>]>:$max_outputs - ); - - let results = (outs); -} - -def TF_WriteGraphSummaryOp : TF_Op<"WriteGraphSummary", []> { - let summary = "Writes a graph summary."; - - let description = [{ -Writes TensorFlow graph `tensor` at `step` using summary `writer`. - }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_StrTensor:$tensor - ); - - let results = (outs); -} - -def TF_WriteHistogramSummaryOp : TF_Op<"WriteHistogramSummary", []> { - let summary = "Writes a histogram summary."; - - let description = [{ -Writes histogram `values` at `step` with `tag` using summary `writer`. - }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_StrTensor:$tag, - TF_IntOrFpTensor:$values - ); - - let results = (outs); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; -} - -def TF_WriteImageSummaryOp : TF_Op<"WriteImageSummary", []> { - let summary = "Writes an image summary."; - - let description = [{ -Writes image `tensor` at `step` with `tag` using summary `writer`. -`tensor` is image with shape [height, width, channels]. - }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_StrTensor:$tag, - TensorOf<[F16, F32, TF_Uint8]>:$tensor, - TF_Uint8Tensor:$bad_color, - - Confined, [IntMinValue<1>]>:$max_images - ); - - let results = (outs); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; -} - -def TF_WriteRawProtoSummaryOp : TF_Op<"WriteRawProtoSummary", []> { - let summary = "Writes a serialized proto summary."; - - let description = [{ -Writes `tensor`, a serialized proto at `step` using summary `writer`. - }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_StrTensor:$tensor - ); - - let results = (outs); -} - -def TF_WriteScalarSummaryOp : TF_Op<"WriteScalarSummary", []> { - let summary = "Writes a scalar summary."; - - let description = [{ -Writes scalar `value` at `step` with `tag` using summary `writer`. - }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_StrTensor:$tag, - TF_IntOrFpTensor:$value - ); - - let results = (outs); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; -} - -def TF_WriteSummaryOp : TF_Op<"WriteSummary", []> { - let summary = "Writes a tensor summary."; - - let description = [{ -Writes `tensor` at `step` with `tag` using summary `writer`. 
- }]; - - let arguments = (ins - TF_ResourceTensor:$writer, - I64Tensor:$step, - TF_Tensor:$tensor, - TF_StrTensor:$tag, - TF_StrTensor:$summary_metadata - ); - - let results = (outs); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; -} - def TF_XdivyOp : TF_Op<"Xdivy", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, WithBroadcastableBinOpBuilder { let summary = "Returns 0 if x == 0, and x / y otherwise, elementwise."; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$x, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$y ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$z + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12398,14 +14042,14 @@ for binary operators. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, - TF_I32OrI64Tensor:$broadcast_dims + Arg, [{the LHS input tensor}]>:$lhs, + Arg, [{the RHS input tensor}]>:$rhs, + Arg:$broadcast_dims ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs_output, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs_output + Res, [{the broadcasted LHS tensor}]>:$lhs_output, + Res, [{the broadcasted RHS tensor}]>:$rhs_output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; @@ -12421,20 +14065,20 @@ https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, - TF_I32OrI64Tensor:$window_strides, - TF_I32OrI64Tensor:$padding, - TF_I32OrI64Tensor:$lhs_dilation, - TF_I32OrI64Tensor:$rhs_dilation, - TF_I32OrI64Tensor:$feature_group_count, + Arg, [{the input tensor}]>:$lhs, + Arg, [{the kernel tensor}]>:$rhs, + Arg:$window_strides, + Arg:$padding, + Arg:$lhs_dilation, + Arg:$rhs_dilation, + Arg:$feature_group_count, StrAttr:$dimension_numbers, StrAttr:$precision_config ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; @@ -12450,15 +14094,15 @@ https://www.tensorflow.org/performance/xla/operation_semantics#dotgeneral }]; let arguments = (ins - 
TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$lhs, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$rhs, + Arg, [{the LHS tensor}]>:$lhs, + Arg, [{the RHS tensor}]>:$rhs, StrAttr:$dimension_numbers, StrAttr:$precision_config ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12479,8 +14123,11 @@ with dimension size equal to the rank of operand. }]; let arguments = (ins - TF_Tensor:$input, - TF_I32OrI64Tensor:$start_indices, + Arg:$input, + Arg:$start_indices, TF_I32OrI64Tensor:$size_indices ); @@ -12508,19 +14155,44 @@ Handling of out-of-bounds slice indices is implementation-defined. }]; let arguments = (ins - TF_Tensor:$input, - TF_Tensor:$update, - TF_I32OrI64Tensor:$indices + Arg:$input, + Arg:$update, + Arg:$indices ); let results = (outs - TF_Tensor:$output + Res:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaEinsumOp : TF_Op<"XlaEinsum", [NoSideEffect]> { + let summary = [{ +An op which supports basic einsum op with 2 inputs and 1 output. + }]; + + let description = [{ +This op has better TPU performance since it doesn't have explicitly reshape and +transpose operations as tf.einsum does. + }]; + + let arguments = (ins + TensorOf<[TF_Bfloat16, TF_Complex64, TF_Float32]>:$a, + TensorOf<[TF_Bfloat16, TF_Complex64, TF_Float32]>:$b, + + StrAttr:$equation + ); + + let results = (outs + TensorOf<[TF_Bfloat16, TF_Complex64, TF_Float32]>:$product + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaGatherOp : TF_Op<"XlaGather", [NoSideEffect]> { let summary = "Wraps the XLA Gather operator documented at"; @@ -12529,16 +14201,16 @@ https://www.tensorflow.org/xla/operation_semantics#gather }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$operand, - TF_I32OrI64Tensor:$start_indices, - TF_I32OrI64Tensor:$slice_sizes, + Arg, [{The array we're gathering from.}]>:$operand, + Arg:$start_indices, + Arg:$slice_sizes, StrAttr:$dimension_numbers, BoolAttr:$indices_are_sorted ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -12580,13 +14252,13 @@ Sorts a tensor. Currently only sorts in ascending order are supported. 
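The key/value sort described here can be emulated at the Python level with `tf.argsort` plus `tf.gather` (a sketch under that assumption; the XLA op itself is usually emitted by the compiler bridge rather than invoked by hand):

```python
import tensorflow as tf

keys = tf.constant([3, 1, 2])
values = tf.constant([30, 10, 20])
order = tf.argsort(keys)                 # ascending order, matching this op's contract
print(tf.gather(keys, order).numpy())    # [1 2 3]
print(tf.gather(values, order).numpy())  # [10 20 30]
```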
}]; let arguments = (ins - TF_IntOrFpTensor:$keys, - TF_Tensor:$values + Arg:$keys, + Arg:$values ); let results = (outs - TF_IntOrFpTensor:$sorted_keys, - TF_Tensor:$sorted_values + Res:$sorted_keys, + Res:$sorted_values ); TF_DerivedOperandTypeAttr V = TF_DerivedOperandTypeAttr<1>; @@ -12602,15 +14274,15 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad }]; let arguments = (ins - TF_Tensor:$input, - TF_Tensor:$padding_value, - TF_I32OrI64Tensor:$padding_low, - TF_I32OrI64Tensor:$padding_high, - TF_I32OrI64Tensor:$padding_interior + Arg:$input, + Arg:$padding_value, + Arg:$padding_low, + Arg:$padding_high, + Arg:$padding_interior ); let results = (outs - TF_Tensor:$output + Res:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<2>; @@ -12647,15 +14319,15 @@ https://www.tensorflow.org/performance/xla/operation_semantics#reduce . }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$init_value, + Arg, [{the input tensor}]>:$input, + Arg, [{a scalar representing the initial value for the reduction}]>:$init_value, I64ArrayAttr:$dimensions_to_reduce, SymbolRefAttr:$reducer ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12667,7 +14339,7 @@ def TF_XlaReplicaIdOp : TF_Op<"XlaReplicaId", [NoSideEffect]> { let arguments = (ins); let results = (outs - I32Tensor:$id + TF_Int32Tensor:$id ); } @@ -12679,9 +14351,10 @@ https://www.tensorflow.org/xla/operation_semantics#scatter. }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$operand, - TF_I32OrI64Tensor:$scatter_indices, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$updates, + Arg, [{Array to be scattered into.}]>:$operand, + Arg:$scatter_indices, + Arg, [{Array containing the values that must be used for scattering.}]>:$updates, SymbolRefAttr:$update_computation, StrAttr:$dimension_numbers, @@ -12689,7 +14362,7 @@ https://www.tensorflow.org/xla/operation_semantics#scatter. ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr Tindices = TF_DerivedOperandTypeAttr<1>; @@ -12710,7 +14383,7 @@ i=0...N-1. 
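The eigendecomposition contract above (`a @ v[..., :, i] == w[..., i] * v[..., :, i]`) matches the standard self-adjoint eigensolver; a sketch using the non-XLA `tf.linalg.eigh` wrapper to illustrate the same relation:

```python
import tensorflow as tf

a = tf.constant([[2.0, 1.0],
                 [1.0, 2.0]])   # self-adjoint (symmetric) input
w, v = tf.linalg.eigh(a)        # eigenvalues in ascending order, matching eigenvectors
print(w.numpy())                # [1. 3.]
# Check  a @ v[:, i] == w[i] * v[:, i]  for every column i.
print(tf.reduce_max(tf.abs(tf.matmul(a, v) - v * w)).numpy() < 1e-5)  # True
```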
}]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$a, + Arg, [{the input tensor.}]>:$a, BoolAttr:$lower, I64Attr:$max_iter, @@ -12718,8 +14391,10 @@ i=0...N-1. ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$w, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v + Res, [{The eigenvalues in ascending order, each repeated according to its +multiplicity.}]>:$w, + Res, [{The column v[..., :, i] is the normalized eigenvector corresponding to the +eigenvalue w[..., i].}]>:$v ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12758,7 +14433,7 @@ tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[ }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$a, + Arg, [{the input tensor.}]>:$a, I64Attr:$max_iter, F32Attr:$epsilon, @@ -12766,9 +14441,10 @@ tensor such that tensor[...,:,:] = u[..., :, :] * Diag(s[..., :]) * Transpose(v[ ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$s, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$u, - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$v + Res, [{Singular values. 
The values are sorted in reverse order of magnitude, so +s[..., 0] is the largest value, s[..., 1] is the second largest, etc.}]>:$s, + Res, [{Left singular vectors.}]>:$u, + Res, [{Right singular vectors.}]>:$v ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12778,12 +14454,12 @@ def TF_Xlog1pyOp : TF_Op<"Xlog1py", [NoSideEffect, TF_SameOperandsAndResultEleme let summary = "Returns 0 if x == 0, and x * log1p(y) otherwise, elementwise."; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$x, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$y ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$z + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12794,12 +14470,12 @@ def TF_XlogyOp : TF_Op<"Xlogy", [NoSideEffect, ResultsBroadcastableShape, TF_Sam let summary = "Returns 0 if x == 0, and x * log(y) otherwise, elementwise."; let arguments = (ins - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$x, - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$y + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$x, + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$y ); let results = (outs - TensorOf<[F16, F32, F64, TF_Complex128, TF_Complex64]>:$z + TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>:$z ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12828,12 +14504,12 @@ expected to create these operators. }]; let arguments = (ins - TensorOf<[F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, - Variadic>:$side_input, + TensorOf<[TF_Float16, TF_Float32]>:$x, + TF_Float32Tensor:$scale, + TF_Float32Tensor:$offset, + TF_Float32Tensor:$mean, + TF_Float32Tensor:$variance, + Variadic>:$side_input, DefaultValuedAttr:$epsilon, DefaultValuedAttr:$exponential_avg_factor, @@ -12843,12 +14519,12 @@ expected to create these operators. ); let results = (outs - TensorOf<[F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3 + TensorOf<[TF_Float16, TF_Float32]>:$y, + TF_Float32Tensor:$batch_mean, + TF_Float32Tensor:$batch_variance, + TF_Float32Tensor:$reserve_space_1, + TF_Float32Tensor:$reserve_space_2, + TF_Float32Tensor:$reserve_space_3 ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12893,7 +14569,8 @@ create these operators. DefaultValuedAttr:$dilations, DefaultValuedAttr:$use_cudnn_on_gpu, DefaultValuedAttr:$fused_ops, - DefaultValuedAttr:$epsilon + DefaultValuedAttr:$epsilon, + DefaultValuedAttr:$leakyrelu_alpha ); let results = (outs @@ -12930,9 +14607,9 @@ expected to create these operators. }]; let arguments = (ins - TensorOf<[BF16, F32]>:$a, - TensorOf<[BF16, F32]>:$b, - Variadic>:$args, + TensorOf<[TF_Bfloat16, TF_Float32]>:$a, + TensorOf<[TF_Bfloat16, TF_Float32]>:$b, + Variadic>:$args, DefaultValuedAttr:$transpose_a, DefaultValuedAttr:$transpose_b, @@ -12941,7 +14618,7 @@ expected to create these operators. 
); let results = (outs - TensorOf<[BF16, F32]>:$product + TensorOf<[TF_Bfloat16, TF_Float32]>:$product ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -12959,13 +14636,19 @@ Tensor of activations per table specified in the model. }]; let arguments = (ins - TF_VariantTensor:$deduplication_data, + Arg:$deduplication_data, StrAttr:$config ); let results = (outs - Variadic:$outputs + Res, [{A TensorList of embedding activations containing one Tensor per +embedding table in the model.}]>:$outputs ); TF_DerivedResultSizeAttr num_tables = TF_DerivedResultSizeAttr<0>; @@ -12991,7 +14674,7 @@ look up the program in the compilation cache. }]; let arguments = (ins - Variadic:$dynamic_shapes, + Variadic:$dynamic_shapes, StrAttr:$mlir_module, StrAttr:$metadata @@ -13034,13 +14717,13 @@ expected to create these operators. }]; let arguments = (ins - TensorOf<[F16, F32, F64]>:$x, + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$x, StrArrayAttr:$op_names ); let results = (outs - TensorOf<[F16, F32, F64]>:$y + TensorOf<[TF_Float16, TF_Float32, TF_Float64]>:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -13052,7 +14735,7 @@ A pseudo-op to represent host-side computation in an XLA program. }]; let arguments = (ins - Variadic:$inputs, + Arg, [{A list of tensors that will be sent to the host.}]>:$inputs, StrAttr:$send_key, StrAttr:$recv_key, @@ -13060,7 +14743,7 @@ A pseudo-op to represent host-side computation in an XLA program. ); let results = (outs - Variadic:$outputs + Res, [{A list of tensors that will be returned to the device.}]>:$outputs ); TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; @@ -13073,14 +14756,15 @@ A placeholder op to receive values from a running XLA computation. }]; let arguments = (ins - TF_StrTensor:$dynamic_key, + Arg:$dynamic_key, StrAttr:$key, I64Attr:$device_ordinal ); let results = (outs - Variadic:$outputs + Res, [{A list of tensors that will be received from the XLA computation.}]>:$outputs ); TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; @@ -13090,8 +14774,9 @@ def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { let summary = "A placeholder op to send values to a running XLA computation."; let arguments = (ins - Variadic:$inputs, - TF_StrTensor:$dynamic_key, + Arg, [{A list of tensors that will be sent to the XLA computation.}]>:$inputs, + Arg:$dynamic_key, StrAttr:$key, I64Attr:$device_ordinal diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index 1755c975c23..15c0d7b10f7 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -46,7 +46,7 @@ Invariants: TODO: Make invariants more structured so that we can reference them in ops. }]; - let cppNamespace = "TF"; + let cppNamespace = "::mlir::TF"; } //===----------------------------------------------------------------------===// @@ -73,6 +73,9 @@ def TF_LayoutAgnostic : NativeOpTrait<"TF::LayoutAgnostic">; // certain state around within their implementations. def TF_CannotDuplicate : NativeOpTrait<"TF::CannotDuplicate">; +// Trait to indicate an operation cannot be constant folded. +def TF_NoConstantFold : NativeOpTrait<"TF::NoConstantFold">; + // Coefficient wise binary operation with implicit broadcasting support, for // example tf.Sub operation. 
def TF_CwiseBinary : NativeOpTrait<"TF::CwiseBinary">; @@ -108,14 +111,44 @@ class TF_ResourceBase : def TF_VariableResource : TF_ResourceBase<"Variable">; def TF_StackResource : TF_ResourceBase<"Stack">; def TF_TensorArrayResource : TF_ResourceBase<"TensorArray">; +def TF_SummaryResource : TF_ResourceBase<"Summary">; +def TF_LookupTableResource : TF_ResourceBase<"LookupTable">; +def TF_DatasetSeedGeneratorResource : TF_ResourceBase<"DatasetSeedGenerator">; +def TF_DatasetMemoryCacheResource : TF_ResourceBase<"DatasetMemoryCache">; +def TF_DatasetIteratorResource : TF_ResourceBase<"DatasetIterator">; def TF_VariableRead : MemRead; def TF_StackRead : MemRead; def TF_TensorArrayRead : MemRead; +def TF_LookupTableRead : MemRead; +def TF_DatasetSeedGeneratorRead : MemRead; +def TF_DatasetMemoryCacheRead : MemRead; +def TF_DatasetIteratorRead : MemRead; def TF_VariableWrite : MemWrite; def TF_StackWrite : MemWrite; def TF_TensorArrayWrite : MemWrite; +def TF_SummaryWrite : MemWrite; +def TF_LookupTableWrite : MemWrite; +def TF_DatasetSeedGeneratorWrite : MemWrite; +def TF_DatasetMemoryCacheWrite : MemWrite; +def TF_DatasetIteratorWrite : MemWrite; + +def TF_VariableAlloc : MemAlloc; +def TF_StackAlloc : MemAlloc; +def TF_TensorArrayAlloc : MemAlloc; +def TF_SummaryAlloc : MemAlloc; +def TF_LookupTableAlloc : MemAlloc; +def TF_DatasetSeedGeneratorAlloc : MemAlloc; +def TF_DatasetMemoryCacheAlloc : MemAlloc; +def TF_DatasetIteratorAlloc : MemAlloc; + +def TF_StackFree : MemFree; +def TF_TensorArrayFree : MemFree; +def TF_SummaryFree : MemFree; +def TF_DatasetSeedGeneratorFree : MemFree; +def TF_DatasetMemoryCacheFree : MemFree; +def TF_DatasetIteratorFree : MemFree; //===----------------------------------------------------------------------===// // TensorFlow op definitions @@ -157,118 +190,194 @@ class TF_TensorFlowType : "TensorFlow " # description # " type">, BuildableType<"getType()">; -// Any tensor element type allowed in TensorFlow ops -def TF_ElementType : Type, - "tf.dtype">; +//===----------------------------------------------------------------------===// +// Reference types -// Any TensorFlow tensor type -def TF_Tensor : TensorOf<[TF_ElementType]>; +// Float reference types +def TF_Float16Ref : TF_TensorFlowType<"HalfRef", "f16ref">; +def TF_Float32Ref : TF_TensorFlowType<"FloatRef", "f32ref">; +def TF_Float64Ref : TF_TensorFlowType<"DoubleRef", "f64ref">; +def TF_Bfloat16Ref : TF_TensorFlowType<"Bfloat16Ref", "bf16ref">; + +// Complex reference types +def TF_Complex64Ref : TF_TensorFlowType<"Complex64Ref", "complex64ref">; +def TF_Complex128Ref : TF_TensorFlowType<"Complex128Ref", "complex128ref">; + +// Integer reference types +def TF_Int8Ref : TF_TensorFlowType<"Int8Ref", "i8ref">; +def TF_Int16Ref : TF_TensorFlowType<"Int16Ref", "i16ref">; +def TF_Int32Ref : TF_TensorFlowType<"Int32Ref", "i32ref">; +def TF_Int64Ref : TF_TensorFlowType<"Int64Ref", "i64ref">; + +def TF_Uint8Ref : TF_TensorFlowType<"Uint8Ref", "ui8ref">; +def TF_Uint16Ref : TF_TensorFlowType<"Uint16Ref", "ui16ref">; +def TF_Uint32Ref : TF_TensorFlowType<"Uint32Ref", "ui32ref">; +def TF_Uint64Ref : TF_TensorFlowType<"Uint64Ref", "ui64ref">; + +// Quantized reference types +def TF_Qint8Ref : TF_TensorFlowType<"Qint8Ref", "qint8ref">; +def TF_Qint16Ref : TF_TensorFlowType<"Qint16Ref", "qint16ref">; +def TF_Qint32Ref : TF_TensorFlowType<"Qint32Ref", "qint32ref">; +def TF_Quint8Ref : TF_TensorFlowType<"Quint8Ref", "quint8ref">; +def TF_Quint16Ref : TF_TensorFlowType<"Quint16Ref", "quint16ref">; + +// Other reference types 
+def TF_BoolRef : TF_TensorFlowType<"BoolRef", "boolref">; +def TF_ResourceRef : TF_TensorFlowType<"ResourceRef", "resourceref">; +def TF_StrRef : TF_TensorFlowType<"StringRef", "stringref">; +def TF_VariantRef : TF_TensorFlowType<"VariantRef", "variantref">; //===----------------------------------------------------------------------===// -// Integer types +// Integer types (including corresponding reference types) -def TF_I32Or64 : SignlessIntOfWidths<[32, 64]>; +def TF_Bool : AnyTypeOf<[I<1>, TF_BoolRef], "bool">; -def TF_I32OrI64Tensor : TensorOf<[TF_I32Or64]>; +def TF_Int8 : AnyTypeOf<[I8, TF_Int8Ref], "8-bit integer">; +def TF_Int16 : AnyTypeOf<[I16, TF_Int16Ref], "16-bit integer">; +def TF_Int32 : AnyTypeOf<[I32, TF_Int32Ref], "32-bit integer">; +def TF_Int64 : AnyTypeOf<[I64, TF_Int64Ref], "64-bit integer">; +def TF_I32OrI64 : AnyTypeOf<[I32, I64, TF_Int32Ref, TF_Int64Ref], + "32/64-bit signed integer">; -def TF_Uint8 : UI<8>; -def TF_Uint8Tensor : TensorOf<[TF_Uint8]>; - -def TF_Uint16 : UI<16>; -def TF_Uint16Tensor : TensorOf<[TF_Uint16]>; - -def TF_Uint32 : UI<32>; -def TF_Uint32Tensor : TensorOf<[TF_Uint32]>; - -def TF_Uint64 : UI<64>; -def TF_Uint64Tensor : TensorOf<[TF_Uint64]>; +def TF_Uint8 : AnyTypeOf<[UI<8>, TF_Uint8Ref], "8-bit unsigned integer">; +def TF_Uint16 : AnyTypeOf<[UI<16>, TF_Uint16Ref], "16-bit unsigned integer">; +def TF_Uint32 : AnyTypeOf<[UI<32>, TF_Uint32Ref], "32-bit unsigned integer">; +def TF_Uint64 : AnyTypeOf<[UI<64>, TF_Uint64Ref], "64-bit unsigned integer">; // Any unsigned integer type -def TF_UInt : UnsignedIntOfWidths<[8, 16, 32, 64]>; +def TF_UInt : AnyTypeOf<[TF_Uint8, TF_Uint16, TF_Uint32, TF_Uint64], + "unsigned integer">; // Any signed integer type -def TF_SInt : SignlessIntOfWidths<[8, 16, 32, 64]>; +def TF_SInt : AnyTypeOf<[TF_Int8, TF_Int16, TF_Int32, TF_Int64], + "signed integer">; // Any integer type -def TF_Int : AnyTypeOf<[TF_SInt, TF_UInt]>; +def TF_Int : AnyTypeOf<[TF_SInt, TF_UInt], "integer">; + +// Tensor types +def TF_BoolTensor : TensorOf<[TF_Bool]>; -// Any integer tensor types def TF_IntTensor : TensorOf<[TF_Int]>; +def TF_Int8Tensor : TensorOf<[TF_Int8]>; +def TF_Int16Tensor : TensorOf<[TF_Int16]>; +def TF_Int32Tensor : TensorOf<[TF_Int32]>; +def TF_Int64Tensor : TensorOf<[TF_Int64]>; +def TF_I32OrI64Tensor : TensorOf<[TF_I32OrI64]>; + +def TF_Uint8Tensor : TensorOf<[TF_Uint8]>; +def TF_Uint16Tensor : TensorOf<[TF_Uint16]>; +def TF_Uint32Tensor : TensorOf<[TF_Uint32]>; +def TF_Uint64Tensor : TensorOf<[TF_Uint64]>; //===----------------------------------------------------------------------===// -// Quantized types -def TF_Qint8 : TF_TensorFlowType<"Qint8", "qint8">; -def TF_Qint16 : TF_TensorFlowType<"Qint16", "qint16">; -def TF_Qint32 : TF_TensorFlowType<"Qint32", "qint32">; -def TF_Quint8 : TF_TensorFlowType<"Quint8", "quint8">; -def TF_Quint16 : TF_TensorFlowType<"Quint16", "quint16">; +// Quantized types (including corresponding reference types) + +def TF_Qint8 : AnyTypeOf< + [TF_TensorFlowType<"Qint8", "qint8">, TF_Qint8Ref], + "8-bit quantized integer">; +def TF_Qint16 : AnyTypeOf< + [TF_TensorFlowType<"Qint16", "qint16">, TF_Qint16Ref], + "16-bit quantized integer">; +def TF_Qint32 : AnyTypeOf< + [TF_TensorFlowType<"Qint32", "qint32">, TF_Qint32Ref], + "32-bit quantized integer">; +def TF_Quint8 : AnyTypeOf< + [TF_TensorFlowType<"Quint8", "quint8">, TF_Quint8Ref], + "8-bit quantized unsigned integer">; +def TF_Quint16 : AnyTypeOf< + [TF_TensorFlowType<"Quint16", "quint16">, TF_Quint16Ref], + "16-bit quantized unsigned 
integer">; // Any quantized type -def TF_AnyQuantized : AnyTypeOf<[TF_Qint8, TF_Qint16, TF_Qint32, TF_Quint8, - TF_Quint16]>; -//===----------------------------------------------------------------------===// -// Floating-point types - -def TF_F32Or64 : FloatOfWidths<[32, 64]>; - -def TF_F32OrF64Tensor : TensorOf<[TF_F32Or64]>; - -// Any floating-point tensor types -def TF_FpTensor : TensorOf<[AnyFloat]>; +def TF_Quantized : AnyTypeOf< + [TF_Qint8, TF_Qint16, TF_Qint32, TF_Quint8, TF_Quint16], "quantized">; //===----------------------------------------------------------------------===// -// Complex types +// Floating-point types (including corresponding reference types) + +def TF_Float16 : AnyTypeOf<[F16, TF_Float16Ref], "16-bit float">; +def TF_Float32 : AnyTypeOf<[F32, TF_Float32Ref], "32-bit float">; +def TF_Float64 : AnyTypeOf<[F64, TF_Float64Ref], "64-bit float">; +def TF_Bfloat16 : AnyTypeOf<[BF16, TF_Bfloat16Ref], "bfloat16">; + +def TF_F32OrF64 : AnyTypeOf<[TF_Float32, TF_Float64], "32/64-bit float">; + +def TF_Float : AnyTypeOf< + [TF_Float16, TF_Float32, TF_Float64, TF_Bfloat16, + TF_Float16Ref, TF_Float32Ref, TF_Float64Ref, TF_Bfloat16Ref], + "floating-point">; + +// Tensor types +def TF_FloatTensor : TensorOf<[TF_Float]>; +def TF_F32OrF64Tensor : TensorOf<[TF_F32OrF64]>; +def TF_Float16Tensor : TensorOf<[TF_Float16]>; +def TF_Float32Tensor : TensorOf<[TF_Float32]>; +def TF_Float64Tensor : TensorOf<[TF_Float64]>; +def TF_Bfloat16Tensor : TensorOf<[TF_Bfloat16]>; + +//===----------------------------------------------------------------------===// +// Complex types (including corresponding reference types) // TODO(suderman): Remove TF_Complex64 and use a standard ops declaration, along // with the associated cleanup. -def TF_Complex64 : Complex>; -def TF_Complex64Tensor : TensorOf<[TF_Complex64]>; +def TF_Complex64 : AnyTypeOf<[Complex>, TF_Complex64Ref], + "64-bit complex">; +def TF_Complex128 : AnyTypeOf<[Complex>, TF_Complex128Ref], + "128-bit complex">; +def TF_Complex : AnyTypeOf<[TF_Complex64, TF_Complex128], "complex">; -def TF_Complex128 : Complex>; +// Tensor types +def TF_ComplexTensor : TensorOf<[TF_Complex]>; +def TF_Complex64Tensor : TensorOf<[TF_Complex64]>; def TF_Complex128Tensor : TensorOf<[TF_Complex128]>; -def TF_AnyComplex : AnyTypeOf<[TF_Complex64, TF_Complex128], - "64/128-bit complex type">; - -def TF_ComplexTensor : TensorOf<[TF_AnyComplex]>; - //===----------------------------------------------------------------------===// -// String/variant/resource types +// String/variant/resource types (including corresponding reference types) -def TF_Str : TF_TensorFlowType<"String", "string">; +def TF_Str : AnyTypeOf< + [TF_TensorFlowType<"String", "str">, TF_StrRef], "string">; def TF_StrTensor : TensorOf<[TF_Str]>; -def TF_Variant : TF_TensorFlowType<"Variant", "variant">; +def TF_Variant : AnyTypeOf< + [TF_TensorFlowType<"Variant", "var">, TF_VariantRef], "variant">; def TF_VariantTensor : TensorOf<[TF_Variant]>; -def TF_Resource : TF_TensorFlowType<"Resource", "resource">; +def TF_Resource : AnyTypeOf< + [TF_TensorFlowType<"Resource", "res">, TF_ResourceRef], "resource">; def TF_ResourceTensor : TensorOf<[TF_Resource]>; //===----------------------------------------------------------------------===// // Multi-category type constraints -def TF_IntOrF32OrF64Tensor: TensorOf<[TF_Int, TF_F32Or64]>; +def TF_IntOrF32OrF64Tensor: TensorOf<[TF_Int, TF_F32OrF64]>; +def TF_FpOrI32OrI64Tensor : TensorOf<[TF_Float, TF_I32OrI64]>; +def TF_IntOrFpTensor : TensorOf<[TF_Int, 
TF_Float]>; +def TF_SintOrFpTensor : TensorOf<[TF_SInt, TF_Float]>; +def TF_FpOrComplexTensor : TensorOf<[TF_Float, TF_Complex]>; -def TF_FpOrI32OrI64Tensor : TensorOf<[AnyFloat, TF_I32Or64]>; +def TF_Number : AnyTypeOf< + [TF_Int, TF_Float, TF_Quantized, TF_Complex], "number">; +def TF_NumberTensor : TensorOf<[TF_Number]>; -// Any integer or floating-point tensor types -def TF_IntOrFpTensor : TensorOf<[TF_Int, AnyFloat]>; +def TF_NumberNotQuantizedOrStr : + AnyTypeOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Str]>; +def TF_NumberNotQuantizedOrStrTensor : TensorOf<[TF_NumberNotQuantizedOrStr]>; -def TF_SintOrFpTensor : TensorOf<[TF_SInt, AnyFloat]>; +//===----------------------------------------------------------------------===// +// Tensor and tensor element types -def TF_FpOrComplexTensor : TensorOf<[AnyFloat, TF_AnyComplex]>; +// Any tensor element type allowed in TensorFlow ops +// (see https://www.tensorflow.org/api_docs/python/tf/dtypes/DType) +def TF_ElementType : Type, + "tf.dtype">; -def TF_AnyNumber : AnyTypeOf<[TF_Int, AnyFloat, TF_AnyQuantized, TF_AnyComplex], - "number">; - -def TF_NumberTensor : TensorOf<[TF_AnyNumber]>; - -def TF_NumberOrStr : AnyTypeOf<[AnyFloat, TF_SInt, TF_AnyComplex, TF_Uint8, TF_Str]>; -def TF_NumberOrStrTensor : TensorOf<[TF_NumberOrStr]>; +// Any TensorFlow tensor type +def TF_Tensor : TensorOf<[TF_ElementType]>; //===----------------------------------------------------------------------===// // TensorFlow attribute definitions @@ -423,7 +532,7 @@ class TF_DerivedResultShapeListAttr : DerivedAttr< // A derived attribute that returns the shape of the first result type. def TF_DerivedResultShapeAttr : DerivedAttr<"ShapedType", "return (*getOperation()->result_type_begin()).cast();", - [{ TypeAttr::get($_self) }]>; + [{ mlir::TF::ShapeAttr::get($_ctx, $_self) }]>; // A derived attribute that returns the element type of the tensor held by a // named resource-type operand or result. @@ -443,7 +552,7 @@ class TF_DerivedOperandOrResultHandleShapeAttr : DerivedAttr< " .cast();\n" "assert(!resource_type.getSubtypes().empty() && \"unknown shape\");\n" "return resource_type.getSubtypes().begin()->cast();", - [{ TypeAttr::get($_self) }]>; + [{ mlir::TF::ShapeAttr::get($_ctx, $_self) }]>; def TF_IntTypeAttr : TypeAttrBase<"IntegerType", "integer type"> { let returnType = "Type"; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h index ec1f748367d..3a6a9336a24 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h @@ -15,14 +15,121 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ + +#include + +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" +#include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { + +//===----------------------------------------------------------------------===// +// TensorFlow Contraction Fusion. 
+//===----------------------------------------------------------------------===// + +struct ContractionFusion { + explicit ContractionFusion( + StringRef output_kernel, ArrayRef additional_arguments = {}, + ArrayRef additional_attributes = {}) + : output_kernel(output_kernel.str()), + additional_arguments(additional_arguments.begin(), + additional_arguments.end()), + additional_attributes(additional_attributes.begin(), + additional_attributes.end()) {} + + // Name of the output kernel implementing the contraction fusion. + std::string output_kernel; + + // Indices of additional arguments that will be forwarded to the fused + // operation (e.g. forward bias vector if fusing BiasAdd operation). + SmallVector additional_arguments; + + // Add additional attributes to the fused node. + SmallVector additional_attributes; +}; + +//===----------------------------------------------------------------------===// +// TensorFlow Resource Handles. +//===----------------------------------------------------------------------===// + +inline bool IsResourceHandleAnonymous(StringRef name) { + return name == ::tensorflow::ResourceHandle::ANONYMOUS_NAME; +} + +// Helper struct representing an identifier for a resource handle. For resource +// handles created explicitly and shared across resource allocator ops, +// `container`, `name`, and `device` can be set. If an resource handle is tied +// to an instance of an operation (e.g. TensorFlow runtime operation caching), +// `op` can be set instead. +struct ResourceHandle { + ResourceHandle(StringRef container, StringRef name, StringRef device, + Operation* op) + : container(container), name(name), device(device), op(op) {} + + bool operator==(const ResourceHandle& rhs) const { + return container == rhs.container && name == rhs.name && + device == rhs.device && op == rhs.op; + } + + // Make ResourceHandle hashable. + friend ::llvm::hash_code hash_value(const ResourceHandle& resource_handle); + + std::string container; + std::string name; + std::string device; + Operation* op = nullptr; +}; + +// Make ResourceHandle hashable. +inline ::llvm::hash_code hash_value(const ResourceHandle& resource_handle) { + return ::llvm::hash_combine(resource_handle.container, resource_handle.name, + resource_handle.device, resource_handle.op); +} + +// Helper struct holding a resource handle value and unique id associated to the +// resource handle. 
+struct ResourceHandleValueAndId { + ResourceHandleValueAndId(Value value, int64_t id) : value(value), id(id) {} + + Value value; + int64_t id = -1; +}; + #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h.inc" } // namespace TF } // namespace mlir +namespace llvm { +template <> +struct DenseMapInfo { + static mlir::TF::ResourceHandle getEmptyKey() { + return {/*container=*/"", /*name=*/"", /*device=*/"", + /*op=*/DenseMapInfo::getEmptyKey()}; + } + + static mlir::TF::ResourceHandle getTombstoneKey() { + return {/*container=*/"", /*name=*/"", /*device=*/"", + /*op=*/DenseMapInfo::getTombstoneKey()}; + } + + static unsigned getHashValue( + const mlir::TF::ResourceHandle& resource_handle) { + return mlir::TF::hash_value(resource_handle); + } + + static bool isEqual(const mlir::TF::ResourceHandle& lhs, + const mlir::TF::ResourceHandle& rhs) { + return lhs == rhs; + } +}; +} // namespace llvm + #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td index 3743bdda043..1ed30c89a77 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td @@ -21,7 +21,7 @@ limitations under the License. include "mlir/IR/OpBase.td" //===----------------------------------------------------------------------===// -// TensorFlow interfaces +// TensorFlow Layout Optimization Interfaces. //===----------------------------------------------------------------------===// def TF_LayoutSensitiveInterface : OpInterface<"LayoutSensitiveInterface"> { @@ -104,4 +104,48 @@ def TF_FoldOperandsTransposeInterface : OpInterface<"FoldOperandsTransposeInterf }]; } +//===----------------------------------------------------------------------===// +// TensorFlow Contraction Fusion Interfaces. +//===----------------------------------------------------------------------===// + +def TF_ContractionFusableInterface : OpInterface<"ContractionFusableInterface"> { + let description = [{ + A contraction fusable operation is one that can be fused into the output of + a tensor contraction (MatMul, Conv2D, etc...) operation. + + For example all element wise operations are trivially contraction fusable. + }]; + + let methods = [ + InterfaceMethod< + [{Returns contraction fusion if the operation satisfies all the fusion + requirements. Otherwise returns empty optional.}], + "Optional", "GetContractionFusion", (ins) + >, + ]; +} + +//===----------------------------------------------------------------------===// +// TensorFlow Resource Handle Interfaces. +//===----------------------------------------------------------------------===// + +def TF_ResourceHandleAllocatorInterface : OpInterface<"ResourceHandleAllocatorInterface"> { + let description = [{ + A resource handle allocator operation is one that creates a resource handle, + or looks up and reuses an existing resource handle. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/[{Returns the resource handle value and unique id associated with + the resource handle. 
If a resource handle is reused, then an + existing id will be returned.}], + /*retTy=*/"ResourceHandleValueAndId", + /*methodName=*/"GetResourceHandleValueAndId", + /*args=*/(ins "llvm::SmallDenseMap&":$resource_handle_id_map, + "int64_t&":$next_id) + >, + ]; +} + #endif // TF_OP_INTERFACES diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 737442d5f8c..634004038d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -233,6 +233,10 @@ TensorFlowDialect::TensorFlowDialect(MLIRContext *context) #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.cc.inc" >(); + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc.inc" + >(); addTypes< #define HANDLE_TF_TYPE(tftype, enumerant, name) tftype##Type, #define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 3169f7fba8d..9ebd59007e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h" namespace mlir { namespace TF { @@ -112,8 +113,7 @@ class TensorFlowDialect : public Dialect { // same interface. template void addOperations() { - (void)std::initializer_list{ - 0, (addOperation(AbstractOperation::get(*this)), 0)...}; + Dialect::addOperations(); } using ConstantFoldHook = LogicalResult (*)(Operation *, ArrayRef, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index db0a97d4b96..c814153eb43 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -116,6 +116,24 @@ An n-way switch statement, implementing the following: let verifier = [{ return Verify(*this); }]; + + + let extraClassDeclaration = [{ + int num_branches() { return branches().size(); } + + // Gets function corresponding branch # `index`. + FuncOp branch_function(int index) { + auto flat_sym_ref = branches()[index].cast(); + return SymbolTable::lookupNearestSymbolFrom(*this, flat_sym_ref); + } + + // Gets all branch functions. 
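+    // Resolves each entry in `branches()` through branch_function(index) and
+    // appends the resulting FuncOps to `functions`, preserving branch order.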
+ void get_branch_functions(SmallVectorImpl &functions) { + functions.reserve(num_branches()); + for (int idx : llvm::seq(0, num_branches())) + functions.push_back(branch_function(idx)); + } + }]; } def TF_CaseRegionOp : TF_Op<"CaseRegion", @@ -160,6 +178,9 @@ An n-way switch statement, implementing the following: let verifier = [{ return Verify(*this); }]; + + let hasCanonicalizer = 1; + } // In MLIR, the TensorFlow tensor value is represented as an ElementsAttr, with @@ -206,12 +227,12 @@ source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: }]; let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, I32Tensor:$source_target_pairs ); let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + TensorOf<[TF_Bfloat16, TF_Float16, TF_Float32, TF_Float64, TF_Int16, TF_Int32, TF_Int64, TF_Int8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -231,7 +252,7 @@ element_shape: a shape compatible with that of elements in the list. let arguments = (ins TF_I32OrI64Tensor:$element_shape, - I32Tensor:$max_num_elements + TF_Int32Tensor:$max_num_elements ); } @@ -305,12 +326,12 @@ else_branch: A function that takes 'inputs' and returns a list of let extraClassDeclaration = [{ // Get the then branch function. - FuncOp then_func() { + FuncOp then_function() { return SymbolTable::lookupNearestSymbolFrom(*this, then_branch()); } // Get the else branch function. - FuncOp else_func() { + FuncOp else_function() { return SymbolTable::lookupNearestSymbolFrom(*this, else_branch()); } }]; @@ -369,6 +390,12 @@ else_branch: A region that computes the outputs of the op if cond = false. 
return Verify(*this); }]; + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, TypeRange resultTypes, ValueRange operands, llvm::ArrayRef<::mlir::NamedAttribute> attributes, unsigned numRegions", [{ + assert(numRegions == 2u && "mismatched number of regions"); + build(builder, result, resultTypes, operands, attributes); + }]>]; + let hasCanonicalizer = 1; } @@ -424,7 +451,7 @@ def TF_ParseExampleOp : TF_Op<"ParseExample", TF_StrTensor:$names, Variadic:$sparse_keys, Variadic:$dense_keys, - Variadic>:$dense_defaults, + Variadic>:$dense_defaults, TF_ShapeAttrArray:$dense_shapes, I32ElementsAttr:$result_segment_sizes, @@ -432,10 +459,10 @@ def TF_ParseExampleOp : TF_Op<"ParseExample", ); let results = (outs - Variadic:$sparse_indices, // len(sparse_types) - Variadic>:$sparse_values, // len(sparse_types) - Variadic:$sparse_shapes, // len(sparse_types) - Variadic>:$dense_values // len(Tdense) + Variadic:$sparse_indices, // len(sparse_types) + Variadic>:$sparse_values, // len(sparse_types) + Variadic:$sparse_shapes, // len(sparse_types) + Variadic>:$dense_values // len(Tdense) ); TF_DerivedOperandSizeAttr Nsparse = TF_DerivedOperandSizeAttr<2>; @@ -459,7 +486,7 @@ def TF_ParseExampleV2Op : TF_Op<"ParseExampleV2", TF_StrTensor:$sparse_keys, TF_StrTensor:$dense_keys, TF_StrTensor:$ragged_keys, - Variadic>:$dense_defaults, + Variadic>:$dense_defaults, Confined]>:$num_sparse, TF_ShapeAttrArray:$dense_shapes, @@ -467,13 +494,13 @@ def TF_ParseExampleV2Op : TF_Op<"ParseExampleV2", ); let results = (outs - Variadic:$sparse_indices, // len(sparse_types) - Variadic>:$sparse_values, // len(sparse_types) - Variadic:$sparse_shapes, // len(sparse_types) - Variadic>:$dense_values, // len(Tdense) - Variadic>:$ragged_values, // len(ragged_value_types) + Variadic:$sparse_indices, // len(sparse_types) + Variadic>:$sparse_values, // len(sparse_types) + Variadic:$sparse_shapes, // len(sparse_types) + Variadic>:$dense_values, // len(Tdense) + Variadic>:$ragged_values, // len(ragged_value_types) // = len(ragged_split_types) - Variadic>:$ragged_row_splits // len(ragged_split_types) + Variadic>:$ragged_row_splits // len(ragged_split_types) // = len(ragged_value_types) ); @@ -570,36 +597,6 @@ def TF_PlaceholderWithDefaultOp : TF_Op<"PlaceholderWithDefault", [NoSideEffect] DerivedAttr shape = TF_DerivedResultShapeAttr; } -def TF_SparseMatMulOp : TF_Op<"SparseMatMul", [NoSideEffect]> { - let summary = [{ -SparseMatMul is MatMul with hints on the sparseness of the matrices. - }]; - - let description = [{ -Similar to MatMul, with a_is_sparse and b_is_sparse indicating whether a and b -are sparse matrices. - }]; - - let arguments = (ins - TensorOf<[BF16, F32]>:$a, - TensorOf<[BF16, F32]>:$b, - - DefaultValuedAttr:$a_is_sparse, - DefaultValuedAttr:$b_is_sparse, - - DefaultValuedAttr:$transpose_a, - DefaultValuedAttr:$transpose_b - ); - - let results = (outs - TensorOf<[F32]>:$product - ); - - TF_DerivedOperandTypeAttr Ta = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr Tb = TF_DerivedOperandTypeAttr<1>; -} - - def TF_StatefulPartitionedCallOp : TF_Op<"StatefulPartitionedCall", [CallOpInterface]> { let summary = @@ -691,18 +688,18 @@ body: A function that takes a list of tensors and returns another let extraClassDeclaration = [{ // Get the condition function. - FuncOp cond_func() { + FuncOp cond_function() { return SymbolTable::lookupNearestSymbolFrom(*this, cond()); } // Get the body function. 
- FuncOp body_func() { + FuncOp body_function() { return SymbolTable::lookupNearestSymbolFrom(*this, body()); } }]; } -def TL_WhileRegionOp : TF_Op<"WhileRegion", +def TF_WhileRegionOp : TF_Op<"WhileRegion", [DeclareOpInterfaceMethods, SingleBlockImplicitTerminator<"YieldOp">]> { let summary = "while operation"; @@ -765,7 +762,7 @@ element_dtype: the desired type of elements in the list. let arguments = (ins TF_I32OrI64Tensor:$element_shape, - I32Tensor:$num_elements + TF_Int32Tensor:$num_elements ); } @@ -799,7 +796,7 @@ This operation holds the metadata common to operations of a `tpu.replicate()` co let results = (outs); } -def TF_VarHandleOp : TF_Op<"VarHandleOp", []> { +def TF_VarHandleOp : TF_Op<"VarHandleOp", [TF_ResourceHandleAllocatorInterface]> { let summary = "Creates a handle to a Variable resource from its name."; let description = [{ @@ -821,13 +818,20 @@ Example: ); let results = (outs - TF_ResourceTensor:$resource + Res:$resource ); TF_DerivedOperandOrResultHandleTypeAttr dtype = TF_DerivedOperandOrResultHandleTypeAttr<"resource">; TF_DerivedOperandOrResultHandleShapeAttr shape = TF_DerivedOperandOrResultHandleShapeAttr<"resource">; + + let extraClassDeclaration = [{ + // TF_ResourceHandleAllocatorInterface: + ResourceHandleValueAndId GetResourceHandleValueAndId( + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id); + }]; } // Multiple variadic operands with different sizes are not supported by the @@ -986,8 +990,8 @@ Creates a dataset that batches `batch_size` elements from `input_dataset`. let arguments = (ins TF_VariantTensor:$input_dataset, - I64Tensor:$batch_size, - I1Tensor:$drop_remainder, + TF_Int64Tensor:$batch_size, + TF_BoolTensor:$drop_remainder, DefaultValuedAttr:$parallel_copy, Confined]>:$output_types, @@ -1036,9 +1040,9 @@ to `batch_size * num_parallel_batches` copies of `f` in parallel. let arguments = (ins TF_VariantTensor:$input_dataset, Variadic:$other_arguments, - I64Tensor:$batch_size, - I64Tensor:$num_parallel_calls, - I1Tensor:$drop_remainder, + TF_Int64Tensor:$batch_size, + TF_Int64Tensor:$num_parallel_calls, + TF_BoolTensor:$drop_remainder, SymbolRefAttr:$f, Confined]>:$output_types, @@ -1066,7 +1070,7 @@ def TF_ParallelMapDatasetOp : TF_Op<"ParallelMapDataset", [NoSideEffect]> { let arguments = (ins TF_VariantTensor:$input_dataset, Variadic:$other_arguments, - I32Tensor:$num_parallel_calls, + TF_Int32Tensor:$num_parallel_calls, SymbolRefAttr:$f, Confined]>:$output_types, @@ -1148,11 +1152,11 @@ This function is faster and numerically stabler than `bessel_i0(x)`. }]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1169,11 +1173,11 @@ This function is faster and numerically stabler than `bessel_i1(x)`. 
}]; let arguments = (ins - TF_FpTensor:$x + TF_FloatTensor:$x ); let results = (outs - TF_FpTensor:$y + TF_FloatTensor:$y ); TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; @@ -1184,7 +1188,7 @@ def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", [CallOpInterface]> { let arguments = (ins Variadic:$args, - I32Tensor:$device_ordinal, + TF_Int32Tensor:$device_ordinal, SymbolRefAttr:$f, DefaultValuedAttr:$autotuner_thresh @@ -1213,63 +1217,6 @@ def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", [CallOpInterface]> { let verifier = [{ return VerifyPartitionedCall(*this); }]; } -class TF_FusedBatchNormOpBase : TF_Op { - let summary = "Batch normalization."; - - let description = [{ -Note that the size of 4D Tensors are defined by either "NHWC" or "NCHW". -The size of 1D Tensors matches the dimension C of the 4D Tensors. - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32]>:$x, - F32Tensor:$scale, - F32Tensor:$offset, - F32Tensor:$mean, - F32Tensor:$variance, - - DefaultValuedAttr:$epsilon, - DefaultValuedAttr:$exponential_avg_factor, - DefaultValuedAttr:$data_format, - DefaultValuedAttr:$is_training - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; - TF_DerivedOperandTypeAttr U = TF_DerivedOperandTypeAttr<1>; - - let extraClassDeclaration = [{ - // TF_FoldOperandsTransposeInterface: - SmallVector GetLayoutDependentArgs() { return {0}; } - SmallVector GetLayoutDependentResults() { return {0}; } - LogicalResult FoldOperandsPermutation(ArrayRef permutation); - - // TF_LayoutSensitiveInterface: - StringRef GetOptimalLayout(const RuntimeDevices& devices); - LogicalResult UpdateDataFormat(StringRef data_format); - }]; -} - -def TF_FusedBatchNormV2Op : TF_FusedBatchNormOpBase<"FusedBatchNormV2"> { - let results = (outs - TensorOf<[BF16, F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2 - ); -} - -def TF_FusedBatchNormV3Op : TF_FusedBatchNormOpBase<"FusedBatchNormV3"> { - let results = (outs - TensorOf<[BF16, F16, F32]>:$y, - F32Tensor:$batch_mean, - F32Tensor:$batch_variance, - F32Tensor:$reserve_space_1, - F32Tensor:$reserve_space_2, - F32Tensor:$reserve_space_3 - ); -} - def TF_BatchFunctionOp : TF_Op<"BatchFunction", [AttrSizedOperandSegments]> { let summary = [{ Batches all the inputs tensors to the computation done by the function. @@ -1341,4 +1288,649 @@ must be a Tensor or a list/tuple of Tensors. TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; } +def TF_AddV2Op : TF_Op<"AddV2", [Commutative, NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns x + y element-wise."; + + let description = [{ +*NOTE*: `Add` supports broadcasting. `AddN` does not. 
More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Uint32]>:$x, + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Uint32]>:$y + ); + + let results = (outs + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint8, TF_Uint32]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; + + let hasFolder = 1; +} + +def TF_DivNoNanOp : TF_Op<"DivNoNan", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns 0 if the denominator is zero."; + + let description = [{ +*NOTE*: `DivNoNan` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Complex]>:$x, + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Complex]>:$y + ); + + let results = (outs + TensorOf<[TF_Float16, TF_Float32, TF_Float64, TF_Complex]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_MaximumOp : TF_Op<"Maximum", [NoSideEffect, ResultsBroadcastableShape, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns the max of x and y (i.e. x > y ? x : y) element-wise."; + + let description = [{ +*NOTE*: `Maximum` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float, TF_Int16, TF_Int32, TF_Int64, TF_Uint8]>:$x, + TensorOf<[TF_Float, TF_Int16, TF_Int32, TF_Int64, TF_Uint8]>:$y + ); + + let results = (outs + TensorOf<[TF_Float, TF_Int16, TF_Int32, TF_Int64, TF_Uint8]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_RealDivOp : TF_Op<"RealDiv", [NoSideEffect, ResultsBroadcastableShape, TF_CwiseBinary]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns x / y element-wise for real types."; + + let description = [{ +If `x` and `y` are reals, this will return the floating-point division. + +*NOTE*: `Div` supports broadcasting. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + }]; + + let arguments = (ins + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint16, TF_Uint8]>:$x, + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint16, TF_Uint8]>:$y + ); + + let results = (outs + TensorOf<[TF_Float, TF_SInt, TF_Complex, TF_Uint16, TF_Uint8]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; + + let hasFolder = 1; +} + +def TF_AddOp : TF_Op<"Add", [NoSideEffect, ResultsBroadcastableShape, TF_LayoutAgnostic, TF_SameOperandsAndResultElementTypeResolveRef]>, + WithBroadcastableBinOpBuilder { + let summary = "Returns x + y element-wise."; + + let description = [{ +*NOTE*: `Add` supports broadcasting. `AddN` does not. More about broadcasting +[here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html) + +Given two input tensors, the `tf.add` operation computes the sum for every element in the tensor. + +Both input and output have a range `(-inf, inf)`. 
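+For example, broadcasting lets a rank-1 tensor and a scalar be added directly:
+`[1, 2] + 3` produces `[4, 5]`.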
+ }]; + + let arguments = (ins + TensorOf<[TF_NumberNotQuantizedOrStr]>:$x, + TensorOf<[TF_NumberNotQuantizedOrStr]>:$y + ); + + let results = (outs + TensorOf<[TF_NumberNotQuantizedOrStr]>:$z + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + + let hasCanonicalizer = 1; +} + +def TF_StatefulStandardNormalV2Op : TF_Op<"StatefulStandardNormalV2", []> { + let summary = "Outputs random values from a normal distribution."; + + let description = [{ +The generated values will have mean 0 and standard deviation 1. + }]; + + let arguments = (ins + Arg:$resource, + TF_Int64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_FloatTensor:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatefulTruncatedNormalOp : TF_Op<"StatefulTruncatedNormal", []> { + let summary = "Outputs random values from a truncated normal distribution."; + + let description = [{ +The generated values follow a normal distribution with mean 0 and standard +deviation 1, except that values whose magnitude is more than 2 standard +deviations from the mean are dropped and re-picked. + }]; + + let arguments = (ins + Arg:$resource, + TF_Int64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_FloatTensor:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatefulUniformOp : TF_Op<"StatefulUniform", []> { + let summary = "Outputs random values from a uniform distribution."; + + let description = [{ +The generated values follow a uniform distribution in the range `[0, 1)`. The +lower bound 0 is included in the range, while the upper bound 1 is excluded. + }]; + + let arguments = (ins + Arg:$resource, + TF_Int64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TF_FloatTensor:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +def TF_StatefulUniformFullIntOp : TF_Op<"StatefulUniformFullInt", []> { + let summary = "Outputs random integers from a uniform distribution."; + + let description = [{ +The generated values are uniform integers covering the whole range of `dtype`. + }]; + + let arguments = (ins + Arg:$resource, + TF_Int64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape + ); + + let results = (outs + TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedResultTypeAttr dtype = TF_DerivedResultTypeAttr<0>; +} + +// TODO(lyandy): Investigate supported dtypes (`minval`, `maxval`, `output`) for +// `tf.StatefulUniformInt`. tf2xla kernels support i32, i64, ui32, and ui64 +// while TensorFlow CPU/GPU kernels only support i32 and i64. +def TF_StatefulUniformIntOp : TF_Op<"StatefulUniformInt", []> { + let summary = "Outputs random integers from a uniform distribution."; + + let description = [{ +The generated values are uniform integers in the range `[minval, maxval)`. +The lower bound `minval` is included in the range, while the upper bound +`maxval` is excluded. + +The random integers are slightly biased unless `maxval - minval` is an exact +power of two. The bias is small for values of `maxval - minval` significantly +smaller than the range of the output (either `2^32` or `2^64`). 
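+For example, with `minval = 0` and `maxval = 3`, a 64-bit generator produces
+2^64 raw values that cannot be split into three equally sized buckets, so one
+output is marginally more likely than the others; the bias is on the order of
+2^-64.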
+ }]; + + let arguments = (ins + Arg:$resource, + TF_Int64Tensor:$algorithm, + TF_I32OrI64Tensor:$shape, + TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$minval, + TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$maxval + ); + + let results = (outs + TensorOf<[TF_Int32, TF_Int64, TF_Uint32, TF_Uint64]>:$output + ); + + TF_DerivedOperandTypeAttr shape_dtype = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandTypeAttr dtype = TF_DerivedOperandTypeAttr<3>; +} + +def TF_CloseSummaryWriterOp : TF_Op<"CloseSummaryWriter", []> { + let summary = "Flushes and closes the summary writer."; + + let description = [{ +Also removes it from the resource manager. To reopen, use another +CreateSummaryFileWriter op. + +writer: A handle to the summary writer resource. + }]; + + let arguments = (ins + Arg:$writer + ); + + let results = (outs); +} + +// TODO(b/168035831): Model db_uri read/write. +def TF_CreateSummaryDbWriterOp : TF_Op<"CreateSummaryDbWriter", []> { + let summary = "Creates summary database writer accessible by given resource handle."; + + let description = [{ +This can be used to write tensors from the execution graph directly +to a database. Only SQLite is supported right now. This function +will create the schema if it doesn't exist. Entries in the Users, +Experiments, and Runs tables will be created automatically if they +don't already exist. + +writer: Handle to SummaryWriter resource to overwrite. +db_uri: For example "file:/tmp/foo.sqlite". +experiment_name: Can't contain ASCII control characters or <>. Case + sensitive. If empty, then the Run will not be associated with any + Experiment. +run_name: Can't contain ASCII control characters or <>. Case sensitive. + If empty, then each Tag will not be associated with any Run. +user_name: Must be valid as both a DNS label and Linux username. If + empty, then the Experiment will not be associated with any User. + }]; + + let arguments = (ins + Arg:$writer, + TF_StrTensor:$db_uri, + TF_StrTensor:$experiment_name, + TF_StrTensor:$run_name, + TF_StrTensor:$user_name + ); + + let results = (outs); +} + +// TODO(b/168035831): Model logdir read/write. +def TF_CreateSummaryFileWriterOp : TF_Op<"CreateSummaryFileWriter", []> { + let summary = "Creates a summary file writer accessible by the given resource handle."; + + let description = [{ +writer: A handle to the summary writer resource +logdir: Directory where the event file will be written. +max_queue: Size of the queue of pending events and summaries. +flush_millis: How often, in milliseconds, to flush the pending events and + summaries to disk. +filename_suffix: Every event file's name is suffixed with this suffix. + }]; + + let arguments = (ins + Arg:$writer, + TF_StrTensor:$logdir, + TF_Int32Tensor:$max_queue, + TF_Int32Tensor:$flush_millis, + TF_StrTensor:$filename_suffix + ); + + let results = (outs); +} + +def TF_FlushSummaryWriterOp : TF_Op<"FlushSummaryWriter", []> { + let summary = "Flushes the writer's unwritten events."; + + let description = [{ +writer: A handle to the summary writer resource. + }]; + + let arguments = (ins + Arg:$writer + ); + + let results = (outs); +} + +def TF_ImportEventOp : TF_Op<"ImportEvent", []> { + let summary = "Outputs a `tf.Event` protocol buffer."; + + let description = [{ +When CreateSummaryDbWriter is being used, this op can be useful for +importing data from event logs. + +writer: A handle to a summary writer. +event: A string containing a binary-encoded tf.Event proto. 
+ }]; + + let arguments = (ins + Arg:$writer, + TF_StrTensor:$event + ); + + let results = (outs); +} + +def TF_SummaryWriterOp : TF_Op<"SummaryWriter", [TF_ResourceHandleAllocatorInterface]> { + let summary = "Returns a handle to be used to access a summary writer."; + + let description = [{ +The summary writer is an in-graph resource which can be used by ops to write +summaries to event files. + +writer: the summary writer resource. Scalar handle. + }]; + + let arguments = (ins + StrAttr:$shared_name, + StrAttr:$container + ); + + let results = (outs + Res:$writer + ); + + let extraClassDeclaration = [{ + // TF_ResourceHandleAllocatorInterface: + ResourceHandleValueAndId GetResourceHandleValueAndId( + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id); + }]; +} + +def TF_WriteAudioSummaryOp : TF_Op<"WriteAudioSummary", []> { + let summary = "Writes a `Summary` protocol buffer with audio."; + + let description = [{ +The summary has up to `max_outputs` summary values containing audio. The +audio is built from `tensor` which must be 3-D with shape `[batch_size, +frames, channels]` or 2-D with shape `[batch_size, frames]`. The values are +assumed to be in the range of `[-1.0, 1.0]` with a sample rate of `sample_rate`. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_outputs` is 1, the summary value tag is '*tag*/audio'. +* If `max_outputs` is greater than 1, the summary value tags are + generated sequentially as '*tag*/audio/0', '*tag*/audio/1', etc. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 2-D of shape `[batch_size, frames]`. +sample_rate: The sample rate of the signal in hertz. +max_outputs: Max number of batch elements to generate audio for. + }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_StrTensor:$tag, + TF_Float32Tensor:$tensor, + TF_Float32Tensor:$sample_rate, + + Confined, [IntMinValue<1>]>:$max_outputs + ); + + let results = (outs); +} + +def TF_WriteGraphSummaryOp : TF_Op<"WriteGraphSummary", []> { + let summary = "Writes a `GraphDef` protocol buffer to a `SummaryWriter`."; + + let description = [{ +writer: Handle of `SummaryWriter`. +step: The step to write the summary for. +tensor: A scalar string of the serialized tf.GraphDef proto. + }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_StrTensor:$tensor + ); + + let results = (outs); +} + +def TF_WriteHistogramSummaryOp : TF_Op<"WriteHistogramSummary", []> { + let summary = "Writes a histogram summary."; + + let description = [{ +The generated +[`Summary`](https://www.tensorflow.org/code/tensorflow/core/framework/summary.proto) +has one summary value containing a histogram for `values`. + +This op reports an `InvalidArgument` error if any value is not finite. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Scalar. Tag to use for the `Summary.Value`. +values: Any shape. Values to use to build the histogram. 
+ }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_StrTensor:$tag, + TF_IntOrFpTensor:$values + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_WriteImageSummaryOp : TF_Op<"WriteImageSummary", []> { + let summary = "Writes a `Summary` protocol buffer with images."; + + let description = [{ +The summary has up to `max_images` summary values containing images. The +images are built from `tensor` which must be 4-D with shape `[batch_size, +height, width, channels]` and where `channels` can be: + +* 1: `tensor` is interpreted as Grayscale. +* 3: `tensor` is interpreted as RGB. +* 4: `tensor` is interpreted as RGBA. + +The images have the same number of channels as the input tensor. For float +input, the values are normalized one image at a time to fit in the range +`[0, 255]`. `uint8` values are unchanged. The op uses two different +normalization algorithms: + +* If the input values are all positive, they are rescaled so the largest one + is 255. + +* If any input value is negative, the values are shifted so input value 0.0 + is at 127. They are then rescaled so that either the smallest value is 0, + or the largest one is 255. + +The `tag` argument is a scalar `Tensor` of type `string`. It is used to +build the `tag` of the summary values: + +* If `max_images` is 1, the summary value tag is '*tag*/image'. +* If `max_images` is greater than 1, the summary value tags are + generated sequentially as '*tag*/image/0', '*tag*/image/1', etc. + +The `bad_color` argument is the color to use in the generated images for +non-finite input values. It is a `unit8` 1-D tensor of length `channels`. +Each element must be in the range `[0, 255]` (It represents the value of a +pixel in the output image). Non-finite values in the input tensor are +replaced by this tensor in the output image. The default value is the color +red. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Scalar. Used to build the `tag` attribute of the summary values. +tensor: 4-D of shape `[batch_size, height, width, channels]` where + `channels` is 1, 3, or 4. +max_images: Max number of batch elements to generate images for. +bad_color: Color to use for pixels with non-finite values. + }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_StrTensor:$tag, + TensorOf<[TF_Float16, TF_Float32, TF_Uint8]>:$tensor, + TF_Uint8Tensor:$bad_color, + + Confined, [IntMinValue<1>]>:$max_images + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_WriteRawProtoSummaryOp : TF_Op<"WriteRawProtoSummary", []> { + let summary = "Writes a `Summary` protocol buffer with serialized string `Summary` protocol buffers."; + + let description = [{ +writer: A handle to a summary writer. +step: The step to write the summary for. +tensor: A tensor holding one or more serialized `Summary` protobufs to write. + }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_StrTensor:$tensor + ); + + let results = (outs); +} + +def TF_WriteScalarSummaryOp : TF_Op<"WriteScalarSummary", []> { + let summary = "Writes a `Summary` protocol buffer with scalar values."; + + let description = [{ +The input `tag` and `value` must have the scalars. + +writer: A handle to a summary writer. +step: The step to write the summary for. +tag: Tag for the summary. +value: Value for the summary. 
+ }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_StrTensor:$tag, + TF_IntOrFpTensor:$value + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<3>; +} + +def TF_WriteSummaryOp : TF_Op<"WriteSummary", []> { + let summary = "Outputs a `Summary` protocol buffer with a tensor."; + + let description = [{ +writer: A handle to a summary writer. +step: The step to write the summary for. +tensor: A tensor to serialize. +tag: The summary's tag. +summary_metadata: Serialized SummaryMetadata protocol buffer containing + plugin-related metadata for this summary. + }]; + + let arguments = (ins + Arg:$writer, + TF_Int64Tensor:$step, + TF_Tensor:$tensor, + TF_StrTensor:$tag, + TF_StrTensor:$summary_metadata + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<2>; +} + +def TF_InitializeTableFromDatasetOp : TF_Op<"InitializeTableFromDataset", []> { + let summary = ""; + + let arguments = (ins + Arg:$table_handle, + TF_VariantTensor:$dataset + ); + + let results = (outs); +} + +// TODO(b/168035831): Model filename read. +def TF_InitializeTableFromTextFileV2Op : TF_Op<"InitializeTableFromTextFileV2", []> { + let summary = "Initializes a table from a text file."; + + let description = [{ +It inserts one key-value pair into the table for each line of the file. +The key and value is extracted from the whole line content, elements from the +split line based on `delimiter` or the line number (starting from zero). +Where to extract the key and value from a line is specified by `key_index` and +`value_index`. + +- A value of -1 means use the line number(starting from zero), expects `int64`. +- A value of -2 means use the whole line content, expects `string`. +- A value >= 0 means use the index (starting at zero) of the split line based + on `delimiter`. + }]; + + let arguments = (ins + Arg:$table_handle, + TF_StrTensor:$filename, + + Confined]>:$key_index, + Confined]>:$value_index, + Confined, [IntMinValue<-1>]>:$vocab_size, + DefaultValuedAttr:$delimiter + ); + + let results = (outs); +} + +// TODO(b/168035831): Model filename read. +def TF_CacheDatasetV2Op : TF_Op<"CacheDatasetV2", []> { + let summary = ""; + + let arguments = (ins + TF_VariantTensor:$input_dataset, + TF_StrTensor:$filename, + Arg:$cache, + + Confined]>:$output_types, + Confined]>:$output_shapes + ); + + let results = (outs + TF_VariantTensor:$handle + ); +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index b465c1da68c..8bbc6a843e5 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -176,6 +176,72 @@ static LogicalResult Verify(BatchMatMulV2Op op) { if (!HasRankAtLeast(op.y(), 2)) { return op.emitOpError("requires rhs operand to have rank at least two"); } + + RankedTensorType x_ty = GetRankedTensorTypeForOperand(op.x()); + RankedTensorType y_ty = GetRankedTensorTypeForOperand(op.y()); + + if (!x_ty || !y_ty) return success(); + + ArrayRef x_shape = x_ty.getShape(); + ArrayRef y_shape = y_ty.getShape(); + + // Check broadcast compatibility if both input shapes are known. + // + // The last two dimensions are non-batch dimensions that don't need to + // participate in batch dimension compatibility check. 
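+  // Illustrative example (added for exposition; the shapes are hypothetical):
+  // for x : tensor<2x1x5x3x7xf32> and y : tensor<4x5x7x9xf32>, the batch
+  // prefixes [2, 1, 5] and [4, 5] broadcast to [2, 4, 5], so with
+  // adj_x = adj_y = false the checks below expect an output of shape
+  // [2, 4, 5, 3, 9] (rows taken from x, columns taken from y).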
+ + llvm::SmallVector result_batch_shape; + if (!OpTrait::util::getBroadcastedShape( + x_shape.drop_back(2), y_shape.drop_back(2), result_batch_shape)) + return op.emitOpError() + << "found incompatible broadcast batch dimensions for lhs shape " + << x_ty << " and rhs shape " << y_ty; + + RankedTensorType output_ty = GetRankedTensorTypeForOperand(op.output()); + if (!output_ty) return success(); + + int64_t expected_output_rank = std::max(x_ty.getRank(), y_ty.getRank()); + if (output_ty.getRank() != expected_output_rank) + return op.emitOpError() + << "found invalid output rank, expected " << expected_output_rank + << " but got " << output_ty.getRank(); + + // Check output batch dim with potential broadcasting. + ArrayRef output_shape = output_ty.getShape(); + for (int i = 0; i < result_batch_shape.size(); ++i) { + if (output_shape[i] != ShapedType::kDynamicSize && + output_shape[i] != result_batch_shape[i]) + return op.emitOpError() + << "has mismatching input batch dimension " + << result_batch_shape[i] << " and output batch dimension " + << output_shape[i]; + } + + // Check output shape for non-batch dimension, following documentation below. + // https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-mat-mul + int64_t x_row_dim = x_shape[x_shape.size() - 2]; + int64_t x_col_dim = x_shape[x_shape.size() - 1]; + int64_t y_row_dim = y_shape[y_shape.size() - 2]; + int64_t y_col_dim = y_shape[y_shape.size() - 1]; + int64_t out_row_dim = output_shape[output_shape.size() - 2]; + int64_t out_col_dim = output_shape[output_shape.size() - 1]; + + int64_t expected_out_row_dim = op.adj_x() ? x_col_dim : x_row_dim; + int64_t expected_out_col_dim = op.adj_y() ? y_row_dim : y_col_dim; + + if (expected_out_row_dim != ShapedType::kDynamicSize && + out_row_dim != ShapedType::kDynamicSize && + out_row_dim != expected_out_row_dim) + return op.emitOpError() + << "found invalid output dimension on row, expected " + << expected_out_row_dim << " but got " << out_row_dim; + if (expected_out_col_dim != ShapedType::kDynamicSize && + out_col_dim != ShapedType::kDynamicSize && + out_col_dim != expected_out_col_dim) + return op.emitOpError() + << "found invalid output dimension on col, expected " + << expected_out_col_dim << " but got " << out_col_dim; + return success(); } @@ -190,7 +256,7 @@ void BatchMatMulV2Op::getCanonicalizationPatterns( static LogicalResult Verify(BatchToSpaceOp op) { // Op already has a constraint that block_size >= 2. - int64_t block_size = op.block_size().getSExtValue(); + int64_t block_size = op.block_size(); llvm::SmallVector input_shape(4, ShapedType::kDynamicSize); auto input_type = op.input().getType().cast(); @@ -381,6 +447,13 @@ static LogicalResult Verify(BiasAddOp op) { return success(); } +Optional BiasAddOp::GetContractionFusion() { + // Only NHWC in f32 is supported for fusion. 
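+  // Reading of the descriptor returned below (hedged; not spelled out in this
+  // patch): it names the op being fused ("BiasAdd") and lists the operand
+  // indices of that op which become additional arguments of the fused
+  // contraction; index 1 is the bias operand of tf.BiasAdd(value, bias).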
+ if (data_format() != "NHWC" || !T().isF32()) return None; + + return ContractionFusion("BiasAdd", /*additional_arguments=*/{1}); +} + //===----------------------------------------------------------------------===// // BiasAddGradOp //===----------------------------------------------------------------------===// @@ -473,8 +546,7 @@ LogicalResult FoldConstantCaseOp::matchAndRewrite( if (!matchPattern(op.branch_index(), m_Constant(&branch))) return failure(); int index = *branch.getValues().begin(); - if (index < 0 || index >= op.branches().size()) - index = op.branches().size() - 1; + if (index < 0 || index >= op.num_branches()) index = op.num_branches() - 1; auto func = op.branches()[index].cast(); auto empty = rewriter.getStringAttr(""); @@ -507,8 +579,9 @@ static LogicalResult VerifyCaseOrIfOpBranchFunctions( // Functions have one less operand compared to op as first operand is elided // (`cond` of `tf.If` and `branch_index` of `tf.Case`). - int expected_num_inputs = op->getNumOperands() - 1; - int expected_num_results = op->getNumResults(); + TypeRangeWithDesc input{op->getOperands().drop_front().getTypes(), "input"}; + TypeRangeWithDesc result{op->getResultTypes(), "result"}; + for (auto branch : llvm::enumerate(branches)) { auto branch_func = SymbolTable::lookupNearestSymbolFrom( op, branch.value().cast()); @@ -518,47 +591,22 @@ static LogicalResult VerifyCaseOrIfOpBranchFunctions( << branch.value() << ") to point to a defined function"; FunctionType branch_type = branch_func.getType(); - if (branch_type.getNumInputs() != expected_num_inputs) - return op->emitOpError() - << "expects all branches to have " << expected_num_inputs - << " input(s), but " << branch_name(branch.index()) << " has " - << branch_type.getNumInputs() << " input(s)"; + std::string desc = branch_name(branch.index()) + " input"; + TypeRangeWithDesc branch_input{branch_type.getInputs(), desc}; + if (failed(VerifyTypeRangesAreCompatible(op, branch_input, input))) + return failure(); - if (branch_type.getNumResults() != expected_num_results) - return op->emitOpError() - << "expects all branches to have " << expected_num_results - << " result(s), but " << branch_name(branch.index()) << " has " - << branch_type.getNumResults() << " result(s)"; - - // Non-conditional operands starting with the second operand are passed to - // branches and should be compatible across all branches' inputs. - for (auto operand_type : - llvm::enumerate(llvm::drop_begin(op->getOperandTypes(), 1))) { - Type branch_input_i_type = branch_type.getInput(operand_type.index()); - if (!AreCastCompatible({operand_type.value(), branch_input_i_type})) - return op->emitOpError() - << "expects operand type " << operand_type.value() - << " to be cast compatible with " << branch_name(branch.index()) - << " input type " << branch_input_i_type << " at index " - << operand_type.index(); - } - - // Branches' results should be pair-wise compatible with the op results. 
- for (auto result_type : llvm::enumerate(op->getResultTypes())) { - Type branch_result_i_type = branch_type.getResult(result_type.index()); - if (!AreCastCompatible({result_type.value(), branch_result_i_type})) - return op->emitOpError() - << "expects result type " << result_type.value() - << " to be cast compatible with " << branch_name(branch.index()) - << " result type " << branch_result_i_type << " at index " - << result_type.index(); - } + desc = branch_name(branch.index()) + " result"; + TypeRangeWithDesc branch_result{branch_type.getResults(), desc}; + if (failed(VerifyTypeRangesAreCompatible(op, branch_result, result))) + return failure(); branch_types.push_back(branch_type); } // If branches have incompatible input types that means that no tensor can // serve as input to all the functions. Hence, the op is invalid. + int expected_num_inputs = op->getNumOperands() - 1; for (int i = 0; i < expected_num_inputs; ++i) { SmallVector branch_input_i_types; branch_input_i_types.reserve(branches.size()); @@ -597,16 +645,89 @@ static LogicalResult Verify(CaseRegionOp op) { if (failed(VerifyCaseOpBase(op, op.branch_index()))) return failure(); + TypeRangeWithDesc results{op.getResultTypes(), "result"}; + for (auto region_and_idx : llvm::enumerate(op.branches())) { - std::string region_name = - llvm::formatv("region #{0}", region_and_idx.index()).str(); - if (failed(VerifyRegionResults(op, region_and_idx.value(), region_name))) + std::string description = + llvm::formatv("branch #{0} result", region_and_idx.index()).str(); + Operation *yield = region_and_idx.value().front().getTerminator(); + TypeRangeWithDesc branch_results{yield->getOperandTypes(), description}; + if (failed(VerifyTypeRangesAreCompatible(op, branch_results, results))) return failure(); } return success(); } +namespace { +// Eliminate values that pass through the CaseRegionOp or IfRegionOp branches. +template +class CaseOrIfRegionEliminatePassThrough + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(CaseOrIfRegionOp op, + PatternRewriter &rewriter) const override { + RegionRange branches = op.getRegions(); + SmallVector new_result_types; + // Maps pass through results to extern values. + llvm::SmallDenseMap result_to_extern_value; + + for (auto result : op.getResults()) { + unsigned index = result.getResultNumber(); + Region *first_branch = *branches.begin(); + Operation *first_terminator = first_branch->front().getTerminator(); + Value returned_val = first_terminator->getOperand(index); + + // Pass through values would be defined outside the branch region. Keep + // the type of non pass through results to create a new op later, if + // required. + if (returned_val.getParentBlock() == &first_branch->front()) { + new_result_types.push_back(result.getType()); + continue; + } + // Check if the same extern value is returned in each branch. + for (Region *region : branches.drop_front()) { + Operation *terminator = region->front().getTerminator(); + if (terminator->getOperand(index) != returned_val) return failure(); + } + result_to_extern_value[result] = returned_val; + } + + // If no pass through values are found, no change is required. + if (result_to_extern_value.empty()) return failure(); + + // Create new case/if region op. 
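+  // Illustrative effect of this pattern (example IR is hypothetical): if every
+  // branch of a tf.IfRegion/tf.CaseRegion terminates with
+  //   "tf.Yield"(%inner, %outer)
+  // where %outer is defined above the region op, uses of the second result are
+  // rewritten to %outer directly, and the op created below keeps only the
+  // first result type.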
+ auto new_op = rewriter.create( + op.getLoc(), new_result_types, op.getOperand(), op.getAttrs(), + op.getNumRegions()); + + int next_index = 0; + for (auto result : op.getResults()) { + if (!result_to_extern_value.count(result)) { + result.replaceAllUsesWith(new_op.getResult(next_index++)); + continue; + } + result.replaceAllUsesWith(result_to_extern_value[result]); + for (Region *branch : branches) + branch->front().getTerminator()->eraseOperand(next_index); + } + + // Move region bodies to the new op. + for (auto region_index : llvm::seq(0, branches.size())) + new_op.getRegion(region_index).takeBody(op.getRegion(region_index)); + + op.erase(); + return success(); + } +}; +} // namespace + +void CaseRegionOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert>(context); +} + //===----------------------------------------------------------------------===// // CastOp //===----------------------------------------------------------------------===// @@ -1639,7 +1760,7 @@ static LogicalResult Verify(FakeQuantWithMinMaxArgsOp op) { return op.emitOpError("range is invalid: [" + Twine(std::to_string(rmin)) + "," + Twine(std::to_string(rmax)) + "]"); } - int64_t num_bits = op.num_bits().getSExtValue(); + int64_t num_bits = op.num_bits(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); @@ -1659,7 +1780,7 @@ static LogicalResult Verify(FakeQuantWithMinMaxVarsOp op) { if (max && !IsOfRankedFloatTensorType(max, 0)) return op.emitOpError("requires max to be a 0d float tensor"); - int64_t num_bits = op.num_bits().getSExtValue(); + int64_t num_bits = op.num_bits(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); @@ -1683,7 +1804,7 @@ static LogicalResult Verify(FakeQuantWithMinMaxVarsPerChannelOp op) { if (!HasRankAtLeast(inputs, 1)) return op.emitError("requires inputs to be at least 1d float tensor"); - int64_t num_bits = op.num_bits().getSExtValue(); + int64_t num_bits = op.num_bits(); if (num_bits < 2 || num_bits > 16) { return op.emitOpError( "requires num_bits to be between 2 and 16, inclusive"); @@ -1886,7 +2007,7 @@ StringRef FusedBatchNormV3Op::GetOptimalLayout(const RuntimeDevices &devices) { //===----------------------------------------------------------------------===// static LogicalResult Verify(GatherV2Op op) { - int64_t batch_dims = op.batch_dims().getSExtValue(); + int64_t batch_dims = op.batch_dims(); if (auto ty = op.indices().getType().dyn_cast()) { int64_t rank = ty.getRank(); if (batch_dims > rank || batch_dims < -rank) @@ -1992,9 +2113,18 @@ void IfOp::getCanonicalizationPatterns(OwningRewritePatternList &results, //===----------------------------------------------------------------------===// static LogicalResult Verify(IfRegionOp op) { - if (failed(VerifyRegionResults(op, op.then_branch(), "then"))) + TypeRange then_types = + op.then_branch().front().getTerminator()->getOperandTypes(); + TypeRange else_types = + op.else_branch().front().getTerminator()->getOperandTypes(); + + TypeRangeWithDesc results{op.getResultTypes(), "result"}; + TypeRangeWithDesc then_results{then_types, "then result"}; + TypeRangeWithDesc else_results{else_types, "else result"}; + + if (failed(VerifyTypeRangesAreCompatible(op, then_results, results))) return failure(); - if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) + if (failed(VerifyTypeRangesAreCompatible(op, else_results, results))) return 
failure(); return success(); } @@ -2051,7 +2181,8 @@ LogicalResult FoldConstantIfRegionOp::matchAndRewrite( void IfRegionOp::getCanonicalizationPatterns(OwningRewritePatternList &results, MLIRContext *context) { - results.insert(context); + results.insert>(context); } //===----------------------------------------------------------------------===// @@ -2102,6 +2233,15 @@ OpFoldResult LeakyReluOp::fold(ArrayRef operands) { return {}; } +Optional LeakyReluOp::GetContractionFusion() { + // Only f32 is supported for fusion. + if (!T().isF32()) return None; + + NamedAttribute alpha(Identifier::get("alpha", getContext()), alphaAttr()); + return ContractionFusion("LeakyRelu", /*additional_arguments=*/{}, + /*additional_attributes=*/{alpha}); +} + //===----------------------------------------------------------------------===// // LogOp //===----------------------------------------------------------------------===// @@ -2223,12 +2363,12 @@ OpFoldResult MulOp::fold(ArrayRef operands) { return IdentityArithmeticOpFolder(*this, operands); } +} // namespace TF +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc.inc" - -} // namespace TF -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h index 19a927a23d7..8d98632b198 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h @@ -43,6 +43,9 @@ namespace TF { class YieldOp; +} // namespace TF +} // namespace mlir + // TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose // purpose is to catch bug on `tensorflow::mutex_lock`. We don't use // `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and @@ -56,7 +59,4 @@ class YieldOp; #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h.inc" -} // namespace TF -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_A_M_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc index bb7d9a50521..72ca50b5c37 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc @@ -543,27 +543,27 @@ static LogicalResult VerifyReductionInputAndDims(Value input, Value dims, return success(); } -LogicalResult VerifyRegionResults(Operation *op, Region ®ion, - StringRef region_name) { - auto op_name = op->getName().getStringRef(); - // verify that op outputs match yield inputs - YieldOp yield = cast(region.front().getTerminator()); - unsigned expected_num_results = op->getNumResults(); - if (yield.getNumOperands() != expected_num_results) - return op->emitOpError() - << region_name + " should have same number (" << expected_num_results - << ") of results as " << op_name << " but has " - << yield.getNumOperands() << " results"; +// A type range with description (in singular form) attached to it. 
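+// For example (diagnostic wording approximated from the format string below):
+// comparing a "then result" range {tensor<2xf32>} against a "result" range
+// {tensor<3xf32>} would report
+//   then result type 'tensor<2xf32>' is incompatible with result type
+//   'tensor<3xf32>' at index 0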
+using TypeRangeWithDesc = std::pair; - for (int idx : llvm::seq(0, expected_num_results)) { - auto op_result_type = op->getResult(idx).getType().cast(); - auto region_result_type = - yield.getOperand(idx).getType().cast(); - if (!AreCastCompatible({region_result_type, op_result_type})) - return op->emitError(llvm::formatv( - "{0} result type {1} is incompatible with {2} " - "result type {3} at index {4}", - region_name, region_result_type, op_name, op_result_type, idx)); +LogicalResult VerifyTypeRangesAreCompatible(Operation *op, + TypeRangeWithDesc range0, + TypeRangeWithDesc range1) { + if (range0.first.size() != range1.first.size()) { + return op->emitOpError() + << range0.second << "s (size = " << range0.first.size() << ")" + << " should have the same number of values as " << range1.second + << "s (size = " << range1.first.size() << ")"; + } + + for (auto it : llvm::enumerate(llvm::zip(range0.first, range1.first))) { + int index = it.index(); + Type type0 = std::get<0>(it.value()); + Type type1 = std::get<1>(it.value()); + if (!AreCastCompatible({type0, type1})) + return op->emitOpError(llvm::formatv( + "{0} type {1} is incompatible with {2} type {3} at index {4}", + range0.second, type0, range1.second, type1, index)); } return success(); } @@ -587,3 +587,31 @@ struct DropAttributes : public OpRewritePattern { } }; +//===----------------------------------------------------------------------===// +// TF op helper functions for handling resource handles and ids. +//===----------------------------------------------------------------------===// + +// Returns device of op if present. If op has no device set, an empty string ref +// is returned instead. +llvm::StringRef GetDeviceOrEmpty(Operation *op) { + if (auto device_attr = op->getAttrOfType("device")) + return device_attr.getValue(); + return llvm::StringRef(); +} + +// Returns resource handle value and id for resource op based on attributes. If +// a resource handle is anonymous, a new id is always returned. +ResourceHandleValueAndId GetResourceHandleValueAndIdBase( + llvm::StringRef container, llvm::StringRef shared_name, + llvm::StringRef device, Value resource, + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id) { + // Always create a new ID for anonymous handle. + if (IsResourceHandleAnonymous(shared_name)) return {resource, next_id++}; + + ResourceHandle handle(container, shared_name, device, /*op=*/nullptr); + auto emplace_res = resource_handle_id_map.try_emplace(handle, next_id); + // New ID created, increment next_id. + if (emplace_res.second) ++next_id; + return {resource, emplace_res.first->second}; +} diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index cbac03f80f8..b99c99029ed 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -27,6 +27,8 @@ limitations under the License. #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" @@ -34,6 +36,7 @@ limitations under the License. 
#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Twine.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" @@ -109,7 +112,7 @@ void NotEqualOp::build(OpBuilder &builder, OperationState &result, Value x, //===----------------------------------------------------------------------===// static LogicalResult Verify(OneHotOp op) { - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); auto indices_ty = op.indices().getType().dyn_cast(); if (indices_ty && @@ -207,7 +210,7 @@ static LogicalResult Verify(PackOp op) { // the axis value range is [-(R+1), R+1). int64_t range_begin = -inputs_rank - 1; // Inclusive int64_t range_end = inputs_rank + 1; // Exclusive - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis < range_begin || axis >= range_end) { return op.emitError() << "attribute 'axis' should be within range [" << range_begin << ", " << range_end @@ -232,7 +235,7 @@ OpFoldResult PackOp::fold(ArrayRef operands) { if (values().size() < 2) return {}; // Dimensions packed along axis = 0 (pack scalars into vector). - if (axis().getSExtValue() != 0) return {}; + if (axis() != 0) return {}; // First packed value is defined by a strided slice operation. auto slice_op = dyn_cast_or_null(values()[0].getDefiningOp()); @@ -247,11 +250,9 @@ OpFoldResult PackOp::fold(ArrayRef operands) { // All masks are `0` except `shrink_axis_mask` which is equal to `1` (slicing // scalar value from input vector). - if (slice_op.begin_mask().getSExtValue() != 0 || - slice_op.ellipsis_mask().getSExtValue() != 0 || - slice_op.end_mask().getSExtValue() != 0 || - slice_op.new_axis_mask().getSExtValue() != 0 || - slice_op.shrink_axis_mask().getSExtValue() != 1) + if (slice_op.begin_mask() != 0 || slice_op.ellipsis_mask() != 0 || + slice_op.end_mask() != 0 || slice_op.new_axis_mask() != 0 || + slice_op.shrink_axis_mask() != 1) return {}; // Returns a value if the `value` is defined by a ConstOp with a single @@ -566,135 +567,158 @@ OpFoldResult RealDivOp::fold(ArrayRef operands) { return IdentityArithmeticOpFolder(*this, operands); } +//===----------------------------------------------------------------------===// +// ReluOp +//===----------------------------------------------------------------------===// + +Optional ReluOp::GetContractionFusion() { + // Only f32 is supported for fusion. + if (!T().isF32()) return None; + + return ContractionFusion("Relu", /*additional_arguments=*/{}); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// -// TODO(b/128020684): Verify the output type. -static LogicalResult Verify(ReshapeOp op) { - auto shape_type = op.shape().getType().cast(); - if (!shape_type.hasRank()) return success(); - if (shape_type.getRank() != 1) - return op.emitOpError("shape must be 1D tensor"); - auto rank_by_shape = shape_type.getShape()[0]; - auto type_of_tensor = op.tensor().getType().cast(); - // No compile time verification for unknown sized shape. 
- if (rank_by_shape == -1 || !type_of_tensor.hasStaticShape()) return success(); - int64_t num_by_tensor = type_of_tensor.getNumElements(); +namespace { +using ReshapeErrorHandler = + llvm::function_ref; - auto out_ty = op.getType().dyn_cast(); - if (out_ty && out_ty.hasStaticShape()) { - int64_t num_output_elements = out_ty.getNumElements(); - if (num_by_tensor != num_output_elements) - return op.emitOpError() - << "number of output elements (" << num_output_elements - << ") does not match expected number of elements (" - << num_by_tensor << ")"; - } +LogicalResult GetReshapeOutputType(Value tensor, Value shape, + ReshapeErrorHandler error_handler, + TensorType &output_ty) { + auto tensor_ty = tensor.getType().cast(); + auto element_ty = tensor_ty.getElementType(); + output_ty = UnrankedTensorType::get(element_ty); - // Check values if constant shape. No compiling time verification for - // non-constant shape. - auto *shape_op = op.shape().getDefiningOp(); - if (!shape_op) return success(); - Attribute shape_cst; - if (!matchPattern(shape_op, m_Constant(&shape_cst))) return success(); - auto shape_cst_attr = shape_cst.dyn_cast(); - if (!shape_cst_attr) return op.emitOpError("shape must be a valid tensor"); + auto shape_ty = shape.getType().dyn_cast(); + if (!shape_ty) return success(); + if (shape_ty.getRank() != 1) + return error_handler(llvm::formatv( + "requires 'shape' to be rank 1, but got {0}", shape_ty.getRank())); - if (auto opaque_attr = shape_cst_attr.dyn_cast()) { - opaque_attr.decode(shape_cst_attr); - } - - // We know the shape is a 1-D Tensor, then let us get the number of - // elements it implies. - unsigned num_by_shape = 1; - unsigned unknown_dim_count = 0; - for (int i = 0, e = rank_by_shape; i != e; ++i) { - auto num = shape_cst_attr.getValue(i).getInt(); - // The dimension size value can be -1, and that the real size needs to - // be computed so that the total size remains constant. At most one - // component of shape can be -1. - if (num == -1) { - if (++unknown_dim_count > 1) { - return op.emitOpError("more than one component of shape are -1"); - } - } else { - num_by_shape *= num; + DenseIntElementsAttr shape_attr; + if (!matchPattern(shape, m_Constant(&shape_attr))) { + // If only shape of `shape` is known, return ranked but dynamic output + // shape. + if (shape_ty.hasStaticShape()) { + llvm::SmallVector dynamic_shape(shape_ty.getDimSize(0), + ShapedType::kDynamicSize); + output_ty = RankedTensorType::get(dynamic_shape, element_ty); } - } - // If there is one component of shape is -1, the dimension should be - // computed so that the total size remains constant. - if (unknown_dim_count == 1) { - if (num_by_tensor % num_by_shape != 0) - return op.emitOpError( - "one component of shape is -1 but couldn't infer the dimension"); return success(); } - // If the elements by the tensor and implies by the shape don't match, - // fail this static check. - if (num_by_tensor != num_by_shape) { - return op.emitOpError( - "mismatch in tensor elements and shape implied elements"); + + // Detect if reshape output shape is folded. + bool shape_ty_zero_dim = false; + int unknown_index = -1; + // The product of constant shape argument excluding unknown dimension. 
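+  // Worked example (illustrative shapes): reshaping a tensor<6x4xf32> with a
+  // constant shape of [3, -1] gives a known-entry product of 3; the tensor has
+  // 24 elements, so the -1 entry is inferred below as 24 / 3 = 8 and the
+  // resulting type is tensor<3x8xf32>.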
+ int64_t shape_ty_size = 1; + llvm::SmallVector output_ty_shape; + output_ty_shape.reserve(shape_attr.getNumElements()); + for (const auto &dim : llvm::enumerate(shape_attr.getIntValues())) { + const int64_t size = dim.value().getSExtValue(); + if (size == ShapedType::kDynamicSize) { + if (unknown_index != -1) + return error_handler(llvm::formatv( + "requires 'shape' to have at most one dynamic dimension, but got " + "multiple dynamic dimensions at indices {0} and {1}", + unknown_index, dim.index())); + + unknown_index = dim.index(); + } else if (size == 0) { + shape_ty_zero_dim = true; + } else if (size > 0) { + shape_ty_size *= size; + } else { + return error_handler( + llvm::formatv("requires 'shape' to have dimensions greater than -1, " + "but got {0} at index {1}", + size, dim.index())); + } + output_ty_shape.push_back(size); } + + if (!tensor_ty.hasStaticShape()) { + output_ty = RankedTensorType::get(output_ty_shape, element_ty); + return success(); + } + + // Compute the value of the unknown dimension. + if (unknown_index != -1) { + // Compute number of elements in tensor shape. + int64_t tensor_ty_size = 1; + bool tensor_ty_zero_dim = false; + for (const auto &dim : tensor_ty.getShape()) { + if (dim > 0 || !shape_ty_zero_dim) { + tensor_ty_size *= dim; + } else { + tensor_ty_zero_dim = true; + } + } + + const int64_t missing_dim = tensor_ty_size / shape_ty_size; + if (!tensor_ty_zero_dim && shape_ty_size * missing_dim != tensor_ty_size) + return error_handler( + llvm::formatv("requires 'tensor' number of elements be a multiple of " + "{0}, but got {1}", + shape_ty_size, tensor_ty_size)); + + // Set the unknown dimension such that total number of elements remain + // constant. + output_ty_shape[unknown_index] = missing_dim; + } + + output_ty = RankedTensorType::get(output_ty_shape, element_ty); + + return success(); +} +} // namespace + +static LogicalResult Verify(ReshapeOp op) { + auto error_handler = [&op](const llvm::Twine &message) -> LogicalResult { + return op.emitOpError() << message; + }; + TensorType expected_ty; + if (failed(GetReshapeOutputType(op.tensor(), op.shape(), error_handler, + expected_ty))) + return failure(); + + auto output_ty = op.getType().dyn_cast(); + if (!output_ty) return success(); + auto tensor_ty = op.tensor().getType().cast(); + if (output_ty.hasStaticShape() && tensor_ty.hasStaticShape()) { + const int64_t output_ty_size = output_ty.getNumElements(); + const int64_t tensor_ty_size = tensor_ty.getNumElements(); + if (tensor_ty_size != output_ty_size) + return op.emitOpError() << "requires 'output' number of elements to " + "match 'tensor' number of elements, but got " + << output_ty_size << " and " << tensor_ty_size; + } + + if (!AreCastCompatible({output_ty, expected_ty})) + return op.emitOpError() + << "requires 'output' type " << output_ty + << " to be cast compatible with expected type " << expected_ty; + return success(); } +// Currently there are use cases that rely on partial evaluation of the `shape` +// operand, so InferTypeOpInterface is not used (along with generated builder of +// the same signature). 
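+// Example of that partial evaluation (illustrative, assuming an f32 input):
+// when the shape operand is a non-constant tensor<2xi32>, GetReshapeOutputType
+// still produces a ranked result, tensor<?x?xf32>, because the rank of the new
+// shape is known even though its values are not.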
void ReshapeOp::build(OpBuilder &builder, OperationState &result, Value tensor, Value shape) { - auto ttype = tensor.getType().cast(); - auto etype = ttype.getElementType(); - - auto unranked = [&builder, etype, &result, shape, tensor]() { - return ReshapeOp::build(builder, result, UnrankedTensorType::get(etype), - tensor, shape); + auto error_handler = [&result](const llvm::Twine &message) { + return mlir::emitError(result.location) << message; }; + TensorType output_ty; + if (failed(GetReshapeOutputType(tensor, shape, error_handler, output_ty))) + return; - // If tensor is unranked then we have no info about output of shape. - if (!ttype.hasRank()) return unranked(); - - DenseIntElementsAttr attr_shape; - if (matchPattern(shape, m_Constant(&attr_shape))) { - llvm::SmallVector const_shape; - const_shape.reserve(attr_shape.getNumElements()); - - // Detect if reshape output shape is folded. - bool flatten = false; - int unknown_index = -1; - // The product of constant shape argument excluding unknown dimension. - int64_t product_cshape = 1; - for (auto e : llvm::enumerate(attr_shape)) { - int64_t val = e.value().getSExtValue(); - if (IsUnknownDimOrRank(val)) { - if (flatten) { - mlir::emitError(result.location) - << "only one unknown dimension allowed"; - return; - } - flatten = true; - unknown_index = e.index(); - } else { - product_cshape *= val; - } - const_shape.push_back(val); - } - - // Compute the value of the unknown dimension. - if (flatten) { - // Compute number of elements in tensor shape. - auto tshape = ttype.getShape(); - int64_t product_tshape = std::accumulate(tshape.begin(), tshape.end(), 1, - std::multiplies()); - // Set the unknown dimension such that total number of elements remain - // constant. - // Note: The case where the ratio is not integral, and so the total size - // of reshape not constant, is checked in verify function. - const_shape[unknown_index] = product_tshape / product_cshape; - } - return ReshapeOp::build(builder, result, - RankedTensorType::get(const_shape, etype), tensor, - shape); - } - return unranked(); + return ReshapeOp::build(builder, result, output_ty, tensor, shape); } void ReshapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, @@ -1023,6 +1047,7 @@ static LogicalResult Verify(SizeOp op) { OpFoldResult SizeOp::fold(ArrayRef operands) { ShapedType output_type = getType().cast(); + if (!output_type.hasRank()) return {}; ShapedType input_type = getOperand().getType().cast(); if (!input_type.hasStaticShape()) return {}; int size = input_type.getNumElements(); @@ -1042,8 +1067,11 @@ OpFoldResult SizeOp::fold(ArrayRef operands) { // of elements in operands begin and size. 
// - if begin are constants, that // 0 <= begin[i] <= begin[i] + size[i] <= input_ty.getShape()[i] +// and +// size[i] == output_ty.getShape()[i] // - if begins aren't constant but the input is a ranked tensor, that // size[i] <= input_ty.getShape()[i] +// - output rank is the same as input rank // static LogicalResult Verify(SliceOp op) { RankedTensorType begin_ty = GetRankedTensorTypeForOperand(op.begin()); @@ -1071,21 +1099,40 @@ static LogicalResult Verify(SliceOp op) { "are equal to input rank"; } + auto output_ty = op.output().getType().dyn_cast(); + if (output_ty && input_ty && output_ty.getRank() != input_ty.getRank()) { + return op.emitOpError() + << "requires output to have the same rank as input, but got input " + "rank " + << input_ty.getRank() << " and output rank " << output_ty.getRank(); + } + DenseIntElementsAttr begin_indices; if (matchPattern(op.begin(), m_Constant(&begin_indices))) { DenseIntElementsAttr slice_sizes; bool constant_slice_sizes = matchPattern(op.size(), m_Constant(&slice_sizes)); int dim = 0; + // TODO(jpienaar): Reformulate the shape verification below to not use magic + // constants. for (const APInt &raw_begin_index : begin_indices.getValues()) { int64_t begin_index = raw_begin_index.getSExtValue(); int64_t input_size = input_ty ? input_ty.getShape()[dim] : -1; int64_t slice_size = constant_slice_sizes ? slice_sizes.getValue(dim).getSExtValue() : 0; + int64_t output_size = output_ty ? output_ty.getShape()[dim] : -1; + if (slice_size == -1 && input_size != -1) { slice_size = input_size - begin_index; } + if (output_size != -1 && constant_slice_sizes && + output_size != slice_size) { + return op.emitOpError() + << "requires output size to have the same size of slice, got " + "slice size " + << slice_size << " and output size " << output_size; + } if (begin_index < 0 || (input_size != -1 && begin_index + slice_size > input_size)) { return op.emitOpError() @@ -1143,6 +1190,183 @@ static LogicalResult Verify(SoftmaxCrossEntropyWithLogitsOp op) { return success(); } +//===----------------------------------------------------------------------===// +// SpaceToBatchNDOp +//===----------------------------------------------------------------------===// + +int64_t SpaceToBatchNDBlockRank(const TensorType block_shape_type, + const TensorType paddings_type) { + if (block_shape_type.hasStaticShape()) { + return block_shape_type.getShape()[0]; + } else if (paddings_type.hasStaticShape()) { + return paddings_type.getShape()[0]; + } else { + return -1; + } +} + +static LogicalResult Verify(SpaceToBatchNDOp op) { + const auto input_type = op.input().getType().cast(); + const auto block_shape_type = op.block_shape().getType().cast(); + const auto paddings_type = op.paddings().getType().cast(); + + // Check that block_shape has rank 1. + if (!IsOfRankOrUnranked(op.block_shape(), 1)) { + return op.emitOpError() << "requires rank of block_shape = 1; got " + << block_shape_type.getRank(); + } + + // Check that paddings has rank 2. + if (!IsOfRankOrUnranked(op.paddings(), 2)) { + return op.emitOpError() + << "requires rank of paddings = 2; got " << paddings_type.getRank(); + } + + // Check that paddings.shape[1]=2. + if (paddings_type.hasStaticShape() && paddings_type.getShape()[1] != 2) { + return op.emitOpError() << "requires paddings.shape[1] to be 2; got " + << paddings_type.getShape()[1]; + } + + // Check that block_shape and paddings have consistent ranks. 
+ if (block_shape_type.hasStaticShape() && paddings_type.hasStaticShape() && + block_shape_type.getShape()[0] != paddings_type.getShape()[0]) { + return op.emitOpError() + << "requires block_shape.shape[0] must equal paddings.shape[0]"; + } + + const int64_t block_rank = + SpaceToBatchNDBlockRank(block_shape_type, paddings_type); + + // Further checks require block_rank to be known. + if (block_rank == -1) { + return success(); + } + + // check that rank of input_type >= block_rank + 1 + if (input_type.hasRank() && input_type.getRank() < 1 + block_rank) { + return op.emitOpError() << "requires rank of input >= 1 + rank of block"; + } + + ElementsAttr block_shape_attr = nullptr; + ElementsAttr paddings_attr = nullptr; + + // Check that block_shape[*] >= 1. + if (matchPattern(op.block_shape(), m_Constant(&block_shape_attr))) { + uint64_t i = 0; + for (auto block_len : block_shape_attr.getValues()) { + if (block_len.getSExtValue() < 1) { + return op.emitOpError() + << "requires all values of block_shape to be >= 1; " + "failed for dimension " + << i; + } + ++i; + } + } + + // Check that paddings[*] >= 0. + if (matchPattern(op.paddings(), m_Constant(&paddings_attr))) { + for (uint64_t i = 0; i < block_rank; ++i) { + const int64_t pad_start = + paddings_attr.getValue({i, 0}).cast().getInt(); + const int64_t pad_end = + paddings_attr.getValue({i, 1}).cast().getInt(); + if (pad_start < 0 || pad_end < 0) { + return op.emitOpError() + << "requires all values of paddings to be >= 0; " + "failed for dimension " + << i; + } + } + } + + // Check that block_shape divides the padded input. + if (input_type.hasStaticShape() && block_shape_attr && paddings_attr) { + for (uint64_t i = 0; i < block_rank; ++i) { + const int64_t input_len = input_type.getShape()[1 + i]; + const int64_t pad_start = + paddings_attr.getValue({i, 0}).cast().getInt(); + const int64_t pad_end = + paddings_attr.getValue({i, 1}).cast().getInt(); + const int64_t block_len = + block_shape_attr.getValue({i}).cast().getInt(); + if ((input_len + pad_start + pad_end) % block_len != 0) { + return op.emitOpError() + << "requires block_shape[i] divides " + "input_shape[i + 1] + paddings[i, 0] + paddings[i, 1]; " + "failed for i=" + << i; + } + } + } + + return success(); +} + +// Infers returned rank if possible. Further, infers returned dimension sizes +// when possible. For all dimensions sizes to be inferred, the arguments +// block_shape and paddings must be constant. +LogicalResult SpaceToBatchNDOp::inferReturnTypes( + MLIRContext *context, Optional location, ValueRange operands, + DictionaryAttr attributes, RegionRange regions, + SmallVectorImpl &inferredReturnTypes) { + const Value input = operands[0]; + const Value block_shape_val = operands[1]; + const Value paddings_val = operands[2]; + const auto input_type = input.getType().cast(); + const auto block_shape_type = block_shape_val.getType().cast(); + const auto paddings_type = paddings_val.getType().cast(); + + // The return is unranked when the input is unranked. + if (!input_type.hasRank()) { + inferredReturnTypes.assign( + {UnrankedTensorType::get(input_type.getElementType())}); + return success(); + } + + const int64_t input_rank = input_type.getRank(); + const ArrayRef input_shape = input_type.getShape(); + const int64_t block_rank = + SpaceToBatchNDBlockRank(block_shape_type, paddings_type); + SmallVector return_shape(input_rank, ShapedType::kDynamicSize); + + // The return has all dimension sizes unknown when block_rank is unknown. 
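+  // Worked example for the fully constant case computed further below
+  // (illustrative shapes): input tensor<1x4x4x3xf32>, block_shape = [2, 2],
+  // paddings = [[0, 0], [0, 0]] gives batch 1 * 2 * 2 = 4 and blocked spatial
+  // dims (0 + 4) / 2 = 2, with the trailing dimension kept, so the inferred
+  // type is tensor<4x2x2x3xf32>.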
+ if (block_rank == -1) { + inferredReturnTypes.assign( + {RankedTensorType::get(return_shape, input_type.getElementType())}); + return success(); + } + + // The return preserves the remaining dimensions after blocked dimensions. + for (uint64_t i = 1 + block_rank; i < input_rank; ++i) { + return_shape[i] = input_shape[i]; + } + + // The rest of the dimension sizes can be calculated when block_shape and + // paddings arguments are constant. + ElementsAttr block_shape_attr; + ElementsAttr paddings_attr; + if (matchPattern(block_shape_val, m_Constant(&block_shape_attr)) && + matchPattern(paddings_val, m_Constant(&paddings_attr))) { + int64_t return_batch = input_shape[0]; + for (uint64_t i = 0; i < block_rank; ++i) { + int64_t paddings_sum = + paddings_attr.getValue({i, 0}).cast().getInt() + + paddings_attr.getValue({i, 1}).cast().getInt(); + int64_t block_shape_i = + block_shape_attr.getValue({i}).cast().getInt(); + return_batch *= block_shape_i; + return_shape[1 + i] = (paddings_sum + input_shape[i + 1]) / block_shape_i; + } + return_shape[0] = return_batch; + } + + inferredReturnTypes.assign( + {RankedTensorType::get(return_shape, input_type.getElementType())}); + return success(); +} + //===----------------------------------------------------------------------===// // SparseSoftmaxCrossEntropyWithLogitsOp //===----------------------------------------------------------------------===// @@ -1237,7 +1461,8 @@ static LogicalResult Verify(SplitVOp op) { if (!split_sizes_type) return success(); if (split_sizes_type.getRank() != 1 || - split_sizes_type.getDimSize(0) != op.getNumResults()) + (split_sizes_type.getDimSize(0) != ShapedType::kDynamicSize && + split_sizes_type.getDimSize(0) != op.getNumResults())) return op.emitOpError("split sizes should be a 1D tensor of ") << op.getNumResults() << " elements"; @@ -1389,7 +1614,7 @@ static LogicalResult VerifyStridedSliceBase(OpTy op) { // Use bit compares to ensure ellipsis_mask is 0 or a power of 2, i.e. there // exists only no more than one ellipsis. 
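  // For instance (illustrative): ellipsis_mask = 0b00100 encodes a single
  // ellipsis and passes the power-of-two test below, whereas 0b00101 encodes
  // two ellipses and is rejected.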
- uint32_t ellipsis_mask = op.ellipsis_mask().getZExtValue(); + uint32_t ellipsis_mask = op.ellipsis_mask(); if (ellipsis_mask != 0 && !llvm::isPowerOf2_32(ellipsis_mask)) return op.emitOpError("cannot have multiple ellipses"); @@ -1645,10 +1870,9 @@ bool StridedSliceOp::GetSlicedBoundRanges( sparse_strides.push_back(stride.getSExtValue()); CalculateSlicedShapeFromSparseIndices( - input_shape, sparse_begin, sparse_end, sparse_strides, - begin_mask().getZExtValue(), end_mask().getZExtValue(), - ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), - shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); + input_shape, sparse_begin, sparse_end, sparse_strides, begin_mask(), + end_mask(), ellipsis_mask(), new_axis_mask(), shrink_axis_mask(), + slice_begin, slice_end, slice_stride); return true; } @@ -1699,13 +1923,25 @@ bool StridedSliceGradOp::GetSlicedShapeAndBoundRanges( sparse_strides.push_back(stride.getSExtValue()); CalculateSlicedShapeFromSparseIndices( - *input_shape, sparse_begin, sparse_end, sparse_strides, - begin_mask().getZExtValue(), end_mask().getZExtValue(), - ellipsis_mask().getZExtValue(), new_axis_mask().getZExtValue(), - shrink_axis_mask().getZExtValue(), slice_begin, slice_end, slice_stride); + *input_shape, sparse_begin, sparse_end, sparse_strides, begin_mask(), + end_mask(), ellipsis_mask(), new_axis_mask(), shrink_axis_mask(), + slice_begin, slice_end, slice_stride); return true; } +//===----------------------------------------------------------------------===// +// SummaryWriterOp +//===----------------------------------------------------------------------===// + +ResourceHandleValueAndId SummaryWriterOp::GetResourceHandleValueAndId( + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id) { + llvm::StringRef device = GetDeviceOrEmpty(getOperation()); + return GetResourceHandleValueAndIdBase(container(), shared_name(), device, + writer(), resource_handle_id_map, + next_id); +} + //===----------------------------------------------------------------------===// // TensorListReserveOp //===----------------------------------------------------------------------===// @@ -1776,6 +2012,87 @@ static LogicalResult Verify(TensorScatterUpdateOp op) { return success(); } +//===----------------------------------------------------------------------===// +// TileOp +//===----------------------------------------------------------------------===// + +// Verifies that, +// +// - input has at least rank 1 +// - multiples is rank 1 +// - multiples.size() == input.rank() +// - input.rank() == output.rank() +// - Elements in multiples are non-negative +// - input.shape[i] * multiples[i] == output.shape[i] +// for i in [0, input.rank() - 1] + +static LogicalResult Verify(TileOp op) { + auto input_type = op.input().getType().dyn_cast(); + auto multiples_type = op.multiples().getType().dyn_cast(); + auto output_type = op.output().getType().dyn_cast(); + + if (multiples_type && multiples_type.getRank() != 1) { + return op.emitOpError() << "expected multiples to be rank 1, got rank = " + << multiples_type.getRank(); + } + + if (input_type && multiples_type && multiples_type.hasStaticShape() && + (input_type.getRank() != multiples_type.getNumElements() || + (input_type.getRank() == 0 && multiples_type.getNumElements() == 1))) { + return op.emitOpError() + << "expected size of multiples equal to rank of input" + << ", got multiples of size " << multiples_type.getNumElements() + << ", and input of rank " << input_type.getRank(); + } + + if (input_type && 
output_type) { + if (input_type.getRank() != output_type.getRank()) { + return op.emitOpError() + << "expected rank of input to equal to rank of output" + << ", got input of rank " << input_type.getRank() + << ", and output of rank " << output_type.getRank(); + } + + DenseIntElementsAttr multiples_attr; + if (matchPattern(op.multiples(), m_Constant(&multiples_attr))) { + for (int32_t i = 0, e = input_type.getRank(); i < e; ++i) { + const int64_t input_dim = input_type.getDimSize(i); + const int64_t output_dim = output_type.getDimSize(i); + const int64_t m = multiples_attr.getValue(i).getSExtValue(); + + if (m < 0) { + return op.emitOpError() + << "expected multiples to be non-negative, got " + << "multiples[" << i << "] = " << m; + } + + if (!ShapedType::isDynamic(input_dim) && + !ShapedType::isDynamic(output_dim) && output_dim != input_dim * m) { + return op.emitOpError() + << "requires input.shape[" << i << "] (" << input_dim << ")" + << " * " << m << " to be equal to " + << "output.shape[" << i << "] (" << output_dim << ")"; + } + } + } + } + + return success(); +} + +OpFoldResult TileOp::fold(ArrayRef operands) { + DenseIntElementsAttr multiples_attr; + if (matchPattern(multiples(), m_Constant(&multiples_attr))) { + // Return input directly when multiples are all ones, + // regardless what input is. + if (multiples_attr.isSplat() && + multiples_attr.getSplatValue().getSExtValue() == 1) { + return input(); + } + } + return {}; +} + //===----------------------------------------------------------------------===// // TopKV2Op //===----------------------------------------------------------------------===// @@ -1993,6 +2310,80 @@ void TruncateDivOp::getCanonicalizationPatterns( results.insert(context); } +//===----------------------------------------------------------------------===// +// NonMaxSuppressionV3Op +//===----------------------------------------------------------------------===// + +namespace { + +// Canonicalize NonMaxSuppressionV3Op to NonMaxSuppressionV4Op. +class NMSV3ToNMSV4Op : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(NonMaxSuppressionV3Op nms_op, + PatternRewriter &rewriter) const override { + if (nms_op.getNumOperands() != 5) { + return failure(); + } + SmallVector new_result_types; + new_result_types.push_back(nms_op.getType()); + auto input_ty = nms_op.getType().template cast(); + // corresponds to the second result type of nmsv4 + RankedTensorType valid_output_type = + RankedTensorType::get({}, input_ty.getElementType()); + new_result_types.push_back(valid_output_type); + + auto nmsv4 = rewriter.create( + nms_op.getLoc(), new_result_types, nms_op.boxes(), nms_op.scores(), + nms_op.max_output_size(), nms_op.iou_threshold(), + nms_op.score_threshold()); + // Cannot replace the NMSv3 Op with NMSv4 since the outputs between the + // two are different (v4 expects two output values vs v3 requires only one. + nms_op.replaceAllUsesWith(nmsv4.getResult(0)); + return success(); + } +}; +} // namespace. 
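+// Net effect of the pattern above (illustrative; the value names are
+// hypothetical): a tf.NonMaxSuppressionV3 op such as
+//   %sel = "tf.NonMaxSuppressionV3"(%boxes, %scores, %max, %iou, %score_thr)
+// becomes a tf.NonMaxSuppressionV4 with the same five operands; only its first
+// result (the selected indices) replaces the original uses, and the extra
+// valid_outputs result is left unused.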
+ +void NonMaxSuppressionV3Op::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +//===----------------------------------------------------------------------===// +// FusedBatchNormOp +//===----------------------------------------------------------------------===// + +namespace { + +class ConvertFusedBatchNorm : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(TF::FusedBatchNormOp tf_fused_batch_norm_op, + PatternRewriter &rewriter) const override { + auto new_result_types = + llvm::to_vector<6>(tf_fused_batch_norm_op.getResultTypes()); + // reserve_space_3 + new_result_types.push_back( + UnrankedTensorType::get(FloatType::getF32(rewriter.getContext()))); + + OperationState new_state(tf_fused_batch_norm_op.getLoc(), + TF::FusedBatchNormV3Op::getOperationName(), + tf_fused_batch_norm_op.getOperands(), + new_result_types, + tf_fused_batch_norm_op.getAttrs()); + Operation *tf_fused_batch_norm_op_v3 = rewriter.createOperation(new_state); + + rewriter.replaceOp(tf_fused_batch_norm_op, + tf_fused_batch_norm_op_v3->getResults().drop_back()); + return success(); + } +}; +} // namespace. + +void FusedBatchNormOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // UnpackOp //===----------------------------------------------------------------------===// @@ -2002,7 +2393,7 @@ static LogicalResult Verify(UnpackOp op) { if (!value_type) return success(); int64_t value_rank = value_type.getRank(); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis < -value_rank || axis >= value_rank) return op.emitOpError("axis attribute must be in the range of [-") << value_rank << ", " << value_rank << ')'; @@ -2060,6 +2451,19 @@ static LogicalResult VerifyUnsortedSegmentReduction(Op op) { return success(); } +//===----------------------------------------------------------------------===// +// VarHandleOp +//===----------------------------------------------------------------------===// + +ResourceHandleValueAndId VarHandleOp::GetResourceHandleValueAndId( + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id) { + llvm::StringRef device = GetDeviceOrEmpty(getOperation()); + return GetResourceHandleValueAndIdBase(container(), shared_name(), device, + resource(), resource_handle_id_map, + next_id); +} + //===----------------------------------------------------------------------===// // VarIsInitializedOp //===----------------------------------------------------------------------===// @@ -2122,38 +2526,19 @@ OpFoldResult VariableShapeOp::fold(ArrayRef operands) { // WhileOp //===----------------------------------------------------------------------===// -static LogicalResult Verify(WhileOp op) { - auto cond_fn = op.cond_func(); - auto body_fn = op.body_func(); - if (!cond_fn) { - return op.emitOpError("cond refers to an undefined function : ") - << op.cond(); - } - if (!body_fn) { - return op.emitOpError("body refers to an undefined function : ") - << op.body(); - } - - auto cond_fn_type = cond_fn.getType(); - auto body_fn_type = body_fn.getType(); - - // Verify that the cond function has exactly one result. 
- if (cond_fn_type.getNumResults() != 1) - return op.emitOpError("requires cond function to have exactly one result"); - - SmallVector operands(op.getOperandTypes()); - +static LogicalResult VerifyWhileTypes(Operation *op, TypeRange cond_input, + TypeRange body_input, + TypeRange body_result) { // Collect all the type lists for the op so that different pairs of type lists // can be compared for the compatibility. constexpr int kNumTypeLists = 5; - const std::array>, kNumTypeLists> - type_lists = {{ - {"operand", operands}, - {"body function result", body_fn_type.getResults()}, - {"result", op.getResultTypes()}, - {"cond function input", cond_fn_type.getInputs()}, - {"body function input", body_fn_type.getInputs()}, - }}; + const std::array type_lists = {{ + {op->getOperandTypes(), "input"}, + {body_result, "body result"}, + {op->getResultTypes(), "result"}, + {cond_input, "condition input"}, + {body_input, "body input"}, + }}; // A pair of type lists should be cast compatible with each other if one is // converted to the another for a function call or assignment or there is a @@ -2183,28 +2568,38 @@ static LogicalResult Verify(WhileOp op) { for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) { auto &a = type_lists[i]; auto &b = type_lists[j]; - - int a_size = a.second.size(); - if (a_size != b.second.size()) - return op.emitOpError( - llvm::formatv("requires the number of {0}s to be equal to the " - "number of {1}s. Found {2} and {3}, respectively", - a.first, b.first, a_size, b.second.size())); - - for (int idx = 0; idx < a_size; ++idx) { - auto a_type = a.second[idx]; - auto b_type = b.second[idx]; - - if (!AreCastCompatible({a_type, b_type})) - return op.emitError(llvm::formatv( - "{0} type {1} is incompatible with {2} type {3} at index {4}", - a.first, a_type, b.first, b_type, idx)); - } + if (failed(VerifyTypeRangesAreCompatible(op, a, b))) return failure(); } } return success(); } +static LogicalResult Verify(WhileOp op) { + auto cond_fn = op.cond_function(); + auto body_fn = op.body_function(); + if (!cond_fn) { + return op.emitOpError("cond refers to an undefined function : ") + << op.cond(); + } + if (!body_fn) { + return op.emitOpError("body refers to an undefined function : ") + << op.body(); + } + + auto cond_fn_type = cond_fn.getType(); + auto body_fn_type = body_fn.getType(); + + // Verify that the cond function has exactly one result. + if (cond_fn_type.getNumResults() != 1) + return op.emitOpError("requires cond function to have exactly one result"); + + if (failed(VerifyWhileTypes(op, /*cond_input=*/cond_fn_type.getInputs(), + /*body_input=*/body_fn_type.getInputs(), + /*body_result=*/body_fn_type.getResults()))) + return failure(); + return success(); +} + //===----------------------------------------------------------------------===// // WhileOp canonicalization. //===----------------------------------------------------------------------===// @@ -2218,50 +2613,23 @@ void WhileOp::getCanonicalizationPatterns(OwningRewritePatternList &results, //===----------------------------------------------------------------------===// static LogicalResult Verify(WhileRegionOp op) { // Verify that the condition generates a single tensor result. 
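  // (That is, the condition region must yield exactly one 0-d i1 value, i.e. a
  // tensor<i1>; the checks below enforce both the operand count and the type.)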
- YieldOp yield = cast(op.cond().front().getTerminator()); - if (yield.getNumOperands() != 1) + Operation *cond_yield = op.cond().front().getTerminator(); + if (cond_yield->getNumOperands() != 1) return op.emitOpError() << "condition should have a single tensor result"; - auto cond_type = yield.getOperand(0).getType().dyn_cast(); + auto cond_type = + cond_yield->getOperand(0).getType().dyn_cast(); if (!cond_type || !cond_type.getShape().equals({}) || !cond_type.getElementType().isInteger(/*width=*/1)) return op.emitOpError() << "condition should have a single tensor result"; - // The body result types should match while op result types. - if (failed(VerifyRegionResults(op, op.body(), "body"))) return failure(); - - // Both condition and body should have same number and type of operands as - // the WhileRegion inputs. - const int num_inputs = op.getNumOperands(); - auto block_inputs_match_op_inputs = [&](Region ®ion, - StringRef name) -> LogicalResult { - Block &block = region.front(); - if (block.getNumArguments() != num_inputs) - return op.emitOpError() - << name << " should have same number of inputs (" << num_inputs - << ") as " << WhileRegionOp::getOperationName() << " but has " - << block.getNumArguments() << " inputs"; - - for (auto types_idx : llvm::enumerate( - llvm::zip(op.getOperandTypes(), block.getArgumentTypes()))) { - auto op_input_type = std::get<0>(types_idx.value()); - auto block_input_type = std::get<1>(types_idx.value()); - if (!AreCastCompatible({block_input_type, op_input_type})) - return op.emitOpError(llvm::formatv( - "{0} input type {1} is incompatible with {2} " - "input type {3} at index {4}", - name, block_input_type, WhileRegionOp::getOperationName(), - op_input_type, types_idx.index())); - } - return success(); - }; - - if (failed(block_inputs_match_op_inputs(op.cond(), "condition")) || - failed(block_inputs_match_op_inputs(op.body(), "body"))) + Operation *body_yield = op.body().front().getTerminator(); + if (failed(VerifyWhileTypes(op, /*cond_input=*/op.cond().getArgumentTypes(), + /*body_input=*/op.body().getArgumentTypes(), + /*body_result=*/body_yield->getOperandTypes()))) return failure(); - return success(); } @@ -2373,7 +2741,8 @@ struct WhileRegionEliminatePassThrough auto &new_body_block = new_while_op.body().front(); auto &new_yield = *new_body_block.getTerminator(); - // Build a vector of new results. Also patch up the region bodies and yield. + // Build a vector of new results. Also patch up the region bodies and + // yield. SmallVector new_results; next_idx = 0; for (int op_idx : llvm::seq(0, old_num_operands)) { @@ -2408,12 +2777,12 @@ void XdivyOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +} // namespace TF +} // namespace mlir + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc.inc" - -} // namespace TF -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h index 761c06a475c..9b06d855b01 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h @@ -38,15 +38,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" -namespace mlir { -namespace TF { - #define GET_OP_FWD_DEFINES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h.inc" -} // namespace TF -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_N_Z_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc index e87cc494a4a..38f9175a500 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc @@ -70,11 +70,12 @@ limitations under the License. namespace mlir { namespace TF { - namespace { #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_canonicalize.inc" } // namespace +} // namespace TF +} // namespace mlir //===----------------------------------------------------------------------===// // TableGen'd op method definitions @@ -82,6 +83,3 @@ namespace { #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.cc.inc" - -} // namespace TF -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h index 8586515edee..589e0e91615 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h @@ -36,15 +36,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" -namespace mlir { -namespace TF { - #define GET_OP_FWD_DEFINES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_all_ops.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_remaining_ops.h.inc" -} // namespace TF -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_REMAINING_OPS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 6883d0358ec..1eaf997ab69 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -105,9 +105,15 @@ static LogicalResult Verify(SessionInitializerOp session_initializer) { return success(); } +} // namespace tf_saved_model +} // namespace mlir + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" +namespace mlir { +namespace tf_saved_model { + //===----------------------------------------------------------------------===// // TensorFlowSavedModelDialect Dialect //===----------------------------------------------------------------------===// @@ -115,6 +121,11 @@ static LogicalResult Verify(SessionInitializerOp session_initializer) { TensorFlowSavedModelDialect::TensorFlowSavedModelDialect(MLIRContext *context) : Dialect(/*name=*/"tf_saved_model", context, TypeID::get()) { + // The TensorFlow Dialect is needed in the verifier and other routines + // associated to this dialect. It makes little sense anyway to use the + // SavedModel dialect without the TensorFlow Dialect. 
+ context->loadDialect(); + addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h index 02b7f0b75f4..c8518a9ca02 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h @@ -40,10 +40,16 @@ class TensorFlowSavedModelDialect : public Dialect { static StringRef getDialectNamespace() { return "tf_saved_model"; } }; +} // namespace tf_saved_model +} // namespace mlir + // Declares the operations for this dialect using the generated header. #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h.inc" +namespace mlir { +namespace tf_saved_model { + // Returns the list of exported names for `op`. // An empty list means `op` is not exported. SmallVector GetExportedNames(Operation *op); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td index a22a684953b..753e2368d6e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model_ops.td @@ -82,7 +82,7 @@ def TfSavedModel_Dialect : Dialect { with "get_global @some_global_tensor" in the function body. }]; - let cppNamespace = "tf_saved_model"; + let cppNamespace = "::mlir::tf_saved_model"; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h index 9be61b1db39..3c8ec1d38af 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h @@ -35,6 +35,28 @@ struct TensorArray : ::mlir::SideEffects::Resource::Base { StringRef getName() final { return "TensorArray"; } }; +struct Summary : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "Summary"; } +}; + +struct LookupTable : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "LookupTable"; } +}; + +struct DatasetSeedGenerator + : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "DatasetSeedGenerator"; } +}; + +struct DatasetMemoryCache + : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "DatasetMemoryCache"; } +}; + +struct DatasetIterator : ::mlir::SideEffects::Resource::Base { + StringRef getName() final { return "DatasetIterator"; } +}; + } // namespace ResourceEffects } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc index 6c5485c16dd..9d8f25c6633 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc @@ -15,11 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" -namespace mlir { - -// NOLINTNEXTLINE #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.cc.inc" +namespace mlir { namespace TF { void RuntimeDevices::AddDevice(const ParsedName& device) { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h index b1f39ad1d28..b90bf2d47a8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h @@ -26,10 +26,9 @@ limitations under the License. 
#include "mlir/IR/Types.h" // from @llvm-project #include "tensorflow/core/util/device_name_utils.h" -namespace mlir { - #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h.inc" +namespace mlir { namespace TF { // Tensorflow devices available at runtime with corresponding metadata if it is diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h index 412bf113a0f..aef3c538bc8 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h @@ -124,6 +124,10 @@ class CannotDuplicate : public TraitBase { } }; +// Trait to indicate an operation cannot be constant folded. +template +class NoConstantFold : public TraitBase {}; + // Coefficient-wise binary operation with implicit broadcasting support, for // example tf.Sub operation. template diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 2ec73824f6c..86369b993be 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -62,101 +62,6 @@ bool GetCastCompatibleShape(llvm::ArrayRef a_shape, return true; } -// Given two types `a` and `b`, returns a refined type which is cast compatible -// with both `a` and `b` and is equal to or more precise than both of them. It -// returns empty Type if the input types are not cast compatible. -// -// The two types are considered cast compatible if they have dynamically equal -// shapes and element type. For element types that do not have subtypes, they -// must be equal. However for TensorFlow types such as Resource and Variant, -// that also have subtypes, we recursively check for subtype compatibilty for -// Resource types and assume all variant types are cast compatible. If either -// one of `a` or `b` have empty subtypes, they are considered cast compatible. -// -// The returned type is same or more precise than the input types. For example, -// if `a` and `b` are cast compatible types tensor<2x?x?xf32> and -// tensor respectively, the returned type is tensor<2x4x?xf32>. -// -// Provides option to ignore ref types on 'a'. This is useful for TF ops that -// might allow operands to either be same as result type or be a ref type -// corresponding to it. -mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, - bool may_ignore_ref_type_a) { - // Fast path if everything is equal. - if (a == b) return b; - - auto a_tt = a.dyn_cast(); - auto b_tt = b.dyn_cast(); - - // If only one of a or b is a tensor type, they are incompatible. - if (static_cast(a_tt) ^ static_cast(b_tt)) return nullptr; - - // For non-tensor types, we do not need to worry about shape and can return - // early. - if (!a_tt && !b_tt) { - // Remove ref types. - if (may_ignore_ref_type_a) { - if (auto ref_type = a.dyn_cast()) { - a = ref_type.RemoveRef(); - if (a == b) return a; - } - } - if (a.getTypeID() != b.getTypeID()) return nullptr; - - // If either is not a type that contain subtypes then the types are not cast - // compatible. - auto a_wst = a.dyn_cast(); - auto b_wst = b.dyn_cast(); - if (!a_wst || !b_wst) return nullptr; - - // For Variant types we are more permissive right now and accept all pairs - // of Variant types. If we are more constrainted and check compatibility of - // subtypes, we might reject valid graphs. - // TODO(prakalps): Variant doesn't have a subtype, we assign it - // one, so we should only assign it one when we know the subtype. 
Then we - // can be more constrained and check subtypes for cast compatibility as - // well. - if (a.isa()) return a; - - // For Resource types, we recursively check the subtypes for cast - // compatibility, if possible. Otherwise treat them as compatible. - auto a_wst_st = a_wst.GetSubtypes(); - auto b_wst_st = b_wst.GetSubtypes(); - if (a_wst_st.empty() || b_wst_st.empty()) return a; - if (a_wst_st.size() != b_wst_st.size()) return nullptr; - llvm::SmallVector refined_subtypes; - for (auto subtypes : llvm::zip(a_wst_st, b_wst_st)) { - mlir::Type refined_st = - GetCastCompatibleType(std::get<0>(subtypes), std::get<1>(subtypes), - /*may_ignore_ref_type_a=*/false); - if (!refined_st) return nullptr; - refined_subtypes.push_back(refined_st.cast()); - } - - return mlir::TF::ResourceType::get(refined_subtypes, a.getContext()); - } - - // For tensor types, check compatibility of both element type and shape. - mlir::Type refined_element_ty = GetCastCompatibleType( - a_tt.getElementType(), b_tt.getElementType(), may_ignore_ref_type_a); - if (!refined_element_ty) return nullptr; - - if (!a_tt.hasRank() && !b_tt.hasRank()) { - return mlir::UnrankedTensorType::get(refined_element_ty); - } - if (!a_tt.hasRank()) { - return mlir::RankedTensorType::get(b_tt.getShape(), refined_element_ty); - } - if (!b_tt.hasRank()) { - return mlir::RankedTensorType::get(a_tt.getShape(), refined_element_ty); - } - - llvm::SmallVector refined_shape; - if (!GetCastCompatibleShape(a_tt.getShape(), b_tt.getShape(), &refined_shape)) - return nullptr; - - return mlir::RankedTensorType::get(refined_shape, refined_element_ty); -} } // namespace namespace mlir { @@ -343,6 +248,102 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { return true; } +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// +// The two types are considered cast compatible if they have dynamically equal +// shapes and element type. For element types that do not have subtypes, they +// must be equal. However for TensorFlow types such as Resource and Variant, +// that also have subtypes, we recursively check for subtype compatibilty for +// Resource types and assume all variant types are cast compatible. If either +// one of `a` or `b` have empty subtypes, they are considered cast compatible. +// +// The returned type is same or more precise than the input types. For example, +// if `a` and `b` are cast compatible types tensor<2x?x?xf32> and +// tensor respectively, the returned type is tensor<2x4x?xf32>. +// +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a) { + // Fast path if everything is equal. + if (a == b) return b; + + auto a_tt = a.dyn_cast(); + auto b_tt = b.dyn_cast(); + + // If only one of a or b is a tensor type, they are incompatible. + if (static_cast(a_tt) ^ static_cast(b_tt)) return nullptr; + + // For non-tensor types, we do not need to worry about shape and can return + // early. + if (!a_tt && !b_tt) { + // Remove ref types. 
+ if (may_ignore_ref_type_a) { + if (auto ref_type = a.dyn_cast()) { + a = ref_type.RemoveRef(); + if (a == b) return a; + } + } + if (a.getTypeID() != b.getTypeID()) return nullptr; + + // If either is not a type that contain subtypes then the types are not cast + // compatible. + auto a_wst = a.dyn_cast(); + auto b_wst = b.dyn_cast(); + if (!a_wst || !b_wst) return nullptr; + + // For Variant types we are more permissive right now and accept all pairs + // of Variant types. If we are more constrainted and check compatibility of + // subtypes, we might reject valid graphs. + // TODO(prakalps): Variant doesn't have a subtype, we assign it + // one, so we should only assign it one when we know the subtype. Then we + // can be more constrained and check subtypes for cast compatibility as + // well. + if (a.isa()) return a; + + // For Resource types, we recursively check the subtypes for cast + // compatibility, if possible. Otherwise treat them as compatible. + auto a_wst_st = a_wst.GetSubtypes(); + auto b_wst_st = b_wst.GetSubtypes(); + if (a_wst_st.empty() || b_wst_st.empty()) return a; + if (a_wst_st.size() != b_wst_st.size()) return nullptr; + llvm::SmallVector refined_subtypes; + for (auto subtypes : llvm::zip(a_wst_st, b_wst_st)) { + mlir::Type refined_st = + GetCastCompatibleType(std::get<0>(subtypes), std::get<1>(subtypes), + /*may_ignore_ref_type_a=*/false); + if (!refined_st) return nullptr; + refined_subtypes.push_back(refined_st.cast()); + } + + return mlir::TF::ResourceType::get(refined_subtypes, a.getContext()); + } + + // For tensor types, check compatibility of both element type and shape. + mlir::Type refined_element_ty = GetCastCompatibleType( + a_tt.getElementType(), b_tt.getElementType(), may_ignore_ref_type_a); + if (!refined_element_ty) return nullptr; + + if (!a_tt.hasRank() && !b_tt.hasRank()) { + return mlir::UnrankedTensorType::get(refined_element_ty); + } + if (!a_tt.hasRank()) { + return mlir::RankedTensorType::get(b_tt.getShape(), refined_element_ty); + } + if (!b_tt.hasRank()) { + return mlir::RankedTensorType::get(a_tt.getShape(), refined_element_ty); + } + + llvm::SmallVector refined_shape; + if (!GetCastCompatibleShape(a_tt.getShape(), b_tt.getShape(), &refined_shape)) + return nullptr; + + return mlir::RankedTensorType::get(refined_shape, refined_element_ty); +} + bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs) { return GetCastCompatibleType(lhs, rhs, may_ignore_ref_type_lhs) != nullptr; @@ -359,6 +360,16 @@ bool AreCastCompatible(ArrayRef types) { return true; } +bool ArraysAreCastCompatible(ArrayRef lhs, ArrayRef rhs) { + if (lhs.size() != rhs.size()) return false; + for (auto pair : llvm::zip(lhs, rhs)) { + auto lhs_i = std::get<0>(pair); + auto rhs_i = std::get<1>(pair); + if (!AreCastCompatible({lhs_i, rhs_i})) return false; + } + return true; +} + // Assumes a function `GetDefaultTypeOf(ComposedType)` that returns the default // type for a composed type (such as a ref type or a type with subtypes). 
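The comment on GetCastCompatibleType above states the refinement rule in words: ranks must match, dimensions must agree wherever both are static, and each refined dimension takes the more precise of the two. A standalone sketch of that rule, using -1 for a dynamic dimension (a simplified model, not the MLIR implementation):

// shape_refine_sketch.cc
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

using Shape = std::vector<int64_t>;
constexpr int64_t kDynamic = -1;  // stands in for '?' in tensor<2x?x?xf32>

std::optional<Shape> RefineShapes(const Shape &a, const Shape &b) {
  if (a.size() != b.size()) return std::nullopt;    // ranks must match
  Shape refined(a.size());
  for (size_t i = 0; i < a.size(); ++i) {
    if (a[i] != kDynamic && b[i] != kDynamic && a[i] != b[i])
      return std::nullopt;                          // static dimensions disagree
    refined[i] = (a[i] == kDynamic) ? b[i] : a[i];  // keep the more precise dim
  }
  return refined;
}

int main() {
  // e.g. tensor<2x?x?xf32> vs tensor<?x4x?xf32>  ->  tensor<2x4x?xf32>
  auto refined = RefineShapes({2, kDynamic, kDynamic}, {kDynamic, 4, kDynamic});
  if (refined)
    for (int64_t d : *refined) std::cout << d << " ";  // prints: 2 4 -1
  std::cout << "\n";
}
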
template diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index f93f6b657da..1d3ca0c4a60 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -272,6 +272,15 @@ class VariantType : public detail::TypeWithSubtypeImpl { static std::string getTypeName() { return "VariantType"; } }; +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a); + // Returns whether two arrays of Type are broadcast compatible. bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs); @@ -293,6 +302,10 @@ bool HasCompatibleElementTypes(Type lhs, Type rhs, // compatible. bool AreCastCompatible(ArrayRef types); +// Returns true if corresponding elements of lhs and rhs AreCastCompatible and +// lhs and rhs are the same length. +bool ArraysAreCastCompatible(ArrayRef lhs, ArrayRef rhs); + // If `ty` is a tensor type and its element type has subtypes, then returns a // new type of same shape but dropped subtypes for the element type. // Otherwise, if `ty` has subtypes, then returns corresponding type with dropped diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc new file mode 100644 index 00000000000..6a6a7574f29 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h" + +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h new file mode 100644 index 00000000000..039f211533c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h @@ -0,0 +1,26 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ + +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TFRT_OPS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td new file mode 100644 index 00000000000..fea9500b638 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td @@ -0,0 +1,61 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the operation definition file for TensorFlow operations with +// implementation available only in TFRT. + +#ifndef TFRT_OPS +#define TFRT_OPS + +include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" +include "mlir/IR/OpBase.td" + +def TF__JitFusedMatMulOp : TF_Op<"_JitFusedMatMul", [NoSideEffect, SameOperandsAndResultElementType]> { + let summary = [{ + MatMul operation with an output fusion compiled at runtime via MLIR codegen. + }]; + + let description = [{ +The inputs to the MatMul are specified by `a` and `b`. The series of operations +that follows is specified by the `fusion` attribute, which is a list of output +kernel names specified as strings (e.g. "BiasAdd"). They are performed in order, +where the (first) input to each op is the output of the preceding op. The first +input and the output of each fused_op must be of type T. + +Supported list of fusions is defined by ContractionOutputKernelBuilder +implementations. + +*WARN*: This is a TFRT only operations, and it does not exist in TF. This +operation is only added by the ContractionFusion pass. 
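A scalar model of the fusion attribute semantics described above, where each named output kernel is applied in order to the contraction result; the helper name and the kernel set below are illustrative only, not TFRT's actual ContractionOutputKernelBuilder API:

// fusion_semantics_sketch.cc
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Apply fused output kernels, in order, to one element of the MatMul result.
float ApplyFusion(float matmul_out, float bias, float leaky_alpha,
                  const std::vector<std::string> &fusion) {
  float v = matmul_out;
  for (const std::string &kernel : fusion) {
    if (kernel == "BiasAdd")        v += bias;
    else if (kernel == "Relu")      v = std::max(v, 0.0f);
    else if (kernel == "LeakyRelu") v = v > 0.0f ? v : leaky_alpha * v;
  }
  return v;
}

int main() {
  // fusion = ["BiasAdd", "LeakyRelu"] with alpha = 0.2:
  // (-2.0 + 0.5) = -1.5, then LeakyRelu gives -1.5 * 0.2 = -0.3.
  std::cout << ApplyFusion(-2.0f, 0.5f, 0.2f, {"BiasAdd", "LeakyRelu"}) << "\n";
}
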
+ }]; + + let arguments = (ins + TensorOf<[F32]>:$a, + TensorOf<[F32]>:$b, + Variadic>:$additional_args, + + DefaultValuedAttr:$transpose_a, + DefaultValuedAttr:$transpose_b, + DefaultValuedAttr:$fusion + ); + + let results = (outs + TensorOf<[F32]>:$product + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +#endif // TFRT_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc b/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc index 211866900aa..d2c2cecdfdd 100644 --- a/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/ops/mlir_local_var_op.cc @@ -21,7 +21,7 @@ namespace tensorflow { REGISTER_OP("MlirLocalVarOp") .Output("resource: resource") .SetShapeFn(shape_inference::UnknownShape) - .Doc(R"(Creates a handle to a in-scope variable. + .Doc(R"(Creates a handle to an in-scope variable. Used by internal passes for temporary representation of local state, which will be eventually removed.)"); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/BUILD index daa583bed0e..63d01bf355e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index 05d34eb0755..6654341ab42 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -285,7 +285,7 @@ func @empty_island_multiple_data_results(%arg0: tensor<*xf32>, %arg1: tensor<*xi // and certain tf_executor ops are added correctly. 
// CHECK: %[[CONTROL:[^ ,]*]] = tf_executor.island wraps "tf.Print" -// CHECK: tf_executor.NextIteration.Sink [{{.*}}] {{.*}}, %[[CONTROL]] +// CHECK: tf_executor.NextIteration.Sink[{{.*}}] {{.*}}, %[[CONTROL]] func @next_iteration_sink_control_input() { tf_executor.graph { %source:3 = tf_executor.NextIteration.Source : tensor<*xi32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 50486909694..e77dd365abf 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -568,6 +568,14 @@ func @testSelectElseUnranked(%arg0: tensor<3xi1>, %arg1: tensor<3x2xf16>, %arg2: return %0: tensor<*xf16> } +// CHECK-LABEL: testTileMultiplesAllOnes +func @testTileMultiplesAllOnes(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %cst = constant dense <[1, 1]> : tensor<2xi32> + // CHECK: return %arg0 + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + return %0: tensor<2x3xf32> +} + // CHECK-LABEL: testLogicalNotOfEqual func @testLogicalNotOfEqual(%arg0: tensor<8x16xf32>, %arg1: tensor<8x16xf32>) -> tensor<8x16xi1> { %0 = "tf.Equal"(%arg0, %arg1) : (tensor<8x16xf32>, tensor<8x16xf32>) -> tensor<8x16xi1> @@ -967,6 +975,65 @@ func @foldIfRegionMismatchedTypes(%arg0: tensor, %arg1: tensor, %a return %0 : tensor<1xf32> } +// CHECK-LABEL: func @eliminatePassThroughIfRegion( +// CHECK-SAME: %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor +func @eliminatePassThroughIfRegion(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { + // CHECK: %[[PRED:.*]] = "tf._SomeOp"() : () -> tensor + %pred = "tf._SomeOp"() : () -> tensor + // CHECK: %[[IF_OUTPUT:.*]] = "tf.IfRegion"(%[[PRED]]) ( { + // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[ARG0]], %[[ARG1]]) + // CHECK: "tf.Yield"(%[[MUL]]) : (tensor) + // CHECK: }, { + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ARG0]], %[[ARG1]]) + // CHECK: "tf.Yield"(%[[SUB]]) : (tensor) + // CHECK: }) {is_stateless = true} : (tensor) -> tensor + %0:4 = "tf.IfRegion"(%pred) ({ + %true_value = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%arg1, %arg2, %true_value, %arg2) : (tensor, tensor, tensor, tensor) -> () + }, { + %false_value = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%arg1, %arg2, %false_value, %arg2) : (tensor, tensor, tensor, tensor) -> () + }) { is_stateless = true}: (tensor) -> (tensor, tensor, tensor, tensor) + // CHECK: "tf._SomeOp"(%[[ARG2]], %[[ARG1]]) : (tensor, tensor) -> () + "tf._SomeOp"(%0#1, %0#0) : (tensor, tensor) -> () + // CHECK: "tf._SomeOp"(%[[ARG2]], %[[IF_OUTPUT]]) : (tensor, tensor) -> () + "tf._SomeOp"(%0#3, %0#2) : (tensor, tensor) -> () + // CHECK: return %[[IF_OUTPUT]] : tensor + return %0#2 : tensor +} + +// CHECK-LABEL: func @eliminatePassThroughCaseRegion( +// CHECK-SAME: %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor, %[[ARG2:.*]]: tensor +func @eliminatePassThroughCaseRegion(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor) { + // CHECK: %[[INDEX:.*]] = "tf._SomeOp"() : () -> tensor + %index = "tf._SomeOp"() : () -> tensor + // CHECK: %[[CASE_OUTPUT:.*]] = "tf.CaseRegion"(%[[INDEX]]) ( { + // CHECK: %[[MUL:.*]] = "tf.Mul"(%[[ARG0]], %[[ARG1]]) + // CHECK: "tf.Yield"(%[[MUL]]) : (tensor) + // CHECK: }, { + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[ARG0]], %[[ARG1]]) + // CHECK: "tf.Yield"(%[[SUB]]) : (tensor) + // CHECK: }, { + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[ARG0]], %[[ARG1]]) + // CHECK: 
"tf.Yield"(%[[ADD]]) : (tensor) + // CHECK: }) {is_stateless = true} : (tensor) -> tensor + %0:3 = "tf.CaseRegion"(%index) ({ + %mul = "tf.Mul"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%arg1, %mul, %arg2) : (tensor, tensor, tensor) -> () + }, { + %sub = "tf.Sub"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%arg1, %sub, %arg2) : (tensor, tensor, tensor) -> () + }, { + %add = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor + "tf.Yield"(%arg1, %add, %arg2) : (tensor, tensor, tensor) -> () + }) { is_stateless = true}: (tensor) -> (tensor, tensor, tensor) + // CHECK: "tf._SomeOp"(%[[ARG2]], %[[ARG1]]) : (tensor, tensor) -> () + "tf._SomeOp"(%0#2, %0#0) : (tensor, tensor) -> () + // CHECK: return %[[CASE_OUTPUT]] : tensor + return %0#1 : tensor +} + + // CHECK-LABEL: foldCase func @foldCase(%arg0: tensor, %arg1: tensor) -> (tensor) { %2 = constant dense<1> : tensor @@ -1209,3 +1276,18 @@ func @testWhileDropOutputShapes(tensor<*xf32>) -> (tensor<*xf32>) { return %1 : tensor<*xf32> } + +// CHECK-LABEL: testNMSV3ToNMSV4 +func @testNMSV3ToNMSV4(%arg0: tensor<3x4xf32>, %arg1: tensor<3xf32>, %arg2: tensor, %arg3: tensor) -> tensor<2xi32> { + %max_size = constant dense<2> : tensor + // CHECK: "tf.NonMaxSuppressionV4" + %0 = "tf.NonMaxSuppressionV3"(%arg0, %arg1, %max_size, %arg2, %arg3): (tensor<3x4xf32>, tensor<3xf32>, tensor, tensor, tensor) -> (tensor<2xi32>) + return %0 : tensor<2xi32> +} + +// CHECK-LABEL: testFusedBatchNormToBatchNormV3 +func @testFusedBatchNormToBatchNormV3(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { + // CHECK: "tf.FusedBatchNormV3" + %0:5 = "tf.FusedBatchNorm"(%arg0, %arg1, %arg2, %arg3, %arg4): (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32> ) + return %0#0 : tensor<8x8x8x8xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD new file mode 100644 index 00000000000..b8ab6ffeeb9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD @@ -0,0 +1,26 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +licenses(["notice"]) + +glob_lit_tests( + data = [ + ":test_utilities", + ], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "mlir", + "pbtxt", + ], +) + +# Bundle together all of the test utilities that are used by tests. 
+filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir:tf-mlir-translate", + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", + ], +) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/add.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/add.mlir new file mode 100644 index 00000000000..84e3f528a5c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/add.mlir @@ -0,0 +1,38 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=: -emit-return-tuple | FileCheck %s +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=: -emit-use-tuple-args -emit-return-tuple | FileCheck -check-prefix=TUPLE-ARGS %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[], [[ARG1:.*]]: f32[]) -> (f32[]) { +// CHECK-NEXT: %[[ARG0]] = f32[] parameter(0) +// CHECK-NEXT: %[[ARG1]] = f32[] parameter(1) +// CHECK-NEXT: [[ADD:%.*]] = f32[] add(f32[] %[[ARG0]], f32[] %[[ARG1]]) +// CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[]) tuple(f32[] [[ADD]]) +// CHECK-NEXT: } + +// CHECK: // InputMapping {0, 1} +// CHECK-NEXT: // XlaInputShape f32[] +// CHECK-NEXT: // XlaInputShape f32[] +// CHECK-NEXT: // XlaOutputShape (f32[]) +// CHECK-NEXT: // XlaOutputDescription type=float shape=() + + +// TUPLE-ARGS-LABEL: HloModule main +// TUPLE-ARGS: ENTRY %main.{{[0-9]+}} ([[ARG_TUPLE:.*]]: (f32[], f32[])) -> (f32[]) { +// TUPLE-ARGS: %[[ARG_TUPLE]] = (f32[], f32[]) parameter(0) +// TUPLE-ARGS: [[ARG0:%.*]] = f32[] get-tuple-element((f32[], f32[]) %[[ARG_TUPLE]]), index=0 +// TUPLE-ARGS: [[ARG1:%.*]] = f32[] get-tuple-element((f32[], f32[]) %[[ARG_TUPLE]]), index=1 +// TUPLE-ARGS: [[ADD:%.*]] = f32[] add(f32[] [[ARG0]], f32[] [[ARG1]]) +// TUPLE-ARGS: ROOT %tuple.{{[0-9]+}} = (f32[]) tuple(f32[] [[ADD]]) +// TUPLE-ARGS: } + +// TUPLE-ARGS: // InputMapping {0, 1} +// TUPLE-ARGS-NEXT: // XlaInputShape (f32[], f32[]) +// TUPLE-ARGS-NEXT: // XlaOutputShape (f32[]) +// TUPLE-ARGS-NEXT: // XlaOutputDescription type=float shape=() diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding-invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding-invalid.mlir new file mode 100644 index 00000000000..5347037d7cf --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding-invalid.mlir @@ -0,0 +1,9 @@ +// RUN: not tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=128,10 -emit-use-tuple-args -emit-return-tuple 2>&1 | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<128x8xf32> {mhlo.sharding = "bad_sharding"}) { + return + } +} + +// CHECK: failed to parse argument sharding 0 'bad_sharding' diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding.mlir new file mode 100644 index 00000000000..7154919c3d1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/argument-sharding.mlir @@ -0,0 +1,38 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=128,10:10,1024:128,1024 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes 
{tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, %arg1: tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<128x1024xf32> {mhlo.sharding = ""}) { + return + } +} + +// The following xla::OpSharding protos are used: +// Serialized string: +// "\08\03\1A\02\01\02\22\02\00\01" +// Proto debug string: +// type: OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 2 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// +// Serialized string: +// "\08\01\1A\01\01\22\01\00" +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// +// Serialized string: +// "" +// Proto debug string (empty but would equivalent to): +// type: REPLICATED + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG_TUPLE:.*]]: (f32[128,10], f32[10,1024], f32[128,1024])) -> () { +// CHECK: %[[ARG_TUPLE]] = (f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) parameter(0) +// CHECK-SAME: sharding={ +// CHECK-SAME: {devices=[1,2]0,1} +// CHECK-SAME: {maximal device=0} +// CHECK-SAME: {replicated} +// CHECK-SAME: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding-hook.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding-hook.mlir new file mode 100644 index 00000000000..c745fbc0744 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding-hook.mlir @@ -0,0 +1,16 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=: -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main() -> (tensor<0xi32>, tensor<0xi32>) { + %0 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> + %r0, %r1 = "tf.BroadcastGradientArgs"(%0, %0) {T = i32} : (tensor<0xi32>, tensor<0xi32>) -> (tensor<0xi32>, tensor<0xi32>) + return %r0, %r1 : tensor<0xi32>, tensor<0xi32> + } +} + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9+]}} ([[ARG_TUPLE:.*]]: ()) -> (s32[0], s32[0]) { +// CHECK: %[[ARG_TUPLE]] = () parameter(0) +// CHECK: [[CONSTANT:%.*]] = s32[0]{0} constant({}) +// CHECK: ROOT %tuple.{{[0-9]+}} = (s32[0]{0}, s32[0]{0}) tuple(s32[0]{0} [[CONSTANT]], s32[0]{0} [[CONSTANT]]) +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding.mlir new file mode 100644 index 00000000000..e54ff79e5e4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/constant-folding.mlir @@ -0,0 +1,23 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=10,19:19,10 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<10x19xf32>, %arg1: tensor<19x10xf32> {mhlo.is_same_data_across_replicas}) -> tensor<10x19xf32> { + %0 = "tf.Shape"(%arg0) : (tensor<10x19xf32>) -> tensor<2xi64> + %1 = "tf.Reshape"(%arg1, %0) : (tensor<19x10xf32>, tensor<2xi64>) -> tensor<10x19xf32> + return %1 : tensor<10x19xf32> + } +} + +// Tests that foldable ops are constant-folded to enable legalization of ops +// that require compile time constant operand. +// "tf.Shape" can only be folded away after shape inference. tf.Reshape can only +// be lowered when tf.Shape is folded into a constant. 
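The comment above describes an ordering constraint: tf.Shape only folds once shape inference has made its operand's shape static, and only then does tf.Reshape have the compile-time-constant shape operand it needs. A toy standalone version of that folding step (not the MLIR constant-folding hook):

// shape_fold_sketch.cc
#include <cstdint>
#include <iostream>
#include <vector>

struct Tensor {
  std::vector<int64_t> shape;  // fully static after shape inference
};

// A tf.Shape-like query over a statically shaped value is itself a constant.
std::vector<int64_t> FoldShape(const Tensor &t) { return t.shape; }

int main() {
  Tensor arg0{{10, 19}};
  std::vector<int64_t> target = FoldShape(arg0);  // folds to the constant [10, 19]
  Tensor reshaped{target};                        // reshape target is now static
  std::cout << reshaped.shape[0] << "x" << reshaped.shape[1] << "\n";  // 10x19
}
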
+ +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG_TUPLE:.*]]: (f32[10,19], f32[19,10])) -> (f32[10,19]) { +// CHECK: %[[ARG_TUPLE]] = (f32[10,19]{1,0}, f32[19,10]{1,0}) parameter(0), parameter_replication={false,true} +// CHECK: [[ARG0:%.*]] = f32[10,19]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %[[ARG_TUPLE]]), index=0 +// CHECK: [[ARG1:%.*]] = f32[19,10]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %[[ARG_TUPLE]]), index=1 +// CHECK: [[RESHAPE:%.*]] = f32[10,19]{1,0} reshape(f32[19,10]{1,0} [[ARG1]]) +// CHECK: ROOT %tuple.{{[0-9]+}} = (f32[10,19]{1,0}) tuple(f32[10,19]{1,0} [[RESHAPE]]) +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.mlir new file mode 100644 index 00000000000..3d1a34b932d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.mlir @@ -0,0 +1,27 @@ +// RUN: tf-mlir-translate -mlir-tf-graph-to-hlo-text %s -tf-input-shapes=2:2 -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-xla-input-types=parameter,resource -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 511 : i32}} { + func @main(%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) { + tf_executor.graph { + %control = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg0) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + tf_executor.fetch %control : !tf_executor.control + } + return + } +} + +// Tests a conversion from Graph (tf_executor dialect MLIR) to MLIR with +// resource arguments. + +// CHECK-LABEL: HloModule main.{{[0-9]+}}, input_output_alias={ {0}: (1, {}, may-alias) } +// CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[2], [[ARG1:.*]]: f32[2]) -> (f32[2]) { +// CHECK-NEXT: %[[ARG1]] = f32[2]{0} parameter(1) +// CHECK-NEXT: %[[ARG0]] = f32[2]{0} parameter(0) +// CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[2]{0}) tuple(f32[2]{0} %[[ARG0]]) +// CHECK-NEXT: } + +// CHECK: // InputMapping {0, 1} +// CHECK-NEXT: // XlaInputShape f32[2] +// CHECK-NEXT: // XlaInputShape f32[2] +// CHECK-NEXT: // XlaOutputShape (f32[2]) +// CHECK-NEXT: // ResourceUpdate input_index=1 type=float shape=(2) modified diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.pbtxt new file mode 100644 index 00000000000..5fb90b1bce0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph-resource.pbtxt @@ -0,0 +1,66 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function | tf-mlir-translate -mlir-tf-graph-to-hlo-text -tf-input-shapes=2:2 -tf-input-data-types=DT_FLOAT,DT_FLOAT -tf-xla-input-types=parameter,resource -emit-return-tuple | FileCheck %s + +node { + name: "arg0" + op: "_Arg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "arg1" + op: "_Arg" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "index" + value { + i: 1 + } + } +} +node { + name: "assign_variable" + op: "AssignVariableOp" + input: "arg1" + input: "arg0" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } +} +library { +} +versions { + producer: 511 +} + +# Tests a conversion from Graph to MLIR with resource arguments. 
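A rough standalone model of the aliasing in the HLO text above: the variable arrives as parameter 1, the assigned value as parameter 0, and the compiled module reports the write as output 0 may-aliasing input 1 plus a ResourceUpdate entry. The struct below is only an illustration, not XLA's API:

// resource_alias_sketch.cc
#include <iostream>
#include <vector>

struct ResourceUpdate {
  int input_index;  // parameter that holds the variable
  bool modified;    // whether the computation wrote to it
};

int main() {
  std::vector<float> new_value = {1.0f, 2.0f};  // parameter 0
  std::vector<float> variable = {0.0f, 0.0f};   // parameter 1 (the resource)
  variable = new_value;                         // AssignVariableOp(arg1, arg0)

  // Mirrors "ResourceUpdate input_index=1 type=float shape=(2) modified".
  ResourceUpdate update{/*input_index=*/1, /*modified=*/true};
  std::cout << "input_index=" << update.input_index
            << " modified=" << std::boolalpha << update.modified << "\n";
}
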
+ +# CHECK-LABEL: HloModule main.{{[0-9]+}}, input_output_alias={ {0}: (1, {}, may-alias) } +# CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[2], [[ARG1:.*]]: f32[2]) -> (f32[2]) { +# CHECK-NEXT: %[[ARG1]] = f32[2]{0} parameter(1) +# CHECK-NEXT: %[[ARG0]] = f32[2]{0} parameter(0) +# CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[2]{0}) tuple(f32[2]{0} %[[ARG0]]) +# CHECK-NEXT: } + +# CHECK: // InputMapping {0, 1} +# CHECK-NEXT: // XlaInputShape f32[2] +# CHECK-NEXT: // XlaInputShape f32[2] +# CHECK-NEXT: // XlaOutputShape (f32[2]) +# CHECK-NEXT: // ResourceUpdate input_index=1 type=float shape=(2) modified diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph.pbtxt new file mode 100644 index 00000000000..f1f7c6434eb --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/graph.pbtxt @@ -0,0 +1,47 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function | tf-mlir-translate -mlir-tf-graph-to-hlo-text -tf-input-shapes='' -tf-input-data-types=DT_FLOAT -emit-return-tuple | FileCheck %s + +node { + name: "arg" + op: "_Arg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +node { + name: "retval" + op: "_Retval" + input: "arg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +versions { + producer: 511 +} + +# Verify that conversion from Graph to MLIR and empty shape representation +# function is successful. + +# CHECK-LABEL: HloModule main +# CHECK: ENTRY %main.{{[0-9]+}} ([[ARG0:.*]]: f32[]) -> (f32[]) { +# CHECK-NEXT: %[[ARG0]] = f32[] parameter(0) +# CHECK-NEXT: ROOT %tuple.{{[0-9]+}} = (f32[]) tuple(f32[] %[[ARG0]]) +# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/mlir-module-serialized-str-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/mlir-module-serialized-str-attr.mlir new file mode 100644 index 00000000000..b68f177b183 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/mlir-module-serialized-str-attr.mlir @@ -0,0 +1,10 @@ +// RUN: tf-mlir-translate -mlir-tf-mlir-to-str-attr %s | FileCheck %s + +module attributes {tf.versions = {producer = 888 : i32}} { + func @main(%arg0: tensor) -> tensor { + %0 = "tf.Identity"(%arg0) : (tensor) -> tensor loc(unknown) + return %0 : tensor loc(unknown) + } loc(unknown) +} loc(unknown) + +// CHECK: "\0A\0Amodule attributes {tf.versions = {producer = 888 : i32}} {\0A func @main(%arg0: tensor) -> tensor {\0A %0 = \22tf.Identity\22(%arg0) : (tensor) -> tensor loc(unknown)\0A return %0 : tensor loc(unknown)\0A } loc(unknown)\0A} loc(unknown)" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/result-sharding.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/result-sharding.mlir new file mode 100644 index 00000000000..c9c02ba2588 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/result-sharding.mlir @@ -0,0 +1,39 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=128,10:10,1024:128,1024 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 351 : i32}} { + func @main(%arg0: tensor<128x10xf32>, %arg1: tensor<10x1024xf32>, %arg2: tensor<128x1024xf32>) -> (tensor<128x10xf32> {mhlo.sharding = 
"\08\03\1A\02\01\02\22\02\00\01"}, tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor<128x1024xf32> {mhlo.sharding = ""}) { + return %arg0, %arg1, %arg2 : tensor<128x10xf32>, tensor<10x1024xf32>, tensor<128x1024xf32> + } +} + +// The following xla::OpSharding protos are used: +// Serialized string: +// "\08\03\1A\02\01\02\22\02\00\01" +// Proto debug string: +// type: OTHER +// tile_assignment_dimensions: 1 +// tile_assignment_dimensions: 2 +// tile_assignment_devices: 0 +// tile_assignment_devices: 1 +// +// Serialized string: +// "\08\01\1A\01\01\22\01\00" +// Proto debug string: +// type: MAXIMAL +// tile_assignment_dimensions: 1 +// tile_assignment_devices: 0 +// +// Serialized string: +// "" +// Proto debug string (empty but would equivalent to): +// type: REPLICATED + +// CHECK-LABEL: HloModule main +// CHECK: ENTRY %main.{{[0-9]+}} +// CHECK-SAME: (arg_tuple.{{[0-9]+}}: (f32[128,10], f32[10,1024], f32[128,1024])) -> (f32[128,10], f32[10,1024], f32[128,1024]) { +// CHECK: ROOT %tuple.{{[0-9]+}} +// CHECK-SAME: sharding={ +// CHECK-SAME: {devices=[1,2]0,1} +// CHECK-SAME: {maximal device=0} +// CHECK-SAME: {replicated} +// CHECK-SAME: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr-invalid.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr-invalid.mlir new file mode 100644 index 00000000000..ced11f3a083 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr-invalid.mlir @@ -0,0 +1,5 @@ +// RUN: not tf-mlir-translate -mlir-tf-str-attr-to-mlir %s 2>&1 | FileCheck %s + +"totally @invalid MLIR module {here} <-" + +// CHECK: Invalid argument: could not parse MLIR module-:1:1: error: custom op 'totally' is unknown diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr.mlir new file mode 100644 index 00000000000..9a0e1dc38c8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/serialized-mlir-module-str-attr.mlir @@ -0,0 +1,15 @@ +// RUN: tf-mlir-translate -mlir-tf-str-attr-to-mlir %s -mlir-print-debuginfo | FileCheck %s + +"\0A\0Amodule attributes {tf.versions = {producer = 888 : i32}} {\0A func @main(%arg0: tensor) -> tensor {\0A %0 = \22tf.Identity\22(%arg0) : (tensor) -> tensor loc(unknown)\0A return %0 : tensor loc(unknown)\0A } loc(unknown)\0A} loc(unknown)" + +// Test simple serialized computation consisting of a function named `main` +// with a tf.Identity op forwarding the function single argument to the function +// single result. 
+ +// CHECK-LABEL: module +// CHECK-SAME: attributes {tf.versions = {producer = 888 : i32}} { +// CHECK-NEXT: func @main([[ARG0:%.+]]: tensor) -> tensor { +// CHECK-NEXT: [[IDENTITY:%.+]] = "tf.Identity"([[ARG0]]) : (tensor) -> tensor loc(unknown) +// CHECK-NEXT: return [[IDENTITY]] : tensor loc(unknown) +// CHECK-NEXT: } loc(unknown) +// CHECK-NEXT: } loc(unknown) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference-after-legalization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference-after-legalization.mlir new file mode 100644 index 00000000000..55bdea5dd36 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference-after-legalization.mlir @@ -0,0 +1,11 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=8,16,16,64:64 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<8x16x16x64xbf16>, %arg1: tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) { + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg1, %arg1, %arg1) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) + return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32> + } +} + +// CHECK-LABEL: HloModule main +// CHECK: -> (bf16[8,16,16,64], f32[64], f32[64], f32[64], f32[64], f32[0]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference.mlir new file mode 100644 index 00000000000..f9eca514da3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/shape-inference.mlir @@ -0,0 +1,11 @@ +// RUN: tf-mlir-translate -mlir-tf-to-hlo-text %s -tf-input-shapes=10,17:17,19 -emit-use-tuple-args -emit-return-tuple | FileCheck %s + +module attributes {tf.versions = {producer = 179 : i32}} { + func @main(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor { + %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false} : (tensor<*xf32>, tensor) -> tensor + return %0 : tensor + } +} + +// CHECK-LABEL: HloModule main +// CHECK: (arg_tuple.{{[0-9]+}}: (f32[10,17], f32[17,19])) -> (f32[10,19]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index f114d1724f2..779065b94d5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -492,3 +492,22 @@ func @DontFoldTile() -> (tensor<8x10000xi32>) { return %3 : tensor<8x10000xi32> } // LINT.ThenChange(../transforms/constant_fold.cc:folding-policy) + +func @fold_conv() -> tensor<1x520x520x1xf32> { + %0 = "tf.Const"() {value = dense<0.111111112> : tensor<3x3x1x1xf32>} : () -> tensor<3x3x1x1xf32> + %1 = "tf.Const"() {value = dense<1.000000e+00> : tensor<1x520x520x1xf32>} : () -> tensor<1x520x520x1xf32> + %2 = "tf.DepthwiseConv2dNative"(%1, %0) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = 
[], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x520x520x1xf32>, tensor<3x3x1x1xf32>) -> tensor<1x520x520x1xf32> + return %2 : tensor<1x520x520x1xf32> + + // CHECK: tf.Const + // CHECK-NOT: tf.DepthwiseConv2dNative +} + +// CHECK-LABEL: DontFoldNoConstantFold +func @DontFoldNoConstantFold() -> tensor<8xf32> { + %0 = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> tensor<1xi32> + %1 = "tf.Const"() {value = dense<[2, 1]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: tf.StatelessRandomUniform + %2 = "tf.StatelessRandomUniform"(%0, %1) : (tensor<1xi32>, tensor<2xi32>) -> tensor<8xf32> + return %2 : tensor<8xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/contraction_fusion.mlir b/tensorflow/compiler/mlir/tensorflow/tests/contraction_fusion.mlir new file mode 100644 index 00000000000..b12f50ad525 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/contraction_fusion.mlir @@ -0,0 +1,37 @@ +// RUN: tf-opt %s -tf-contraction-fusion | FileCheck %s + +// CHECK-LABEL: matmulBiasAdd +func @matmulBiasAdd(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> tensor<8x64xf32> { + // CHECK: %[[FUSED:.*]] = "tf._JitFusedMatMul"(%arg1, %arg2, %arg0) + // CHECK-SAME: fusion = ["BiasAdd"] + // CHECK-SAME: transpose_a = false, transpose_b = false + %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32> + %4 = "tf.BiasAdd"(%3, %arg0) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32> + // CHECK: return %[[FUSED]] + return %4 : tensor<8x64xf32> +} + +// CHECK-LABEL: matmulBiasAddRelu +func @matmulBiasAddRelu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> tensor<8x64xf32> { + // CHECK: %[[FUSED:.*]] = "tf._JitFusedMatMul"(%arg1, %arg2, %arg0) + // CHECK-SAME: fusion = ["BiasAdd", "Relu"] + // CHECK-SAME: transpose_a = false, transpose_b = false + %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32> + %4 = "tf.BiasAdd"(%3, %arg0) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32> + %5 = "tf.Relu"(%4) : (tensor<8x64xf32>) -> tensor<8x64xf32> + // CHECK: return %[[FUSED]] + return %5 : tensor<8x64xf32> +} + +// CHECK-LABEL: matmulBiasAddLeakyRelu +func @matmulBiasAddLeakyRelu(%arg0: tensor<64xf32>, %arg1: tensor<8x32xf32>, %arg2: tensor<32x64xf32>) -> tensor<8x64xf32> { + // CHECK: %[[FUSED:.*]] = "tf._JitFusedMatMul"(%arg1, %arg2, %arg0) + // CHECK-SAME: alpha = 2.000000e-01 : f32 + // CHECK-SAME: fusion = ["BiasAdd", "LeakyRelu"] + // CHECK-SAME: transpose_a = false, transpose_b = false + %3 = "tf.MatMul"(%arg1, %arg2) {transpose_a = false, transpose_b = false} : (tensor<8x32xf32>, tensor<32x64xf32>) -> tensor<8x64xf32> + %4 = "tf.BiasAdd"(%3, %arg0) {data_format = "NHWC"} : (tensor<8x64xf32>, tensor<64xf32>) -> tensor<8x64xf32> + %5 = "tf.LeakyRelu"(%4) { alpha = 0.2 : f32 } : (tensor<8x64xf32>) -> tensor<8x64xf32> + // CHECK: return %[[FUSED]] + return %5 : tensor<8x64xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir index ff4dbf41221..e6a92a520f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir @@ -101,7 +101,7 @@ func @decompose_resource_apply_momentum_non_nesterov(%arg0: 
tensor, %arg1: // CHECK: [[ACCUM:%.*]] = "tf.ReadVariableOp"([[ACCUM_HANDLE]]) // CHECK: [[ACCUM_MOMENTUM:%.*]] = "tf.Mul"([[ACCUM]], [[MOMENTUM]]) - // CHECK: [[ACCUM_NEW:%.*]] = "tf.Add"([[ACCUM_MOMENTUM]], [[GRAD]]) + // CHECK: [[ACCUM_NEW:%.*]] = "tf.AddV2"([[ACCUM_MOMENTUM]], [[GRAD]]) // CHECK: "tf.AssignVariableOp"([[ACCUM_HANDLE]], [[ACCUM_NEW]]) // CHECK: [[ACCUM_NEW_LR:%.*]] = "tf.Mul"([[ACCUM_NEW]], [[LR]]) // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) @@ -127,12 +127,12 @@ func @decompose_resource_apply_momentum_nesterov(%arg0: tensor, %arg1: tens // CHECK: [[ACCUM:%.*]] = "tf.ReadVariableOp"([[ACCUM_HANDLE]]) // CHECK: [[ACCUM_MOMENTUM:%.*]] = "tf.Mul"([[ACCUM]], [[MOMENTUM]]) - // CHECK: [[ACCUM_NEW:%.*]] = "tf.Add"([[ACCUM_MOMENTUM]], [[GRAD]]) + // CHECK: [[ACCUM_NEW:%.*]] = "tf.AddV2"([[ACCUM_MOMENTUM]], [[GRAD]]) // CHECK: "tf.AssignVariableOp"([[ACCUM_HANDLE]], [[ACCUM_NEW]]) // CHECK: [[GRAD_LR:%.*]] = "tf.Mul"([[GRAD]], [[LR]]) // CHECK: [[MOMENTUM_LR:%.*]] = "tf.Mul"([[MOMENTUM]], [[LR]]) // CHECK: [[ACCUM_NEW_MOMENTUM_LR:%.*]] = "tf.Mul"([[ACCUM_NEW]], [[MOMENTUM_LR]]) - // CHECK: [[DELTA:%.*]] = "tf.Add"([[GRAD_LR]], [[ACCUM_NEW_MOMENTUM_LR]]) + // CHECK: [[DELTA:%.*]] = "tf.AddV2"([[GRAD_LR]], [[ACCUM_NEW_MOMENTUM_LR]]) // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) // CHECK: [[VAR_NEW:%.*]] = "tf.Sub"([[VAR]], [[DELTA]]) // CHECK: "tf.AssignVariableOp"([[VAR_HANDLE]], [[VAR_NEW]]) @@ -231,6 +231,31 @@ func @decompose_resource_apply_adagradv2(%arg0: tensor, %arg1: tensor, return } +// ----- +// CHECK-LABEL: func @decompose_resource_apply_adagrad +// CHECK-SAME: (%[[LR:.*]]: tensor, %[[GRAD:.*]]: tensor) +func @decompose_resource_apply_adagrad(%arg0: tensor, %arg1: tensor) -> () { + + // CHECK: %[[VAR_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + // CHECK: %[[ACCUM_HANDLE:.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + // CHECK: %[[GRAD_SQUARE:.*]] = "tf.Mul"(%[[GRAD]], %[[GRAD]]) : (tensor, tensor) -> tensor + // CHECK: %[[ACCUM:.*]] = "tf.ReadVariableOp"(%[[ACCUM_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> + // CHECK: %[[ACCUM_NEW:.*]] = "tf.AddV2"(%[[ACCUM]], %[[GRAD_SQUARE]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> + // CHECK: %[[LR_MULTIPLY:.*]] = "tf.Mul"(%[[LR]], %[[GRAD]]) : (tensor, tensor) -> tensor + // CHECK: %[[SQRT:.*]] = "tf.Sqrt"(%[[ACCUM_NEW]]) : (tensor<*xf32>) -> tensor<*xf32> + // CHECK: %[[DIV:.*]] = "tf.Div"(%[[LR_MULTIPLY]], %[[SQRT]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> + // CHECK: %[[VAR:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> + // CHECK: %[[VAR_NEW:.*]] = "tf.Sub"(%[[VAR]], %[[DIV]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + // CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + // CHECK: "tf.AssignVariableOp"(%[[ACCUM_HANDLE]], %[[ACCUM_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + + "tf.ResourceApplyAdagrad"(%0, %1, %arg0, %arg1) {update_slots = true, use_locking = true} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor) -> () + + return +} + // ----- // Tests that composite tf.ResourceApplyAdam (non-Nesterov) operation is @@ -388,14 +413,14 @@ func 
@decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens // CHECK: [[GRAD_SUB:%.*]] = "tf.Mul"([[GRADSQ]], [[SB]]) // CHECK: [[MS:%.*]] = "tf.ReadVariableOp"([[MS_HANDLE]]) // CHECK: [[MS_RHO:%.*]] = "tf.Mul"([[MS]], [[RHO]]) - // CHECK: [[MS_NEW:%.*]] = "tf.Add"([[GRAD_SUB]], [[MS_RHO]]) + // CHECK: [[MS_NEW:%.*]] = "tf.AddV2"([[GRAD_SUB]], [[MS_RHO]]) // CHECK: "tf.AssignVariableOp"([[MS_HANDLE]], [[MS_NEW]]) // CHECK: [[SUB_RHO:%.*]] = "tf.Sub"([[ONE]], [[RHO]]) // CHECK: [[SUB_GRAD:%.*]] = "tf.Mul"([[GRAD]], [[SUB_RHO]]) // CHECK: [[MG:%.*]] = "tf.ReadVariableOp"([[MG_HANDLE]]) // CHECK: [[MG_RHO:%.*]] = "tf.Mul"([[MG]], [[RHO]]) - // CHECK: [[MG_NEW:%.*]] = "tf.Add"([[SUB_GRAD]], [[MG_RHO]]) + // CHECK: [[MG_NEW:%.*]] = "tf.AddV2"([[SUB_GRAD]], [[MG_RHO]]) // CHECK: "tf.AssignVariableOp"([[MG_HANDLE]], [[MG_NEW]]) // CHECK: [[MOM:%.*]] = "tf.ReadVariableOp"([[MOM_HANDLE]]) @@ -403,11 +428,11 @@ func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens // CHECK: [[LR_GRAD:%.*]] = "tf.Mul"([[LR]], [[GRAD]]) // CHECK: [[MG_MG:%.*]] = "tf.Mul"([[MG_NEW]], [[MG_NEW]]) - // CHECK: [[MG_NEW:%.*]] = "tf.Add"([[MG_MG]], [[EPSILON]]) + // CHECK: [[MG_NEW:%.*]] = "tf.AddV2"([[MG_MG]], [[EPSILON]]) // CHECK: [[MG_SUB:%.*]] = "tf.Sub"([[MS_NEW]], [[MG_NEW]]) // CHECK: [[MG_SQRT:%.*]] = "tf.Sqrt"([[MG_SUB]]) // CHECK: [[MOM_DIV:%.*]] = "tf.Div"([[LR_GRAD]], [[MG_SQRT]]) - // CHECK: [[MOM_NEW:%.*]] = "tf.Add"([[MOM_MOM]], [[MOM_DIV]]) + // CHECK: [[MOM_NEW:%.*]] = "tf.AddV2"([[MOM_MOM]], [[MOM_DIV]]) // CHECK: [[VAR:%.*]] = "tf.ReadVariableOp"([[VAR_HANDLE]]) // CHECK: [[VAR_NEW:%.*]] = "tf.Sub"([[VAR]], [[MOM_NEW]]) @@ -416,6 +441,33 @@ func @decompose_resource_apply_centered_RMS_prop(%arg0: tensor, %arg1: tens "tf.ResourceApplyCenteredRMSProp"(%0, %1, %2, %3, %arg4, %arg5, %arg6, %arg7, %arg8) {use_locking = false} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor, tensor, tensor, tensor) -> () return } +// ----- +// CHECK-LABEL: func @decompose_resource_apply_RMS_prop +// CHECK-SAME: (%[[VAR_HANDLE:.*]]: tensor<*x!tf.resource>, %[[MS_HANDLE:.*]]: tensor<*x!tf.resource>, %[[MOM_HANDLE:.*]]: tensor<*x!tf.resource>, +// CHECK-SAME: %[[LR:.*]]: tensor, %[[RHO:.*]]: tensor, %[[MOMENTUM:.*]]: tensor, %[[EPSILON:.*]]: tensor, %[[GRAD:.*]]: tensor) +func @decompose_resource_apply_RMS_prop(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>, %arg2: tensor<*x!tf.resource>, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor) -> () { +// CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor +// CHECK: %[[MS:.*]] = "tf.ReadVariableOp"(%[[MS_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> +// CHECK: %[[MS_RHO:.*]] = "tf.Mul"(%[[MS]], %[[RHO]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: %[[GRAD_SQUARE:.*]] = "tf.Square"(%[[GRAD]]) : (tensor) -> tensor +// CHECK: %[[ONE_RHO:.*]] = "tf.Sub"(%[[ONE]], %[[RHO]]) : (tensor, tensor) -> tensor +// CHECK: %[[MUL:.*]] = "tf.Mul"(%[[GRAD_SQUARE]], %[[ONE_RHO]]) : (tensor, tensor) -> tensor +// CHECK: %[[MS_NEW:.*]] = "tf.AddV2"(%[[MS_RHO]], %[[MUL]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: "tf.AssignVariableOp"(%[[MS_HANDLE]], %[[MS_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () +// CHECK: %[[MOM:.*]] = "tf.ReadVariableOp"(%[[MOM_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> +// CHECK: %[[MOMENTUM_MOM:.*]] = "tf.Mul"(%[[MOMENTUM]], %[[MOM]]) : (tensor, 
tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[LR_GRAD:.*]] = "tf.Mul"(%[[LR]], %[[GRAD]]) : (tensor, tensor) -> tensor +// CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[MS_NEW]], %[[EPSILON]]) : (tensor<*xf32>, tensor) -> tensor<*xf32> +// CHECK: %[[SQRT:.*]] = "tf.Sqrt"(%[[ADD]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[DIV:.*]] = "tf.Div"(%[[LR_GRAD]], %[[SQRT]]) : (tensor, tensor<*xf32>) -> tensor<*xf32> +// CHECK: %[[MOM_NEW:.*]] = "tf.AddV2"(%[[MOMENTUM_MOM]], %[[DIV]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK: "tf.AssignVariableOp"(%[[MOM_HANDLE]], %[[MOM_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () +// CHECK: %[[VAR:.*]] = "tf.ReadVariableOp"(%[[VAR_HANDLE]]) : (tensor<*x!tf.resource>) -> tensor<*xf32> +// CHECK: %[[VAR_NEW:.*]] = "tf.Sub"(%[[VAR]], %[[MOM_NEW]]) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK: "tf.AssignVariableOp"(%[[VAR_HANDLE]], %[[VAR_NEW]]) : (tensor<*x!tf.resource>, tensor<*xf32>) -> () + "tf.ResourceApplyRMSProp"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) {use_locking = false} : (tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor<*x!tf.resource>, tensor, tensor, tensor, tensor, tensor) -> () + return +} // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir index e7430993755..c963147b855 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/einsum.mlir @@ -7,6 +7,14 @@ func @einsum_basic(%arg0: tensor<3x4x5xf32>, %arg1: tensor<3x5x6xf32>) -> tensor // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<3x5x6xf32>) -> tensor<3x4x6xf32> } +func @einsum_matmul(%arg0: tensor<7x9xf32>, %arg1: tensor<9x5xf32>) -> tensor<7x5xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ae,ed->ad"}: (tensor<7x9xf32>, tensor<9x5xf32>) -> tensor<7x5xf32> + return %0 : tensor<7x5xf32> + // CHECK-LABEL: einsum_matmul + // CHECK: %[[v0:.*]] = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<7x9xf32>, tensor<9x5xf32>) -> tensor<7x5xf32> + // CHECK: return %[[v0]] : tensor<7x5xf32> +} + func @einsum_broadcast(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6xf32>) -> tensor<3x4x6xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,km->ijm"}: (tensor<3x4x5xf32>, tensor<5x6xf32>) -> tensor<3x4x6xf32> return %0 : tensor<3x4x6xf32> @@ -14,18 +22,27 @@ func @einsum_broadcast(%arg0: tensor<3x4x5xf32>, %arg1: tensor<5x6xf32>) -> tens // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5xf32>, tensor<5x6xf32>) -> tensor<3x4x6xf32> } +func @einsum_broadcast4(%arg0: tensor<3x4x5x6x7xf32>, %arg1: tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "abcdh,hg->abcdg"}: (tensor<3x4x5x6x7xf32>, tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> + return %0 : tensor<3x4x5x6x8xf32> + // CHECK-LABEL: einsum_broadcast4 + // CHECK: "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<3x4x5x6x7xf32>, tensor<7x8xf32>) -> tensor<3x4x5x6x8xf32> +} + func @einsum_reducesum(%arg0: tensor<2x5x7xf32>, %arg1: tensor<5x2xf32>) -> tensor<5x7xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "lbh,bl->bh"}: (tensor<2x5x7xf32>, tensor<5x2xf32>) -> tensor<5x7xf32> return %0 : tensor<5x7xf32> // CHECK-LABEL: einsum_reducesum // CHECK: %[[cst:.*]] = constant dense<[1, 2, 0]> : 
tensor<3xi32> - // CHECK: %[[cst_1:.*]] = constant dense<[5, 1, 2]> : tensor<3xi64> - // CHECK: %[[cst_2:.*]] = constant dense<2> : tensor<1xi32> + // CHECK: %[[cst_1:.*]] = constant dense<[5, 2, 1]> : tensor<3xi64> + // CHECK: %[[cst_2:.*]] = constant dense<[5, 7]> : tensor<2xi64> // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst]]) : (tensor<2x5x7xf32>, tensor<3xi32>) -> tensor<5x7x2xf32> - // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg1, %[[cst_1]]) : (tensor<5x2xf32>, tensor<3xi64>) -> tensor<5x1x2xf32> - // CHECK: %[[v2:.*]] = "tf.Mul"(%[[v0]], %[[v1]]) : (tensor<5x7x2xf32>, tensor<5x1x2xf32>) -> tensor<5x7x2xf32> - // CHECK: "tf.Sum"(%[[v2]], %[[cst_2]]) {keep_dims = false} : (tensor<5x7x2xf32>, tensor<1xi32>) -> tensor<5x7xf32> + // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg1, %[[cst_1]]) : (tensor<5x2xf32>, tensor<3xi64>) -> tensor<5x2x1xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<5x7x2xf32>, tensor<5x2x1xf32>) -> tensor<5x7x1xf32> + // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_2]]) : (tensor<5x7x1xf32>, tensor<2xi64>) -> tensor<5x7xf32> + // CHECK: return %[[v3:.*]] : tensor<5x7xf32> } + func @einsum_transpose_matmul(%arg0: tensor<2x5x7xf32>, %arg1: tensor<5x3x2xf32>) -> tensor<5x3x7xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "lbh,bkl->bkh"}: (tensor<2x5x7xf32>, tensor<5x3x2xf32>) -> tensor<5x3x7xf32> return %0 : tensor<5x3x7xf32> @@ -88,12 +105,12 @@ func @einsum_transposereduceddim(%arg0: tensor<2x5x7xf32>, %arg1: tensor<2x5x3x7 %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "bij,binj->bin"}: (tensor<2x5x7xf32>, tensor<2x5x3x7xf32>) -> tensor<2x5x3xf32> return %0 : tensor<2x5x3xf32> // CHECK-LABEL: einsum_transposereduceddim - // CHECK: %[[cst:.*]] = constant dense<[2, 5, 1, 7]> : tensor<4xi64> - // CHECK: %[[cst_1:.*]] = constant dense<[0, 1, 3, 2]> : tensor<4xi32> + // CHECK: %[[cst:.*]] = constant dense<[0, 1, 3, 2]> : tensor<4xi32> + // CHECK: %[[cst_1:.*]] = constant dense<[2, 5, 1, 7]> : tensor<4xi64> // CHECK: %[[cst_2:.*]] = constant dense<[2, 5, 3]> : tensor<3xi64> - // CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x5x7xf32>, tensor<4xi64>) -> tensor<2x5x1x7xf32> - // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<2x5x3x7xf32>, tensor<4xi32>) -> tensor<2x5x7x3xf32> - // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<2x5x1x7xf32>, tensor<2x5x7x3xf32>) -> tensor<2x5x1x3xf32> + // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg1, %[[cst]]) : (tensor<2x5x3x7xf32>, tensor<4xi32>) -> tensor<2x5x7x3xf32> + // CHECK: %[[v1:.*]] = "tf.Reshape"(%arg0, %[[cst_1]]) : (tensor<2x5x7xf32>, tensor<4xi64>) -> tensor<2x5x1x7xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v1]], %[[v0]]) {adj_x = false, adj_y = false} : (tensor<2x5x1x7xf32>, tensor<2x5x7x3xf32>) -> tensor<2x5x1x3xf32> // CHECK: %[[v3:.*]] = "tf.Reshape"(%[[v2]], %[[cst_2]]) : (tensor<2x5x1x3xf32>, tensor<3xi64>) -> tensor<2x5x3xf32> // CHECK: return %[[v3]] : tensor<2x5x3xf32> } @@ -123,13 +140,26 @@ func @einsum_fourdtransposeall(%arg0: tensor<2x5x7x3xf32>, %arg1: tensor<2x11x7x // CHECK: return %[[v3]] : tensor<2x7x11x5xf32> } -func @einsum_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) -> tensor<4xf32> { - %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,j->i"}: (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> +func @einsum_4d_1(%arg0: tensor<3x4x5x6xf32>, %arg1: tensor<3x7x5x6xf32>) -> 
tensor<3x5x4x7xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "jbki,jfki->jkbf"}: (tensor<3x4x5x6xf32>, tensor<3x7x5x6xf32>) -> tensor<3x5x4x7xf32> + return %0 : tensor<3x5x4x7xf32> + // CHECK-LABEL: einsum_4d_1 + // CHECK: %[[cst:.*]] = constant dense<[0, 2, 1, 3]> : tensor<4xi32> + // CHECK: %[[cst_1:.*]] = constant dense<[0, 2, 3, 1]> : tensor<4xi32> + // CHECK: %[[v0:.*]] = "tf.Transpose"(%arg0, %[[cst:.*]]) : (tensor<3x4x5x6xf32>, tensor<4xi32>) -> tensor<3x5x4x6xf32> + // CHECK: %[[v1:.*]] = "tf.Transpose"(%arg1, %[[cst_1]]) : (tensor<3x7x5x6xf32>, tensor<4xi32>) -> tensor<3x5x6x7xf32> + // CHECK: %[[v2:.*]] = "tf.BatchMatMulV2"(%[[v0]], %[[v1]]) {adj_x = false, adj_y = false} : (tensor<3x5x4x6xf32>, tensor<3x5x6x7xf32>) -> tensor<3x5x4x7xf32> + // CHECK: return %[[v2]] : tensor<3x5x4x7xf32> +} + +func @einsum_no_match(%arg0: tensor<4x5x6xf32>, %arg1: tensor<5xf32>) -> tensor<4xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,j->i"}: (tensor<4x5x6xf32>, tensor<5xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> // CHECK-LABEL: einsum_no_match -// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,j->i"} : (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> +// CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ijk,j->i"} : (tensor<4x5x6xf32>, tensor<5xf32>) -> tensor<4xf32> // CHECK: return %[[v0]] } + func @einsum_illegal_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) -> tensor<4xf32> { %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,?zw->kq->i"}: (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> @@ -137,10 +167,15 @@ func @einsum_illegal_no_match(%arg0: tensor<4x5xf32>, %arg1: tensor<5xf32>) -> t // CHECK: %[[v0:.*]] = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,?zw->kq->i"} : (tensor<4x5xf32>, tensor<5xf32>) -> tensor<4xf32> // CHECK: return %[[v0]] } -func @einsum_no_match5D(%arg0: tensor<4x5xf32>, %arg1: tensor<2x4x7x3x5xf32>) -> tensor<4xf32> { - %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "ij,j->i"}: (tensor<4x5xf32>, tensor<2x4x7x3x5xf32>) -> tensor<4xf32> - return %0 : tensor<4xf32> -// CHECK-LABEL: einsum_no_match5D -// CHECK: %[[v0:.*]] = "tf.Einsum" -// CHECK: return %[[v0]] + +func @batch_multilhs_einsum(%arg0: tensor<2x1x1x11xf32>, %arg1: tensor<2x11x2xf32>) -> tensor<2x1x1x2xf32> { + %0 = "tf.Einsum"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", equation = "BiNj,BjS->BiNS"} : (tensor<2x1x1x11xf32>, tensor<2x11x2xf32>) -> tensor<2x1x1x2xf32> + return %0 : tensor<2x1x1x2xf32> +// CHECK-LABEL: batch_multilhs_einsum +// CHECK: %[[cst:.*]] = constant dense<[2, 1, 11]> : tensor<3xi64> +// CHECK: %[[cst_1:.*]] = constant dense<[2, 1, 1, 2]> : tensor<4xi64> +// CHECK: %[[v0:.*]] = "tf.Reshape"(%arg0, %[[cst]]) : (tensor<2x1x1x11xf32>, tensor<3xi64>) -> tensor<2x1x11xf32> +// CHECK: %[[v1:.*]] = "tf.BatchMatMulV2"(%[[v0]], %arg1) {adj_x = false, adj_y = false} : (tensor<2x1x11xf32>, tensor<2x11x2xf32>) -> tensor<2x1x2xf32> +// CHECK: %[[v2:.*]] = "tf.Reshape"(%[[v1]], %[[cst_1]]) : (tensor<2x1x2xf32>, tensor<4xi64>) -> tensor<2x1x1x2xf32> +// CHECK: return %[[v2]] : tensor<2x1x1x2xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir index bec48181b3b..726495f1fbc 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_island_coarsening.mlir @@ -220,7 +220,7 @@ func @merge_islands_only() { %11:2 = tf_executor.island(%10#1) wraps "tf.opF"() : () -> tensor %12:2 = tf_executor.island wraps "tf.opG"(%10#0, %11#0) : (tensor<*xi32>, tensor) -> tensor<*xi32> %13 = tf_executor.ControlTrigger %2, %12#1, %9#1 - tf_executor.NextIteration.Sink [%3#1] %12#0, %13 : tensor<*xi32> + tf_executor.NextIteration.Sink[%3#1] %12#0, %13 : tensor<*xi32> tf_executor.fetch } return @@ -244,7 +244,7 @@ func @merge_islands_only() { // CHECK-NEXT: %[[OP_G:[0-9]*]] = "tf.opG"(%[[OP_E]], %[[OP_F]]) // CHECK-NEXT: tf_executor.yield %[[OP_G]] : tensor<*xi32> // CHECK: %[[CT:.*]] = tf_executor.ControlTrigger %[[ISLAND_1]], %[[ISLAND_3_control]], %[[EXIT_control]] -// CHECK-NEXT: tf_executor.NextIteration.Sink [%[[NEXTIT_SRC_token]]] %[[ISLAND_3]], %[[CT]] +// CHECK-NEXT: tf_executor.NextIteration.Sink[%[[NEXTIT_SRC_token]]] %[[ISLAND_3]], %[[CT]] // Test no merging took place as cycle would be formed otherwise. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD index 1544d27009f..81cb0ed7c73 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/force_shared_name_for_resource_ops.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/force_shared_name_for_resource_ops.pbtxt new file mode 100644 index 00000000000..05302ed430c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/force_shared_name_for_resource_ops.pbtxt @@ -0,0 +1,95 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-upgrade-legacy %s -tf-output-arrays=hash_table_node -o - | FileCheck %s + +node: { + name: "hash_table_node" + op: "HashTableV2" + attr: { + key: "key_dtype" + value: { + type: DT_INT32 + } + } + attr: { + key: "shared_name" + value: { + s: "" + } + } + attr: { + key: "value_dtype" + value: { + type: DT_FLOAT + } + } +} +node { + name: "Call" + op: "PartitionedCall" + attr { + key: "Tin" + value { + list { + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_RESOURCE + } + } + } + attr { + key: "f" + value { + func { + name: "create_resource" + } + } + } +} +library { + function { + signature { + name: "create_resource" + output_arg { + name: "handle" + type: DT_RESOURCE + } + } + node_def: { + name: "hash_table_node" + op: "HashTableV2" + attr: { + key: "key_dtype" + value: { + type: DT_INT32 + } + } + attr: { + key: "shared_name" + value: { + s: "" + } + } + attr: { + key: "value_dtype" + value: { + type: DT_FLOAT + } + } + } + ret { + key: "handle" + value: "hash_table_node:table_handle:0" + } + } +} + +# CHECK: tf.HashTableV2 +# CHECK-SAME: shared_name = "hash_table_node" + +# CHECK: func @create_resource +# CHECK: tf.HashTableV2 +# CHECK-SAME: shared_name = "hash_table_node@create_resource" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt index e21fd901a9e..a6b1979ee26 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt @@ -7,7 +7,7 @@ # CHECK: %[[NEXTITERATION:[a-z0-9]+]], %[[NEXTITERATION_token:[a-z0-9]+]], {{.*}} = tf_executor.NextIteration.Source # CHECK: tf_executor.Merge {{.*}} %[[NEXTITERATION]] -# CHECK: tf_executor.NextIteration.Sink [%[[NEXTITERATION_token]]] +# CHECK: tf_executor.NextIteration.Sink[%[[NEXTITERATION_token]]] node { name: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir index 30599b2e437..9bb05a75877 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nchw.mlir @@ -7,7 +7,7 @@ // CHECK-LABEL: func @transposeConv2D func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -18,7 +18,7 @@ func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32 // CHECK-SAME: strides = [5, 8, 6, 7] // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -38,7 +38,7 @@ func @transposeConv2D(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32 func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<*xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -49,7 +49,7 @@ func @transposeConv2DWithDefaultAttr(%input: tensor<1x32x32x3xf32>, %filter: ten // CHECK-SAME: strides = [5, 8, 6, 7] // CHECK-SAME: (tensor<1x3x32x32xf32>, tensor<1x1x3x8xf32>) -> tensor<*xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -77,7 +77,7 @@ func @transposeConv2DBackpropFilter( // CHECK-SAME: dst_format = "NCHW" // CHECK-SAME: src_format = "NHWC" - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[IN_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[OUT_BP_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg2, %[[ARG_PERM]]) @@ -117,7 +117,7 @@ func @transposeConv2DBackpropInput( // CHECK-SAME: dst_format = "NCHW" // CHECK-SAME: src_format = "NHWC" - // CHECK: 
%[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[OUT_BP_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg2, %[[ARG_PERM]]) // CHECK: %[[CONV2D_BACKPROP:[0-9]*]] = "tf.Conv2DBackpropInput" @@ -130,7 +130,7 @@ func @transposeConv2DBackpropInput( // CHECK-SAME: (tensor<4xi32>, tensor<1x1x3x8xf32>, tensor<1x8x32x32xf32>) // CHECK-SAME: -> tensor<1x3x32x32xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D_BACKPROP]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -154,7 +154,7 @@ func @transposeFusedBatchNormV3( ) -> tensor<1x28x28x64xf32> { // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: "tf.FusedBatchNormV3" @@ -164,7 +164,7 @@ func @transposeFusedBatchNormV3( // CHECK-SAME: -> (tensor<1x64x28x28xf32>, tensor<64xf32>, // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -192,7 +192,7 @@ func @transposeFusedBatchNormGradV3( ) -> tensor<1x28x28x64xf32> { // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[ARG0_TPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[ARG1_TPOSE:[0-9]*]] = "tf.Transpose"(%arg1, %[[ARG_PERM]]) @@ -204,7 +204,7 @@ func @transposeFusedBatchNormGradV3( // CHECK-SAME: -> (tensor<1x64x28x28xf32>, // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[RES_TPOSE:[0-9]*]] = "tf.Transpose" // CHECK-SAME: (%x_backprop, %[[RES_PERM]]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir index e6b3bf08394..c71d8ef2850 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_layout_assignment_to_nhwc.mlir @@ -7,7 +7,7 @@ // CHECK-LABEL: func @transposeConv2D func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32>) -> tensor<1x8x32x32xf32> { - // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: %[[CONV2D:[0-9]*]] = "tf.Conv2D"(%[[ARG_TRANSPOSE]], %arg1) @@ -18,7 +18,7 @@ func @transposeConv2D(%input: tensor<1x3x32x32xf32>, %filter: tensor<1x1x3x8xf32 // CHECK-SAME: strides = [5, 7, 8, 6] // CHECK-SAME: (tensor<1x32x32x3xf32>, tensor<1x1x3x8xf32>) -> tensor<1x32x32x8xf32> - // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 
2]> : tensor<4xi32>} + // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%[[CONV2D]], %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] @@ -41,7 +41,7 @@ func @transposeFusedBatchNormV3( ) -> tensor<1x64x28x28xf32> { // CHECK: %[[ARG_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 2, 3, 1]> : tensor<4xi64>} // CHECK: %[[ARG_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%arg0, %[[ARG_PERM]]) // CHECK: "tf.FusedBatchNormV3" @@ -51,7 +51,7 @@ func @transposeFusedBatchNormV3( // CHECK-SAME: -> (tensor<1x28x28x64xf32>, tensor<64xf32>, // CHECK: %[[RES_PERM:[0-9]*]] = "tf.Const"() - // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-SAME: {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} // CHECK: %[[RES_TRANSPOSE:[0-9]*]] = "tf.Transpose"(%y, %[[RES_PERM]]) // CHECK: return %[[RES_TRANSPOSE]] diff --git a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir index 0b1e27733eb..bacfeea2dc9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/layout_optimization_move_transposes_begin.mlir @@ -65,3 +65,40 @@ func @move_with_multiple_uses(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x8x4x4xf32> return %3 : tensor<1x8x4x4xf32> } + +// CHECK-LABEL: move_transpose_handle_broadcast +func @move_transpose_handle_broadcast(%arg0:tensor<8x64xf32>, %arg1:tensor<8x64x64xf32>) -> tensor<512x64xf32> { + %cst = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_2 = "tf.Const"() {value = dense<[512, 64]> : tensor<2xi32>} : () -> tensor<2xi32> + %0 = "tf.ExpandDims"(%arg0, %cst) {device = ""} : (tensor<8x64xf32>, tensor) -> tensor<8x64x1xf32> + %1 = "tf.AddV2"(%0, %arg1) {device = ""} : (tensor<8x64x1xf32>, tensor<8x64x64xf32>) -> tensor<8x64x64xf32> + %2 = "tf.Transpose"(%1, %cst_1) {device = ""} : (tensor<8x64x64xf32>, tensor<3xi32>) -> tensor<64x8x64xf32> + %3 = "tf.Reshape"(%2, %cst_2) {device = ""} : (tensor<64x8x64xf32>, tensor<2xi32>) -> tensor<512x64xf32> + + return %3 : tensor<512x64xf32> + + // CHECK: %[[CST_0:.*]] = "tf.Const"() {value = dense<[2, 0, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + // CHECK: %[[CST_1:.*]] = "tf.Const"() {value = dense<3> : tensor} : () -> tensor + // CHECK: %[[CST_2:.*]] = "tf.Const"() {value = dense<[512, 64]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: %[[EXPAND_DIMS:.*]] = "tf.ExpandDims"(%arg0, %[[CST_1]]) {device = ""} : (tensor<8x64xf32>, tensor) -> tensor<8x64x1xf32> + // CHECK: %[[TRANSPOSE_1:.*]] = "tf.Transpose"(%[[EXPAND_DIMS]], %[[CST_0]]) : (tensor<8x64x1xf32>, tensor<3xi32>) -> tensor<1x8x64xf32> + // CHECK: %[[TRANSPOSE_2:.*]] = "tf.Transpose"(%arg1, %[[CST_0]]) : (tensor<8x64x64xf32>, tensor<3xi32>) -> tensor<64x8x64xf32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[TRANSPOSE_1]], %[[TRANSPOSE_2]]) {device = ""} : (tensor<1x8x64xf32>, tensor<64x8x64xf32>) -> tensor<64x8x64xf32> + // CHECK: %[[RESHAPE:.*]] = "tf.Reshape"(%[[ADD]], %[[CST_2]]) {device = ""} : (tensor<64x8x64xf32>, tensor<2xi32>) -> tensor<512x64xf32> + // CHECK: return %[[RESHAPE]] : tensor<512x64xf32> +} + +// CHECK-LABEL: dont_move_transpose_different_ranks +func 
@dont_move_transpose_different_ranks(%arg0:tensor<1x1x2x3xf32>, %arg1:tensor<2x3xf32>) -> tensor<1x2x1x3xf32> { + %cst = "tf.Const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + %0 = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor<1x1x2x3xf32>, tensor<2x3xf32>) -> tensor<1x1x2x3xf32> + %1 = "tf.Transpose"(%0, %cst) {device = ""} : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> + + return %1 : tensor<1x2x1x3xf32> + + // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 2, 1, 3]> : tensor<4xi32>} : () -> tensor<4xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor<1x1x2x3xf32>, tensor<2x3xf32>) -> tensor<1x1x2x3xf32> + // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[ADD]], %[[CST]]) {device = ""} : (tensor<1x1x2x3xf32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> + // CHECK: return %[[TRANSPOSE]] : tensor<1x2x1x3xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 4f044cd5eff..cc923070077 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -1,177 +1,396 @@ +// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // RUN: tf-opt -tf-legalize-hlo %s | FileCheck %s +// CHECK-LABEL: func @biasAdd_NHWC( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x32x10x32xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> +// CHECK: return %[[VAL_2]] : tensor<1x32x10x32xi32> +// CHECK: } func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } +// CHECK-LABEL: func @biasAdd_NCHW( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x32x10x32xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> +// CHECK: return %[[VAL_2]] : tensor<1x32x10x32xi32> +// CHECK: } func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } +// CHECK-LABEL: func @biasAdd_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_1]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_2]] : tensor<2xi32> +// CHECK: } func @add(%arg0: 
tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.add %arg0, %arg0 : tensor<2xi32> %1 = mhlo.add %0, %arg0 : tensor<2xi32> return %1 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @broadcast_multi_dim_add( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x1x1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.AddV2"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> +// CHECK: return %[[VAL_2]] : tensor<4x4x4x4xi32> +// CHECK: } func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0 : tensor<4x4x4x4xi32> } +// CHECK-LABEL: func @div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.divide %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @shift_left( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.LeftShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.shift_left %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @div_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @maximum( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32>) -> tensor<4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_0]], %[[VAL_1]]) : 
(tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK: return %[[VAL_2]] : tensor<4xf32> +// CHECK: } func @maximum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { %0 = mhlo.maximum %arg0, %arg1 : tensor<4xf32> return %0 : tensor<4xf32> } +// CHECK-LABEL: func @minimum( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xf32>) -> tensor<4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> +// CHECK: return %[[VAL_2]] : tensor<4xf32> +// CHECK: } func @minimum(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { %0 = mhlo.minimum %arg0, %arg1 : tensor<4xf32> return %0 : tensor<4xf32> } +// CHECK-LABEL: func @mul( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.multiply %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_mul( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @real_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.divide %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_real_div( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @sub( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sub"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = mhlo.subtract %arg0, %arg0 : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @broadcast_sub( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Sub"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> +// CHECK: return %[[VAL_2]] : tensor<1x2xi32> +// CHECK: } func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { %0 = "chlo.broadcast_subtract"(%arg0, %arg1) 
{broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0 : tensor<1x2xi32> } +// CHECK-LABEL: func @shift_right( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.RightShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @broadcast_shift_right( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2x4xi32>) -> tensor<2x4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.RightShift"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> +// CHECK: return %[[VAL_2]] : tensor<2x4xi32> +// CHECK: } func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { %0 = "chlo.broadcast_shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> return %0 : tensor<2x4xi32> } -func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { - %0 = mhlo.and %arg0, %arg0 : tensor<2xi1> +// CHECK-LABEL: func @and( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi1>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @and(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { + %0 = mhlo.and %arg0, %arg1 : tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @and_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @and_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi1>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi1>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { %0 = "chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } -func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { - %0 = mhlo.or %arg0, %arg0 : tensor<2xi1> +// CHECK-LABEL: func @or( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi1>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @or(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { + %0 = mhlo.or %arg0, %arg1 : tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @or_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = 
"tf.LogicalOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @or_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi1>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi1>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { %0 = "chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @bitwise_or( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.or %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @bitwise_or_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi8>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> +// CHECK: return %[[VAL_2]] : tensor<1x4xi8> +// CHECK: } func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } +// CHECK-LABEL: func @bitwise_or_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseOr"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_or"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @bitwise_and( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<4xi32>) -> tensor<4xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> +// CHECK: return %[[VAL_2]] : tensor<4xi32> +// CHECK: } func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { %0 = mhlo.and %arg0, %arg1 : tensor<4xi32> return %0 : tensor<4xi32> } +// CHECK-LABEL: func @bitwise_and_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi8>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> +// CHECK: return %[[VAL_2]] : tensor<1x4xi8> +// CHECK: } func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> return %0 : tensor<1x4xi8> } +// CHECK-LABEL: func @bitwise_and_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: 
tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.BitwiseAnd"(%[[VAL_0]], %[[VAL_1]]) : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_and"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @pow( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Pow"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.power %arg0, %arg0 : tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @pow_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Pow"(%[[VAL_0]], %[[VAL_0]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @pow_dynamic(%arg0: tensor) -> tensor { %0 = mhlo.power %arg0, %arg0 : tensor return %0 : tensor } +// CHECK-LABEL: func @floordiv_broadcast_i32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3xi32>) -> tensor<2x3xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_2]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Less"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> +// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) {incompatible_shape_error = true} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> +// CHECK: %[[VAL_7:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_8:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_9:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_11:.*]] = "tf.Sub"(%[[VAL_9]], %[[VAL_10]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_12:.*]] = "tf.AddV2"(%[[VAL_8]], %[[VAL_11]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_13:.*]] = "tf.Neg"(%[[VAL_12]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_14:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_15:.*]] = "tf.Div"(%[[VAL_13]], %[[VAL_14]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_16:.*]] = "tf.Select"(%[[VAL_6]], %[[VAL_7]], %[[VAL_15]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: return %[[VAL_16]] : tensor<2x3xi32> +// CHECK: } func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { %0 = mhlo.constant dense<0> : tensor<2x3xi32> %1 = "chlo.broadcast_compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> @@ -191,6 +410,26 @@ func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> te return %14 : tensor<2x3xi32> } +// CHECK-LABEL: func @floordiv_reverse_broadcast_i32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2x3xi32>) -> tensor<2x3xi32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : 
tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_2]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Less"(%[[VAL_1]], %[[VAL_4]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> +// CHECK: %[[VAL_6:.*]] = "tf.Equal"(%[[VAL_3]], %[[VAL_5]]) {incompatible_shape_error = true} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> +// CHECK: %[[VAL_7:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_8:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<3xi32>) -> tensor<3xi32> +// CHECK: %[[VAL_9:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32> +// CHECK: %[[VAL_11:.*]] = "tf.Sub"(%[[VAL_9]], %[[VAL_10]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_12:.*]] = "tf.AddV2"(%[[VAL_8]], %[[VAL_11]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_13:.*]] = "tf.Neg"(%[[VAL_12]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_14:.*]] = "tf.Abs"(%[[VAL_1]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_15:.*]] = "tf.Div"(%[[VAL_13]], %[[VAL_14]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: %[[VAL_16:.*]] = "tf.Select"(%[[VAL_6]], %[[VAL_7]], %[[VAL_15]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> +// CHECK: return %[[VAL_16]] : tensor<2x3xi32> +// CHECK: } func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = mhlo.constant dense<0> : tensor<3xi32> %1 = "mhlo.compare"(%arg0, %0) {comparison_direction = "LT"} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> @@ -210,6 +449,13 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 return %14 : tensor<2x3xi32> } +// CHECK-LABEL: func @floordiv_f32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_3:.*]] = "tf.FloorDiv"(%[[VAL_0]], %[[VAL_0]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_3]] : tensor<2xf32> +// CHECK: } func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.divide %arg0, %arg0 : tensor<2xf32> %1 = mhlo.divide %arg0, %arg0 : tensor<2xf32> @@ -217,6 +463,14 @@ func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %2 : tensor<2xf32> } +// CHECK-LABEL: func @floordiv_f16_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xf16>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3xf16>) -> tensor<2x3xf16> { +// CHECK: %[[VAL_2:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: %[[VAL_3:.*]] = "tf.Div"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: %[[VAL_4:.*]] = "tf.FloorDiv"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> +// CHECK: return %[[VAL_4]] : tensor<2x3xf16> +// CHECK: } func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : 
(tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> %1 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> @@ -224,118 +478,258 @@ func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> te return %2 : tensor<2x3xf16> } -func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK-LABEL: func @equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @equal_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @equal_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @equal_incompatible_shape_broadcastable( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.Equal"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } -func @notequal(%arg0: tensor<2xi32>) 
-> tensor<2xi1> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK-LABEL: func @notequal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @notequal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @notequal_broadcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1xi32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor, tensor<1xi32>) -> tensor return %0 : tensor } -func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK-LABEL: func @greater( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Greater"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_greater( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = 
"tf.Greater"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } -func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK-LABEL: func @greater_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.GreaterEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @greater_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_greater_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.GreaterEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } -func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK-LABEL: func @less( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @less(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LT"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_less( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.Less"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } -func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +// CHECK-LABEL: func @less_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>) -> tensor<2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LessEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> 
+// CHECK: return %[[VAL_2]] : tensor<2xi1> +// CHECK: } +func @less_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LE"} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @broadcast_less_equal( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { +// CHECK: %[[VAL_2:.*]] = "tf.LessEqual"(%[[VAL_0]], %[[VAL_1]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> +// CHECK: return %[[VAL_2]] : tensor<1x2xi1> +// CHECK: } func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0 : tensor<1x2xi1> } +// CHECK-LABEL: func @concat_v2( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xf32>) -> tensor<6x3xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.ConcatV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> +// CHECK: return %[[VAL_3]] : tensor<6x3xf32> +// CHECK: } func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { %2 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> return %2 : tensor<6x3xf32> } +// CHECK-LABEL: func @concat_v2_1d_axis( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3xf32>) -> tensor<3x6xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.ConcatV2"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<3x6xf32> +// CHECK: return %[[VAL_3]] : tensor<3x6xf32> +// CHECK: } func @concat_v2_1d_axis(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x6xf32> { %2 = "mhlo.concatenate"(%arg0, %arg1) {dimension = 1 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x6xf32> return %2 : tensor<3x6xf32> } +// CHECK-LABEL: func @const() -> tensor<2xi32> { +// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: return %[[VAL_0]] : tensor<2xi32> +// CHECK: } func @const() -> tensor<2xi32> { %0 = mhlo.constant dense<0> : tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @relu( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>) -> tensor<1xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_1]], %[[VAL_0]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> +// CHECK: return %[[VAL_2]] : tensor<1xi32> +// CHECK: } func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> return %1 : tensor<1xi32> } +// CHECK-LABEL: func @relu_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Maximum"(%[[VAL_1]], %[[VAL_0]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_2]] : tensor +// CHECK: } func @relu_unranked(%arg0: tensor) -> tensor { %0 = mhlo.constant 
dense<0> : tensor %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %1 : tensor } +// CHECK-LABEL: func @relu6( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1xi32>) -> tensor<1xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_2]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> +// CHECK: %[[VAL_4:.*]] = "tf.Maximum"(%[[VAL_3]], %[[VAL_1]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> +// CHECK: return %[[VAL_4]] : tensor<1xi32> +// CHECK: } func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<6> : tensor @@ -344,6 +738,14 @@ func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { return %3 : tensor<1xi32> } +// CHECK-LABEL: func @relu6_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Minimum"(%[[VAL_0]], %[[VAL_2]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_4:.*]] = "tf.Maximum"(%[[VAL_3]], %[[VAL_1]]) : (tensor, tensor) -> tensor +// CHECK: return %[[VAL_4]] : tensor +// CHECK: } func @relu6_unranked(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<6> : tensor @@ -352,6 +754,15 @@ func @relu6_unranked(%arg0: tensor) -> tensor { return %3 : tensor } +// CHECK-LABEL: func @relu_grad( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x8xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor<4x8xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor +// CHECK: %[[VAL_3:.*]] = "tf.Greater"(%[[VAL_1]], %[[VAL_2]]) : (tensor, tensor) -> tensor +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<4x8xf32>} : () -> tensor<4x8xf32> +// CHECK: %[[VAL_5:.*]] = "tf.Select"(%[[VAL_3]], %[[VAL_0]], %[[VAL_4]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> +// CHECK: return %[[VAL_5]] : tensor<4x8xf32> +// CHECK: } func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf32> { %0 = mhlo.constant dense<0.000000e+00> : tensor %1 = "chlo.broadcast_compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "GT"} : (tensor, tensor) -> tensor @@ -360,31 +771,74 @@ func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf3 return %3 : tensor<4x8xf32> } +// CHECK-LABEL: func @select( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_3]] : tensor<2xi32> +// CHECK: } func @select(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @select_float( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xf32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : 
(tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_3]] : tensor<2xf32> +// CHECK: } func @select_float(%arg0: tensor<2xi1>, %arg1: tensor<2xf32>, %arg2: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @select_multidimensional( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<3x2xi32>) -> tensor<3x2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> +// CHECK: return %[[VAL_3]] : tensor<3x2xi32> +// CHECK: } func @select_multidimensional(%arg0: tensor<3x2xi1>, %arg1: tensor<3x2xi32>, %arg2: tensor<3x2xi32>) -> tensor<3x2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> return %0 : tensor<3x2xi32> } +// CHECK-LABEL: func @selectv2( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi1>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_3]] : tensor<2xi32> +// CHECK: } func @selectv2(%arg0: tensor<2xi1>, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @selectv2_pred_scalar( +// CHECK-SAME: %[[VAL_0:.*]]: tensor, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<2xi32>, +// CHECK-SAME: %[[VAL_2:.*]]: tensor<2xi32>) -> tensor<2xi32> { +// CHECK: %[[VAL_3:.*]] = "tf.Select"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> +// CHECK: return %[[VAL_3]] : tensor<2xi32> +// CHECK: } func @selectv2_pred_scalar(%arg0: tensor, %arg1: tensor<2xi32>, %arg2: tensor<2xi32>) -> tensor<2xi32> { %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0 : tensor<2xi32> } +// CHECK-LABEL: func @transpose_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2x3xf32>) -> tensor<3x2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> +// CHECK: return %[[VAL_4]] : tensor<3x2xf32> +// CHECK: } func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> @@ -392,6 +846,14 @@ func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { return %2 : tensor<3x2xf32> } +// CHECK-LABEL: func @transpose_3d_int32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () 
-> tensor<3xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> +// CHECK: return %[[VAL_4]] : tensor<3x2x1xf32> +// CHECK: } func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %0 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi32> %1 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> @@ -399,6 +861,14 @@ func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { return %2 : tensor<3x2x1xf32> } +// CHECK-LABEL: func @transpose_3d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> +// CHECK: return %[[VAL_4]] : tensor<3x2x1xf32> +// CHECK: } func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %0 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> %1 = mhlo.constant dense<[2, 1, 0]> : tensor<3xi64> @@ -406,6 +876,14 @@ func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { return %2 : tensor<3x2x1xf32> } +// CHECK-LABEL: func @transpose_dynamic_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor<4x?xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor, tensor<2xi64>) -> tensor<4x?xf32> +// CHECK: return %[[VAL_4]] : tensor<4x?xf32> +// CHECK: } func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> @@ -413,6 +891,14 @@ func @transpose_dynamic_2d(%arg0: tensor) -> tensor<4x?xf32> { return %2 : tensor<4x?xf32> } +// CHECK-LABEL: func @transpose_unranked_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_3]]) : (tensor<*xf32>, tensor<2xi64>) -> tensor<*xf32> +// CHECK: return %[[VAL_4]] : tensor<*xf32> +// CHECK: } func @transpose_unranked_2d(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = mhlo.constant dense<[1, 0]> : tensor<2xi64> %1 = mhlo.constant dense<[1, 0]> : tensor<2xi64> @@ -420,146 +906,297 @@ func @transpose_unranked_2d(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %2 : tensor<*xf32> } +// CHECK-LABEL: func @abs( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @abs(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @abs_dynamic( 
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<?xf32>) -> tensor<?xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<?xf32>) -> tensor<?xf32> +// CHECK: return %[[VAL_1]] : tensor<?xf32> +// CHECK: } func @abs_dynamic(%arg0: tensor<?xf32>) -> tensor<?xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<?xf32>) -> tensor<?xf32> return %0 : tensor<?xf32> } +// CHECK-LABEL: func @abs_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Abs"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @abs_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @ceil( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Ceil"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @ceil(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.ceil"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @ceil_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<?xf32>) -> tensor<?xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Ceil"(%[[VAL_0]]) : (tensor<?xf32>) -> tensor<?xf32> +// CHECK: return %[[VAL_1]] : tensor<?xf32> +// CHECK: } func @ceil_dynamic(%arg0: tensor<?xf32>) -> tensor<?xf32> { %0 = "mhlo.ceil"(%arg0) : (tensor<?xf32>) -> tensor<?xf32> return %0 : tensor<?xf32> } +// CHECK-LABEL: func @ceil_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Ceil"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @ceil_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.ceil"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @complex_abs( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xcomplex<f32>>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.ComplexAbs"(%[[VAL_0]]) : (tensor<2xcomplex<f32>>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @complex_abs(%arg0: tensor<2xcomplex<f32>>) -> tensor<2xf32> { %0 = "mhlo.abs"(%arg0) : (tensor<2xcomplex<f32>>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @cos( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cos"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @cos(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.cosine"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @cos_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<?xf32>) -> tensor<?xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cos"(%[[VAL_0]]) : (tensor<?xf32>) -> tensor<?xf32> +// CHECK: return %[[VAL_1]] : tensor<?xf32> +// CHECK: } func @cos_dynamic(%arg0: tensor<?xf32>) -> tensor<?xf32> { %0 = "mhlo.cosine"(%arg0) : (tensor<?xf32>) -> tensor<?xf32> return %0 : tensor<?xf32> } +// CHECK-LABEL: func @cos_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cos"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @cos_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.cosine"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @exp( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Exp"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @exp(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 =
"mhlo.exponential"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @exp_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Exp"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @exp_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.exponential"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @exp_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Exp"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @exp_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.exponential"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @floor( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Floor"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @floor(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.floor"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @floor_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Floor"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @floor_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.floor"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @floor_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Floor"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @floor_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.floor"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @is_finite( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.IsFinite"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xi1> +// CHECK: return %[[VAL_1]] : tensor<2xi1> +// CHECK: } func @is_finite(%arg0: tensor<2xf32>) -> tensor<2xi1> { %0 = "mhlo.is_finite"(%arg0) : (tensor<2xf32>) -> tensor<2xi1> return %0 : tensor<2xi1> } +// CHECK-LABEL: func @is_finite_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.IsFinite"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @is_finite_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.is_finite"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @is_finite_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xi1> { +// CHECK: %[[VAL_1:.*]] = "tf.IsFinite"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xi1> +// CHECK: return %[[VAL_1]] : tensor<*xi1> +// CHECK: } func @is_finite_unranked(%arg0: tensor<*xf32>) -> tensor<*xi1> { %0 = "mhlo.is_finite"(%arg0) : (tensor<*xf32>) -> tensor<*xi1> return %0 : tensor<*xi1> } +// CHECK-LABEL: func @log( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @log(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.log"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @log_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Log"(%[[VAL_0]]) : 
(tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @log_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.log"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @log_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @log_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.log"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @log1p( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log1p"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @log1p(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.log_plus_one"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @log1p_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Log1p"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @log1p_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.log_plus_one"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @log1p_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Log1p"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @log1p_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.log_plus_one"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @neg( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Neg"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @neg(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.negate"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @neg_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Neg"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @neg_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.negate"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @neg_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Neg"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @neg_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.negate"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @sigmoid( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> +// CHECK: %[[VAL_4:.*]] = "tf.Mul"(%[[VAL_0]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_5:.*]] = "tf.Tanh"(%[[VAL_4]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_6:.*]] = "tf.Mul"(%[[VAL_5]], %[[VAL_3]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: %[[VAL_7:.*]] = "tf.AddV2"(%[[VAL_6]], %[[VAL_3]]) : 
(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_7]] : tensor<2xf32> +// CHECK: } func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = mhlo.constant dense<5.000000e-01> : tensor %1 = mhlo.constant dense<2> : tensor<1xi64> @@ -571,90 +1208,182 @@ func @sigmoid(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %6 : tensor<2xf32> } +// CHECK-LABEL: func @sin( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sin"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @sin(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.sine"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @sin_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Sin"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @sin_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.sine"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @sin_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sin"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @sin_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.sine"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @rsqrt( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Rsqrt"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @rsqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.rsqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @rsqrt_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Rsqrt"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @rsqrt_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.rsqrt"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @rsqrt_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Rsqrt"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @rsqrt_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.rsqrt"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @sqrt( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sqrt"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @sqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.sqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @sqrt_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Sqrt"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @sqrt_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.sqrt"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @sqrt_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Sqrt"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @sqrt_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = 
"mhlo.sqrt"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @tanh( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Tanh"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @tanh(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.tanh"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @tanh_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Tanh"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @tanh_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.tanh"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @tanh_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Tanh"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @tanh_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.tanh"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @bitcast( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @bitcast(%arg0: tensor<2xf32>) -> tensor<2xf32> { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @bitcast_dynamic( +// CHECK-SAME: %[[VAL_0:.*]]: tensor) -> tensor { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor) -> tensor +// CHECK: return %[[VAL_1]] : tensor +// CHECK: } func @bitcast_dynamic(%arg0: tensor) -> tensor { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor) -> tensor return %0 : tensor } +// CHECK-LABEL: func @bitcast_unranked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<*xf32>) -> tensor<*xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK: return %[[VAL_1]] : tensor<*xf32> +// CHECK: } func @bitcast_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: func @bitcast_same_widths( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xi32> { +// CHECK: %[[VAL_1:.*]] = "tf.Bitcast"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xi32> +// CHECK: return %[[VAL_1]] : tensor<2xi32> +// CHECK: } func @bitcast_same_widths(%arg0: tensor<2xf32>) -> tensor<2xi32> { %0 = "mhlo.bitcast_convert"(%arg0) : (tensor<2xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } -func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { - %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// CHECK-LABEL: func @sign( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x2x3x4xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// CHECK: %[[VAL_3:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_4:.*]] = "tf.NotEqual"(%[[VAL_0]], %[[VAL_1]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> +// 
CHECK: %[[VAL_5:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_6:.*]] = "tf.Sign"(%[[VAL_0]]) : (tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_7:.*]] = "tf.Select"(%[[VAL_4]], %[[VAL_5]], %[[VAL_6]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> +// CHECK: %[[VAL_8:.*]] = "tf.Select"(%[[VAL_2]], %[[VAL_3]], %[[VAL_7]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> +// CHECK: return %[[VAL_8]] : tensor<1x2x3x4xf32> +// CHECK: } +func @sign(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> %1 = mhlo.constant dense<0.000000e+00> : tensor<1x2x3x4xf32> - %2 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> + %2 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> %3 = mhlo.constant dense<0.000000e+00> : tensor<1x2x3x4xf32> %4 = "mhlo.sign"(%arg0) : (tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> %5 = "mhlo.select"(%2, %3, %4) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> @@ -662,72 +1391,180 @@ func @sign(%arg0: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { return %6 : tensor<1x2x3x4xf32> } +// CHECK-LABEL: func @size_rank_one_i32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<f32>) -> tensor<i32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32> +// CHECK: return %[[VAL_1]] : tensor<i32> +// CHECK: } func @size_rank_one_i32(%arg0: tensor<f32>) -> tensor<i32> { %0 = mhlo.constant dense<1> : tensor<i32> return %0 : tensor<i32> } +// CHECK-LABEL: func @size_rank_one_i64( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<f32>) -> tensor<i64> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1> : tensor<i64>} : () -> tensor<i64> +// CHECK: return %[[VAL_1]] : tensor<i64> +// CHECK: } func @size_rank_one_i64(%arg0: tensor<f32>) -> tensor<i64> { %0 = mhlo.constant dense<1> : tensor<i64> return %0 : tensor<i64> } +// CHECK-LABEL: func @complex( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3xf32>) -> tensor<3xcomplex<f32>> { +// CHECK: %[[VAL_2:.*]] = "tf.Complex"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xcomplex<f32>> +// CHECK: return %[[VAL_2]] : tensor<3xcomplex<f32>> +// CHECK: } func @complex(%arg0: tensor<3xf32>, %arg1: tensor<3xf32>) -> tensor<3xcomplex<f32>> { %0 = "mhlo.complex"(%arg0, %arg1) : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xcomplex<f32>> return %0 : tensor<3xcomplex<f32>> } +// CHECK-LABEL: func @convert_i32_f32( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xi32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Cast"(%[[VAL_0]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { %0 = "mhlo.convert"(%arg0) : (tensor<2xi32>) -> tensor<2xf32> return %0 : tensor<2xf32> } +// CHECK-LABEL: func @convert_slice( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[1, 519]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Slice"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) :
(tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> +// CHECK: return %[[VAL_3]] : tensor<1x519xf32> +// CHECK: } func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { %0 = "mhlo.slice"(%arg0) {limit_indices = dense<[1, 4672]> : tensor<2xi64>, start_indices = dense<[0, 4153]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x4672xf32>) -> tensor<1x519xf32> return %0 : tensor<1x519xf32> } +// CHECK-LABEL: func @reshape( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<4x6xf32>) -> tensor<2x2x6xf32> { +// CHECK: %[[VAL_1:.*]] = constant dense<[2, 2, 6]> : tensor<3xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_1]]) : (tensor<4x6xf32>, tensor<3xi64>) -> tensor<2x2x6xf32> +// CHECK: return %[[VAL_2]] : tensor<2x2x6xf32> +// CHECK: } func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { %0 = "mhlo.reshape"(%arg0) : (tensor<4x6xf32>) -> tensor<2x2x6xf32> return %0 : tensor<2x2x6xf32> } +// CHECK-LABEL: func @convert_dot_1d_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_2:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_4:.*]] = "tf.MatMul"(%[[VAL_3]], %[[VAL_1]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_5:.*]] = constant dense<1> : tensor<1xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_6]] : tensor<1xf32> +// CHECK: } func @convert_dot_1d_2d(%arg0: tensor<256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1xf32> { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256x1xf32>) -> tensor<1xf32> return %0 : tensor<1xf32> } +// CHECK-LABEL: func @convert_dot_2d_1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_2:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_4:.*]] = "tf.MatMul"(%[[VAL_0]], %[[VAL_3]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_5:.*]] = constant dense<1> : tensor<1xi64> +// CHECK: %[[VAL_6:.*]] = "tf.Reshape"(%[[VAL_4]], %[[VAL_5]]) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_6]] : tensor<1xf32> +// CHECK: } func @convert_dot_2d_1d(%arg0: tensor<1x256xf32>, %arg1: tensor<256xf32>) -> tensor<1xf32> { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256xf32>) -> tensor<1xf32> return %0 : tensor<1xf32> } +// CHECK-LABEL: func @convert_dot_1d_1d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256xf32>) -> tensor { +// CHECK: %[[VAL_2:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_4:.*]] = constant dense<[1, 256]> : tensor<2xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_4]]) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: %[[VAL_6:.*]] = "tf.MatMul"(%[[VAL_3]], %[[VAL_5]]) 
{transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: %[[VAL_7:.*]] = constant dense<> : tensor<0xi64> +// CHECK: %[[VAL_8:.*]] = "tf.Reshape"(%[[VAL_6]], %[[VAL_7]]) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor +// CHECK: return %[[VAL_8]] : tensor +// CHECK: } func @convert_dot_1d_1d(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256xf32>) -> tensor return %0 : tensor } +// CHECK-LABEL: func @convert_dot_2d_2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.MatMul"(%[[VAL_0]], %[[VAL_1]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: return %[[VAL_2]] : tensor<1x1xf32> +// CHECK: } func @convert_dot_2d_2d(%arg0: tensor<1x256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1x1xf32> { %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> return %0 : tensor<1x1xf32> } +// CHECK-LABEL: func @broadcast_in_dim_tf_style( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { +// CHECK: %[[VAL_1:.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.BroadcastTo"(%[[VAL_0]], %[[VAL_1]]) : (tensor<8x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<3x8x8x16xf32> +// CHECK: } func @broadcast_in_dim_tf_style(%arg0: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>, name = "broadcast.0"} : (tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> return %0 : tensor<3x8x8x16xf32> } +// CHECK-LABEL: func @broadcast_in_dim_general_case( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { +// CHECK: %[[VAL_1:.*]] = constant dense<[3, 1, 1, 16]> : tensor<4xi64> +// CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_1]]) : (tensor<3x1x16xf32>, tensor<4xi64>) -> tensor<3x1x1x16xf32> +// CHECK: %[[VAL_3:.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> +// CHECK: %[[VAL_4:.*]] = "tf.BroadcastTo"(%[[VAL_2]], %[[VAL_3]]) : (tensor<3x1x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> +// CHECK: return %[[VAL_4]] : tensor<3x8x8x16xf32> +// CHECK: } func @broadcast_in_dim_general_case(%arg0: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 2, 3]> : tensor<3xi64>, name = "broadcast.0"} : (tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> return %0 : tensor<3x8x8x16xf32> } +// CHECK-LABEL: func @convert_dot_general( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x2x6x5x1xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<[0, 3, 4, 1, 2]> : tensor<5xi64>} : () -> tensor<5xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Transpose"(%[[VAL_0]], %[[VAL_2]]) : (tensor<3x2x6x5x1xf32>, tensor<5xi64>) -> tensor<3x5x1x2x6xf32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_1]], %[[VAL_4]]) : (tensor<3x2x4x6xf32>, tensor<4xi64>) -> tensor<3x2x6x4xf32> +// CHECK: %[[VAL_6:.*]] = constant dense<[3, 5, 12]> : tensor<3xi64> +// CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_6]]) : 
(tensor<3x5x1x2x6xf32>, tensor<3xi64>) -> tensor<3x5x12xf32> +// CHECK: %[[VAL_8:.*]] = constant dense<[3, 12, 4]> : tensor<3xi64> +// CHECK: %[[VAL_9:.*]] = "tf.Reshape"(%[[VAL_5]], %[[VAL_8]]) : (tensor<3x2x6x4xf32>, tensor<3xi64>) -> tensor<3x12x4xf32> +// CHECK: %[[VAL_10:.*]] = "tf.BatchMatMulV2"(%[[VAL_7]], %[[VAL_9]]) {adj_x = false, adj_y = false} : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> +// CHECK: %[[VAL_11:.*]] = constant dense<[3, 5, 1, 4]> : tensor<4xi64> +// CHECK: %[[VAL_12:.*]] = "tf.Reshape"(%[[VAL_10]], %[[VAL_11]]) : (tensor<3x5x4xf32>, tensor<4xi64>) -> tensor<3x5x1x4xf32> +// CHECK: return %[[VAL_12]] : tensor<3x5x1x4xf32> +// CHECK: } func @convert_dot_general(%arg0: tensor<3x2x6x5x1xf32>, %arg1: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { %0 = "mhlo.dot_general"(%arg0, %arg1) {dot_dimension_numbers = {lhs_batching_dimensions = dense<0> : tensor<1xi64>, lhs_contracting_dimensions = dense<[1, 2]> : tensor<2xi64>, rhs_batching_dimensions = dense<0> : tensor<1xi64>, rhs_contracting_dimensions = dense<[1, 3]> : tensor<2xi64>}, precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<3x2x6x5x1xf32>, tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> return %0 : tensor<3x5x1x4xf32> } +// CHECK-LABEL: func @convert_conv2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> +// CHECK: } func @convert_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, @@ -736,6 +1573,12 @@ func @convert_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32> return %0 : tensor<1x8x8x16xf32> } +// CHECK-LABEL: func @convert_depthwise_conv2d( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.DepthwiseConv2dNative"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> +// CHECK: } func @convert_depthwise_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, 
output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, @@ -744,6 +1587,12 @@ func @convert_depthwise_conv2d(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x2 return %0 : tensor<1x8x8x16xf32> } +// CHECK-LABEL: func @convert_conv2d_valid_padding( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> +// CHECK: return %[[VAL_2]] : tensor<1x8x8x16xf32> +// CHECK: } func @convert_conv2d_valid_padding(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64, dimension_numbers = {input_batch_dimension = 0 : i64, input_feature_dimension = 3 : i64, input_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>, kernel_input_feature_dimension = 2 : i64, kernel_output_feature_dimension = 3 : i64, kernel_spatial_dimensions = dense<[0, 1]> : tensor<2xi64>, output_batch_dimension = 0 : i64, output_feature_dimension = 3 : i64, output_spatial_dimensions = dense<[1, 2]> : tensor<2xi64>}, @@ -752,6 +1601,13 @@ func @convert_conv2d_valid_padding(%arg0: tensor<1x8x8x207xf32>, %arg1: tensor<3 return %0 : tensor<1x8x8x16xf32> } +// CHECK-LABEL: func @convert_reduce_to_sum( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Sum"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_3]] : tensor<1xf32> +// CHECK: } func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { %0 = mhlo.constant dense<0.000000e+00> : tensor %1 = "mhlo.reduce"(%arg0, %0) ( { @@ -762,6 +1618,13 @@ func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { return %1 : tensor<1xf32> } +// CHECK-LABEL: func @convert_reduce_to_max( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0xFF800000> : tensor} : () -> tensor +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Max"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_3]] : tensor<1xf32> +// CHECK: } func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // "0xFF800000" represents -INF for f32. 
%0 = mhlo.constant dense<0xFF800000> : tensor<f32> @@ -773,7 +1636,13 @@ func @convert_reduce_to_max(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { return %1 : tensor<1xf32> } - +// CHECK-LABEL: func @convert_reduce_to_min( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x256xf32>) -> tensor<1xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<0x7F800000> : tensor<f32>} : () -> tensor<f32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Min"(%[[VAL_0]], %[[VAL_2]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return %[[VAL_3]] : tensor<1xf32> +// CHECK: } func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { // "0x7F800000" represents INF for f32. %0 = mhlo.constant dense<0x7F800000> : tensor<f32> @@ -785,928 +1654,31 @@ func @convert_reduce_to_min(%arg0: tensor<1x256xf32>) -> tensor<1xf32> { return %1 : tensor<1xf32> } +// CHECK-LABEL: func @convert_iota_1d() -> tensor<123xf32> { +// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<f32>} : () -> tensor<f32> +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<1.230000e+02> : tensor<f32>} : () -> tensor<f32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1.000000e+00> : tensor<f32>} : () -> tensor<f32> +// CHECK: %[[VAL_3:.*]] = "tf.Range"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<f32>, tensor<f32>, tensor<f32>) -> tensor<123xf32> +// CHECK: return %[[VAL_3]] : tensor<123xf32> +// CHECK: } +func @convert_iota_1d() -> tensor<123xf32> { + %0 = "mhlo.iota"() { iota_dimension = 0 : i64 } : () -> tensor<123xf32> + return %0 : tensor<123xf32> +} + +// CHECK-LABEL: func @convert_iota_3d() -> tensor<5x7x9xi32> { +// CHECK: %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32> +// CHECK: %[[VAL_1:.*]] = "tf.Const"() {value = dense<7> : tensor<i32>} : () -> tensor<i32> +// CHECK: %[[VAL_2:.*]] = "tf.Const"() {value = dense<1> : tensor<i32>} : () -> tensor<i32> +// CHECK: %[[VAL_3:.*]] = "tf.Range"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<i32>, tensor<i32>, tensor<i32>) -> tensor<7xi32> +// CHECK: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[1, 7, 1]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Reshape"(%[[VAL_3]], %[[VAL_4]]) : (tensor<7xi32>, tensor<3xi64>) -> tensor<1x7x1xi32> +// CHECK: %[[VAL_6:.*]] = "tf.Const"() {value = dense<[5, 7, 9]> : tensor<3xi64>} : () -> tensor<3xi64> +// CHECK: %[[VAL_7:.*]] = "tf.BroadcastTo"(%[[VAL_5]], %[[VAL_6]]) : (tensor<1x7x1xi32>, tensor<3xi64>) -> tensor<5x7x9xi32> +// CHECK: return %[[VAL_7]] : tensor<5x7x9xi32> +// CHECK: } +func @convert_iota_3d() -> tensor<5x7x9xi32> { + %0 = "mhlo.iota"() { iota_dimension = 1 : i64 } : () -> tensor<5x7x9xi32> + return %0 : tensor<5x7x9xi32> +} - - -// NOTE: Assertions have been autogenerated by utils/generate-test-checks.py - -// CHECK-LABEL: func @biasAdd_NHWC( -// CHECK-SAME: [[VAL_0:%.*]]: tensor<1x32x10x32xi32>, [[VAL_1:%.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> { -// CHECK: [[VAL_2:%.*]] = "tf.AddV2"([[VAL_0]], [[VAL_1]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> -// CHECK: return [[VAL_2]] : tensor<1x32x10x32xi32> -// CHECK: } - -// CHECK-LABEL: func @biasAdd_NCHW( -// CHECK-SAME: [[VAL_3:%.*]]: tensor<1x32x10x32xi32>, [[VAL_4:%.*]]: tensor<32xi32>) -> tensor<1x32x10x32xi32> { -// CHECK: [[VAL_5:%.*]] = "tf.AddV2"([[VAL_3]], [[VAL_4]]) : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> -// CHECK: return [[VAL_5]] : tensor<1x32x10x32xi32> -// CHECK: } - -// CHECK-LABEL: func @biasAdd_dynamic(
-// CHECK-SAME: [[VAL_6:%.*]]: tensor, [[VAL_7:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_8:%.*]] = "tf.AddV2"([[VAL_6]], [[VAL_7]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_8]] : tensor -// CHECK: } - -// CHECK-LABEL: func @add( -// CHECK-SAME: [[VAL_9:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_10:%.*]] = "tf.AddV2"([[VAL_9]], [[VAL_9]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: [[VAL_11:%.*]] = "tf.AddV2"([[VAL_10]], [[VAL_9]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_11]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_add( -// CHECK-SAME: [[VAL_12:%.*]]: tensor<1xi32>, [[VAL_13:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_14:%.*]] = "tf.AddV2"([[VAL_12]], [[VAL_13]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_14]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_multi_dim_add( -// CHECK-SAME: [[VAL_15:%.*]]: tensor<4x1x1xi32>, [[VAL_16:%.*]]: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { -// CHECK: [[VAL_17:%.*]] = "tf.AddV2"([[VAL_15]], [[VAL_16]]) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> -// CHECK: return [[VAL_17]] : tensor<4x4x4x4xi32> -// CHECK: } - -// CHECK-LABEL: func @div( -// CHECK-SAME: [[VAL_18:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_19:%.*]] = "tf.Div"([[VAL_18]], [[VAL_18]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_19]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_div( -// CHECK-SAME: [[VAL_20:%.*]]: tensor<1xi32>, [[VAL_21:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_22:%.*]] = "tf.Div"([[VAL_20]], [[VAL_21]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_22]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @shift_left( -// CHECK-SAME: [[VAL_23:%.*]]: tensor<4xi32>, [[VAL_24:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_25:%.*]] = "tf.LeftShift"([[VAL_23]], [[VAL_24]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_25]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @div_dynamic( -// CHECK-SAME: [[VAL_26:%.*]]: tensor, [[VAL_27:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_28:%.*]] = "tf.Div"([[VAL_26]], [[VAL_27]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_28]] : tensor -// CHECK: } - -// CHECK-LABEL: func @maximum( -// CHECK-SAME: [[VAL_29:%.*]]: tensor<4xf32>, [[VAL_30:%.*]]: tensor<4xf32>) -> tensor<4xf32> { -// CHECK: [[VAL_31:%.*]] = "tf.Maximum"([[VAL_29]], [[VAL_30]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> -// CHECK: return [[VAL_31]] : tensor<4xf32> -// CHECK: } - -// CHECK-LABEL: func @minimum( -// CHECK-SAME: [[VAL_32:%.*]]: tensor<4xf32>, [[VAL_33:%.*]]: tensor<4xf32>) -> tensor<4xf32> { -// CHECK: [[VAL_34:%.*]] = "tf.Minimum"([[VAL_32]], [[VAL_33]]) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> -// CHECK: return [[VAL_34]] : tensor<4xf32> -// CHECK: } - -// CHECK-LABEL: func @mul( -// CHECK-SAME: [[VAL_35:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_36:%.*]] = "tf.Mul"([[VAL_35]], [[VAL_35]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_36]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_mul( -// CHECK-SAME: [[VAL_37:%.*]]: tensor<1xi32>, [[VAL_38:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_39:%.*]] = "tf.Mul"([[VAL_37]], [[VAL_38]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return 
[[VAL_39]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @real_div( -// CHECK-SAME: [[VAL_40:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_41:%.*]] = "tf.Div"([[VAL_40]], [[VAL_40]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_41]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_real_div( -// CHECK-SAME: [[VAL_42:%.*]]: tensor<1xi32>, [[VAL_43:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_44:%.*]] = "tf.Div"([[VAL_42]], [[VAL_43]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_44]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @sub( -// CHECK-SAME: [[VAL_45:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_46:%.*]] = "tf.Sub"([[VAL_45]], [[VAL_45]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_46]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_sub( -// CHECK-SAME: [[VAL_47:%.*]]: tensor<1xi32>, [[VAL_48:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi32> { -// CHECK: [[VAL_49:%.*]] = "tf.Sub"([[VAL_47]], [[VAL_48]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> -// CHECK: return [[VAL_49]] : tensor<1x2xi32> -// CHECK: } - -// CHECK-LABEL: func @shift_right( -// CHECK-SAME: [[VAL_50:%.*]]: tensor<4xi32>, [[VAL_51:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_52:%.*]] = "tf.RightShift"([[VAL_50]], [[VAL_51]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_52]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_shift_right( -// CHECK-SAME: [[VAL_53:%.*]]: tensor<4xi32>, [[VAL_54:%.*]]: tensor<2x4xi32>) -> tensor<2x4xi32> { -// CHECK: [[VAL_55:%.*]] = "tf.RightShift"([[VAL_53]], [[VAL_54]]) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> -// CHECK: return [[VAL_55]] : tensor<2x4xi32> -// CHECK: } - -// CHECK-LABEL: func @and( -// CHECK-SAME: [[VAL_56:%.*]]: tensor<2xi1>) -> tensor<2xi1> { -// CHECK: [[VAL_57:%.*]] = "tf.LogicalAnd"([[VAL_56]], [[VAL_56]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> -// CHECK: return [[VAL_57]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @and_broadcast( -// CHECK-SAME: [[VAL_58:%.*]]: tensor<1xi1>, [[VAL_59:%.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { -// CHECK: [[VAL_60:%.*]] = "tf.LogicalAnd"([[VAL_58]], [[VAL_59]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> -// CHECK: return [[VAL_60]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @and_dynamic( -// CHECK-SAME: [[VAL_61:%.*]]: tensor, [[VAL_62:%.*]]: tensor<1xi1>) -> tensor { -// CHECK: [[VAL_63:%.*]] = "tf.LogicalAnd"([[VAL_61]], [[VAL_62]]) : (tensor, tensor<1xi1>) -> tensor -// CHECK: return [[VAL_63]] : tensor -// CHECK: } - -// CHECK-LABEL: func @or( -// CHECK-SAME: [[VAL_64:%.*]]: tensor<2xi1>) -> tensor<2xi1> { -// CHECK: [[VAL_65:%.*]] = "tf.LogicalOr"([[VAL_64]], [[VAL_64]]) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> -// CHECK: return [[VAL_65]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @or_broadcast( -// CHECK-SAME: [[VAL_66:%.*]]: tensor<1xi1>, [[VAL_67:%.*]]: tensor<1x2xi1>) -> tensor<1x2xi1> { -// CHECK: [[VAL_68:%.*]] = "tf.LogicalOr"([[VAL_66]], [[VAL_67]]) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> -// CHECK: return [[VAL_68]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @or_dynamic( -// CHECK-SAME: [[VAL_69:%.*]]: tensor, [[VAL_70:%.*]]: tensor<1xi1>) -> tensor { -// CHECK: [[VAL_71:%.*]] = "tf.LogicalOr"([[VAL_69]], [[VAL_70]]) : (tensor, tensor<1xi1>) -> tensor -// CHECK: return [[VAL_71]] : tensor -// 
CHECK: } - -// CHECK-LABEL: func @bitwise_or( -// CHECK-SAME: [[VAL_72:%.*]]: tensor<4xi32>, [[VAL_73:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_74:%.*]] = "tf.BitwiseOr"([[VAL_72]], [[VAL_73]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_74]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @bitwise_or_broadcast( -// CHECK-SAME: [[VAL_75:%.*]]: tensor<1xi8>, [[VAL_76:%.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { -// CHECK: [[VAL_77:%.*]] = "tf.BitwiseOr"([[VAL_75]], [[VAL_76]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> -// CHECK: return [[VAL_77]] : tensor<1x4xi8> -// CHECK: } - -// CHECK-LABEL: func @bitwise_or_dynamic( -// CHECK-SAME: [[VAL_78:%.*]]: tensor, [[VAL_79:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_80:%.*]] = "tf.BitwiseOr"([[VAL_78]], [[VAL_79]]) : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_80]] : tensor -// CHECK: } - -// CHECK-LABEL: func @bitwise_and( -// CHECK-SAME: [[VAL_81:%.*]]: tensor<4xi32>, [[VAL_82:%.*]]: tensor<4xi32>) -> tensor<4xi32> { -// CHECK: [[VAL_83:%.*]] = "tf.BitwiseAnd"([[VAL_81]], [[VAL_82]]) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> -// CHECK: return [[VAL_83]] : tensor<4xi32> -// CHECK: } - -// CHECK-LABEL: func @bitwise_and_broadcast( -// CHECK-SAME: [[VAL_84:%.*]]: tensor<1xi8>, [[VAL_85:%.*]]: tensor<1x4xi8>) -> tensor<1x4xi8> { -// CHECK: [[VAL_86:%.*]] = "tf.BitwiseAnd"([[VAL_84]], [[VAL_85]]) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> -// CHECK: return [[VAL_86]] : tensor<1x4xi8> -// CHECK: } - -// CHECK-LABEL: func @bitwise_and_dynamic( -// CHECK-SAME: [[VAL_87:%.*]]: tensor, [[VAL_88:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_89:%.*]] = "tf.BitwiseAnd"([[VAL_87]], [[VAL_88]]) : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_89]] : tensor -// CHECK: } - -// CHECK-LABEL: func @pow( -// CHECK-SAME: [[VAL_90:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_91:%.*]] = "tf.Pow"([[VAL_90]], [[VAL_90]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_91]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @pow_dynamic( -// CHECK-SAME: [[VAL_92:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_93:%.*]] = "tf.Pow"([[VAL_92]], [[VAL_92]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_93]] : tensor -// CHECK: } - -// CHECK-LABEL: func @floordiv_broadcast_i32( -// CHECK-SAME: [[VAL_94:%.*]]: tensor<2x3xi32>, [[VAL_95:%.*]]: tensor<3xi32>) -> tensor<2x3xi32> { -// CHECK: [[VAL_96:%.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> -// CHECK: [[VAL_97:%.*]] = "tf.Less"([[VAL_94]], [[VAL_96]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> -// CHECK: [[VAL_98:%.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_99:%.*]] = "tf.Less"([[VAL_95]], [[VAL_98]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> -// CHECK: [[VAL_100:%.*]] = "tf.Equal"([[VAL_97]], [[VAL_99]]) {incompatible_shape_error = true} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> -// CHECK: [[VAL_101:%.*]] = "tf.Div"([[VAL_94]], [[VAL_95]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_102:%.*]] = "tf.Abs"([[VAL_94]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_103:%.*]] = "tf.Abs"([[VAL_95]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_104:%.*]] = "tf.Const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_105:%.*]] = "tf.Sub"([[VAL_103]], [[VAL_104]]) : (tensor<3xi32>, tensor<3xi32>) -> 
tensor<3xi32> -// CHECK: [[VAL_106:%.*]] = "tf.AddV2"([[VAL_102]], [[VAL_105]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_107:%.*]] = "tf.Neg"([[VAL_106]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_108:%.*]] = "tf.Abs"([[VAL_95]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_109:%.*]] = "tf.Div"([[VAL_107]], [[VAL_108]]) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_110:%.*]] = "tf.Select"([[VAL_100]], [[VAL_101]], [[VAL_109]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: return [[VAL_110]] : tensor<2x3xi32> -// CHECK: } - -// CHECK-LABEL: func @floordiv_reverse_broadcast_i32( -// CHECK-SAME: [[VAL_111:%.*]]: tensor<3xi32>, [[VAL_112:%.*]]: tensor<2x3xi32>) -> tensor<2x3xi32> { -// CHECK: [[VAL_113:%.*]] = "tf.Const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_114:%.*]] = "tf.Less"([[VAL_111]], [[VAL_113]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> -// CHECK: [[VAL_115:%.*]] = "tf.Const"() {value = dense<0> : tensor<2x3xi32>} : () -> tensor<2x3xi32> -// CHECK: [[VAL_116:%.*]] = "tf.Less"([[VAL_112]], [[VAL_115]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> -// CHECK: [[VAL_117:%.*]] = "tf.Equal"([[VAL_114]], [[VAL_116]]) {incompatible_shape_error = true} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> -// CHECK: [[VAL_118:%.*]] = "tf.Div"([[VAL_111]], [[VAL_112]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_119:%.*]] = "tf.Abs"([[VAL_111]]) : (tensor<3xi32>) -> tensor<3xi32> -// CHECK: [[VAL_120:%.*]] = "tf.Abs"([[VAL_112]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_121:%.*]] = "tf.Const"() {value = dense<1> : tensor<2x3xi32>} : () -> tensor<2x3xi32> -// CHECK: [[VAL_122:%.*]] = "tf.Sub"([[VAL_120]], [[VAL_121]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_123:%.*]] = "tf.AddV2"([[VAL_119]], [[VAL_122]]) : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_124:%.*]] = "tf.Neg"([[VAL_123]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_125:%.*]] = "tf.Abs"([[VAL_112]]) : (tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_126:%.*]] = "tf.Div"([[VAL_124]], [[VAL_125]]) : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: [[VAL_127:%.*]] = "tf.Select"([[VAL_117]], [[VAL_118]], [[VAL_126]]) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> -// CHECK: return [[VAL_127]] : tensor<2x3xi32> -// CHECK: } - -// CHECK-LABEL: func @floordiv_f32( -// CHECK-SAME: [[VAL_128:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_129:%.*]] = "tf.Div"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_130:%.*]] = "tf.Div"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_131:%.*]] = "tf.FloorDiv"([[VAL_128]], [[VAL_128]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_131]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @floordiv_f16_broadcast( -// CHECK-SAME: [[VAL_132:%.*]]: tensor<2x3xf16>, [[VAL_133:%.*]]: tensor<3xf16>) -> tensor<2x3xf16> { -// CHECK: [[VAL_134:%.*]] = "tf.Div"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> -// CHECK: [[VAL_135:%.*]] = "tf.Div"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> -// CHECK: [[VAL_136:%.*]] = "tf.FloorDiv"([[VAL_132]], [[VAL_133]]) : (tensor<2x3xf16>, tensor<3xf16>) -> 
tensor<2x3xf16> -// CHECK: return [[VAL_136]] : tensor<2x3xf16> -// CHECK: } - -// CHECK-LABEL: func @equal( -// CHECK-SAME: [[VAL_137:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_138:%.*]] = "tf.Equal"([[VAL_137]], [[VAL_137]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_138]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @equal_dynamic( -// CHECK-SAME: [[VAL_139:%.*]]: tensor, [[VAL_140:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_141:%.*]] = "tf.Equal"([[VAL_139]], [[VAL_140]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_141]] : tensor -// CHECK: } - -// CHECK-LABEL: func @equal_broadcast( -// CHECK-SAME: [[VAL_142:%.*]]: tensor<1xi32>, [[VAL_143:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_144:%.*]] = "tf.Equal"([[VAL_142]], [[VAL_143]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_144]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @equal_broadcast_no_incompatible_shapes_error( -// CHECK-SAME: [[VAL_145:%.*]]: tensor<2xi32>, [[VAL_146:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_147:%.*]] = "tf.Equal"([[VAL_145]], [[VAL_146]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_147]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @equal_incompatible_shape_broadcastable( -// CHECK-SAME: [[VAL_148:%.*]]: tensor, [[VAL_149:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_150:%.*]] = "tf.Equal"([[VAL_148]], [[VAL_149]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_150]] : tensor -// CHECK: } - -// CHECK-LABEL: func @notequal( -// CHECK-SAME: [[VAL_151:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_152:%.*]] = "tf.NotEqual"([[VAL_151]], [[VAL_151]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_152]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @notequal_broadcast( -// CHECK-SAME: [[VAL_153:%.*]]: tensor<1xi32>, [[VAL_154:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_155:%.*]] = "tf.NotEqual"([[VAL_153]], [[VAL_154]]) {incompatible_shape_error = true} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_155]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error( -// CHECK-SAME: [[VAL_156:%.*]]: tensor<2xi32>, [[VAL_157:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_158:%.*]] = "tf.NotEqual"([[VAL_156]], [[VAL_157]]) {incompatible_shape_error = true} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_158]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable( -// CHECK-SAME: [[VAL_159:%.*]]: tensor, [[VAL_160:%.*]]: tensor<1xi32>) -> tensor { -// CHECK: [[VAL_161:%.*]] = "tf.NotEqual"([[VAL_159]], [[VAL_160]]) {incompatible_shape_error = true} : (tensor, tensor<1xi32>) -> tensor -// CHECK: return [[VAL_161]] : tensor -// CHECK: } - -// CHECK-LABEL: func @greater( -// CHECK-SAME: [[VAL_162:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_163:%.*]] = "tf.Greater"([[VAL_162]], [[VAL_162]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_163]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_greater( -// CHECK-SAME: [[VAL_164:%.*]]: tensor<1xi32>, 
[[VAL_165:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_166:%.*]] = "tf.Greater"([[VAL_164]], [[VAL_165]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_166]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @greater_equal( -// CHECK-SAME: [[VAL_167:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_168:%.*]] = "tf.GreaterEqual"([[VAL_167]], [[VAL_167]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_168]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_greater_equal( -// CHECK-SAME: [[VAL_169:%.*]]: tensor<1xi32>, [[VAL_170:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_171:%.*]] = "tf.GreaterEqual"([[VAL_169]], [[VAL_170]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_171]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @less( -// CHECK-SAME: [[VAL_172:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_173:%.*]] = "tf.Less"([[VAL_172]], [[VAL_172]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_173]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_less( -// CHECK-SAME: [[VAL_174:%.*]]: tensor<1xi32>, [[VAL_175:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_176:%.*]] = "tf.Less"([[VAL_174]], [[VAL_175]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_176]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @less_equal( -// CHECK-SAME: [[VAL_177:%.*]]: tensor<2xi32>) -> tensor<2xi1> { -// CHECK: [[VAL_178:%.*]] = "tf.LessEqual"([[VAL_177]], [[VAL_177]]) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> -// CHECK: return [[VAL_178]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @broadcast_less_equal( -// CHECK-SAME: [[VAL_179:%.*]]: tensor<1xi32>, [[VAL_180:%.*]]: tensor<1x2xi32>) -> tensor<1x2xi1> { -// CHECK: [[VAL_181:%.*]] = "tf.LessEqual"([[VAL_179]], [[VAL_180]]) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> -// CHECK: return [[VAL_181]] : tensor<1x2xi1> -// CHECK: } - -// CHECK-LABEL: func @concat_v2( -// CHECK-SAME: [[VAL_182:%.*]]: tensor<3x3xf32>, [[VAL_183:%.*]]: tensor<3x3xf32>) -> tensor<6x3xf32> { -// CHECK: [[VAL_184:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_185:%.*]] = "tf.ConcatV2"([[VAL_182]], [[VAL_183]], [[VAL_184]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> -// CHECK: return [[VAL_185]] : tensor<6x3xf32> -// CHECK: } - -// CHECK-LABEL: func @concat_v2_1d_axis( -// CHECK-SAME: [[VAL_186:%.*]]: tensor<3x3xf32>, [[VAL_187:%.*]]: tensor<3x3xf32>) -> tensor<3x6xf32> { -// CHECK: [[VAL_188:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: [[VAL_189:%.*]] = "tf.ConcatV2"([[VAL_186]], [[VAL_187]], [[VAL_188]]) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<3x6xf32> -// CHECK: return [[VAL_189]] : tensor<3x6xf32> -// CHECK: } - -// CHECK-LABEL: func @const() -> tensor<2xi32> { -// CHECK: [[VAL_190:%.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK: return [[VAL_190]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @relu( -// CHECK-SAME: [[VAL_192:%.*]]: tensor<1xi32>) -> tensor<1xi32> { -// CHECK: [[VAL_193:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_194:%.*]] = "tf.Maximum"([[VAL_193]], [[VAL_192]]) : (tensor, tensor<1xi32>) -> tensor<1xi32> -// CHECK: return [[VAL_194]] : tensor<1xi32> -// CHECK: } - -// CHECK-LABEL: func @relu_unranked( -// CHECK-SAME: 
[[VAL_195:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_196:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_197:%.*]] = "tf.Maximum"([[VAL_196]], [[VAL_195]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_197]] : tensor -// CHECK: } - -// CHECK-LABEL: func @relu6( -// CHECK-SAME: [[VAL_198:%.*]]: tensor<1xi32>) -> tensor<1xi32> { -// CHECK: [[VAL_199:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_200:%.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: [[VAL_201:%.*]] = "tf.Minimum"([[VAL_198]], [[VAL_200]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> -// CHECK: [[VAL_202:%.*]] = "tf.Maximum"([[VAL_201]], [[VAL_199]]) : (tensor<1xi32>, tensor) -> tensor<1xi32> -// CHECK: return [[VAL_202]] : tensor<1xi32> -// CHECK: } - -// CHECK-LABEL: func @relu6_unranked( -// CHECK-SAME: [[VAL_203:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_204:%.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: [[VAL_205:%.*]] = "tf.Const"() {value = dense<6> : tensor} : () -> tensor -// CHECK: [[VAL_206:%.*]] = "tf.Minimum"([[VAL_203]], [[VAL_205]]) : (tensor, tensor) -> tensor -// CHECK: [[VAL_207:%.*]] = "tf.Maximum"([[VAL_206]], [[VAL_204]]) : (tensor, tensor) -> tensor -// CHECK: return [[VAL_207]] : tensor -// CHECK: } - -// CHECK-LABEL: func @relu_grad( -// CHECK-SAME: [[VAL_208:%.*]]: tensor<4x8xf32>, [[VAL_209:%.*]]: tensor) -> tensor<4x8xf32> { -// CHECK: [[VAL_210:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} : () -> tensor -// CHECK: [[VAL_211:%.*]] = "tf.Greater"([[VAL_209]], [[VAL_210]]) : (tensor, tensor) -> tensor -// CHECK: [[VAL_212:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<4x8xf32>} : () -> tensor<4x8xf32> -// CHECK: [[VAL_213:%.*]] = "tf.Select"([[VAL_211]], [[VAL_208]], [[VAL_212]]) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> -// CHECK: return [[VAL_213]] : tensor<4x8xf32> -// CHECK: } - -// CHECK-LABEL: func @select( -// CHECK-SAME: [[VAL_214:%.*]]: tensor<2xi1>, [[VAL_215:%.*]]: tensor<2xi32>, [[VAL_216:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_217:%.*]] = "tf.Select"([[VAL_214]], [[VAL_215]], [[VAL_216]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_217]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @select_float( -// CHECK-SAME: [[VAL_218:%.*]]: tensor<2xi1>, [[VAL_219:%.*]]: tensor<2xf32>, [[VAL_220:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_221:%.*]] = "tf.Select"([[VAL_218]], [[VAL_219]], [[VAL_220]]) : (tensor<2xi1>, tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_221]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @select_multidimensional( -// CHECK-SAME: [[VAL_222:%.*]]: tensor<3x2xi1>, [[VAL_223:%.*]]: tensor<3x2xi32>, [[VAL_224:%.*]]: tensor<3x2xi32>) -> tensor<3x2xi32> { -// CHECK: [[VAL_225:%.*]] = "tf.Select"([[VAL_222]], [[VAL_223]], [[VAL_224]]) : (tensor<3x2xi1>, tensor<3x2xi32>, tensor<3x2xi32>) -> tensor<3x2xi32> -// CHECK: return [[VAL_225]] : tensor<3x2xi32> -// CHECK: } - -// CHECK-LABEL: func @selectv2( -// CHECK-SAME: [[VAL_226:%.*]]: tensor<2xi1>, [[VAL_227:%.*]]: tensor<2xi32>, [[VAL_228:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_229:%.*]] = "tf.Select"([[VAL_226]], [[VAL_227]], [[VAL_228]]) : (tensor<2xi1>, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_229]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @selectv2_pred_scalar( -// CHECK-SAME: 
[[VAL_230:%.*]]: tensor, [[VAL_231:%.*]]: tensor<2xi32>, [[VAL_232:%.*]]: tensor<2xi32>) -> tensor<2xi32> { -// CHECK: [[VAL_233:%.*]] = "tf.Select"([[VAL_230]], [[VAL_231]], [[VAL_232]]) : (tensor, tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> -// CHECK: return [[VAL_233]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @transpose_2d( -// CHECK-SAME: [[VAL_234:%.*]]: tensor<2x3xf32>) -> tensor<3x2xf32> { -// CHECK: [[VAL_235:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_236:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_237:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_238:%.*]] = "tf.Transpose"([[VAL_234]], [[VAL_237]]) : (tensor<2x3xf32>, tensor<2xi64>) -> tensor<3x2xf32> -// CHECK: return [[VAL_238]] : tensor<3x2xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_3d_int32( -// CHECK-SAME: [[VAL_239:%.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { -// CHECK: [[VAL_240:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: [[VAL_241:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_242:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_243:%.*]] = "tf.Transpose"([[VAL_239]], [[VAL_242]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> -// CHECK: return [[VAL_243]] : tensor<3x2x1xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_3d( -// CHECK-SAME: [[VAL_244:%.*]]: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { -// CHECK: [[VAL_245:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_246:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_247:%.*]] = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> tensor<3xi64> -// CHECK: [[VAL_248:%.*]] = "tf.Transpose"([[VAL_244]], [[VAL_247]]) : (tensor<1x2x3xf32>, tensor<3xi64>) -> tensor<3x2x1xf32> -// CHECK: return [[VAL_248]] : tensor<3x2x1xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_dynamic_2d( -// CHECK-SAME: [[VAL_249:%.*]]: tensor) -> tensor<4x?xf32> { -// CHECK: [[VAL_250:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_251:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_252:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_253:%.*]] = "tf.Transpose"([[VAL_249]], [[VAL_252]]) : (tensor, tensor<2xi64>) -> tensor<4x?xf32> -// CHECK: return [[VAL_253]] : tensor<4x?xf32> -// CHECK: } - -// CHECK-LABEL: func @transpose_unranked_2d( -// CHECK-SAME: [[VAL_254:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_255:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_256:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_257:%.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_258:%.*]] = "tf.Transpose"([[VAL_254]], [[VAL_257]]) : (tensor<*xf32>, tensor<2xi64>) -> tensor<*xf32> -// CHECK: return [[VAL_258]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @abs( -// CHECK-SAME: [[VAL_259:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_260:%.*]] = "tf.Abs"([[VAL_259]]) : (tensor<2xf32>) -> 
tensor<2xf32> -// CHECK: return [[VAL_260]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @abs_dynamic( -// CHECK-SAME: [[VAL_261:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_262:%.*]] = "tf.Abs"([[VAL_261]]) : (tensor) -> tensor -// CHECK: return [[VAL_262]] : tensor -// CHECK: } - -// CHECK-LABEL: func @abs_unranked( -// CHECK-SAME: [[VAL_263:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_264:%.*]] = "tf.Abs"([[VAL_263]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_264]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @ceil( -// CHECK-SAME: [[VAL_265:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_266:%.*]] = "tf.Ceil"([[VAL_265]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_266]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @ceil_dynamic( -// CHECK-SAME: [[VAL_267:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_268:%.*]] = "tf.Ceil"([[VAL_267]]) : (tensor) -> tensor -// CHECK: return [[VAL_268]] : tensor -// CHECK: } - -// CHECK-LABEL: func @ceil_unranked( -// CHECK-SAME: [[VAL_269:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_270:%.*]] = "tf.Ceil"([[VAL_269]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_270]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @complex_abs( -// CHECK-SAME: [[VAL_271:%.*]]: tensor<2xcomplex>) -> tensor<2xf32> { -// CHECK: [[VAL_272:%.*]] = "tf.ComplexAbs"([[VAL_271]]) : (tensor<2xcomplex>) -> tensor<2xf32> -// CHECK: return [[VAL_272]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @cos( -// CHECK-SAME: [[VAL_273:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_274:%.*]] = "tf.Cos"([[VAL_273]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_274]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @cos_dynamic( -// CHECK-SAME: [[VAL_275:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_276:%.*]] = "tf.Cos"([[VAL_275]]) : (tensor) -> tensor -// CHECK: return [[VAL_276]] : tensor -// CHECK: } - -// CHECK-LABEL: func @cos_unranked( -// CHECK-SAME: [[VAL_277:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_278:%.*]] = "tf.Cos"([[VAL_277]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_278]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @exp( -// CHECK-SAME: [[VAL_279:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_280:%.*]] = "tf.Exp"([[VAL_279]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_280]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @exp_dynamic( -// CHECK-SAME: [[VAL_281:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_282:%.*]] = "tf.Exp"([[VAL_281]]) : (tensor) -> tensor -// CHECK: return [[VAL_282]] : tensor -// CHECK: } - -// CHECK-LABEL: func @exp_unranked( -// CHECK-SAME: [[VAL_283:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_284:%.*]] = "tf.Exp"([[VAL_283]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_284]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @floor( -// CHECK-SAME: [[VAL_285:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_286:%.*]] = "tf.Floor"([[VAL_285]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_286]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @floor_dynamic( -// CHECK-SAME: [[VAL_287:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_288:%.*]] = "tf.Floor"([[VAL_287]]) : (tensor) -> tensor -// CHECK: return [[VAL_288]] : tensor -// CHECK: } - -// CHECK-LABEL: func @floor_unranked( -// CHECK-SAME: [[VAL_289:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_290:%.*]] = 
"tf.Floor"([[VAL_289]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_290]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @is_finite( -// CHECK-SAME: [[VAL_291:%.*]]: tensor<2xf32>) -> tensor<2xi1> { -// CHECK: [[VAL_292:%.*]] = "tf.IsFinite"([[VAL_291]]) : (tensor<2xf32>) -> tensor<2xi1> -// CHECK: return [[VAL_292]] : tensor<2xi1> -// CHECK: } - -// CHECK-LABEL: func @is_finite_dynamic( -// CHECK-SAME: [[VAL_293:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_294:%.*]] = "tf.IsFinite"([[VAL_293]]) : (tensor) -> tensor -// CHECK: return [[VAL_294]] : tensor -// CHECK: } - -// CHECK-LABEL: func @is_finite_unranked( -// CHECK-SAME: [[VAL_295:%.*]]: tensor<*xf32>) -> tensor<*xi1> { -// CHECK: [[VAL_296:%.*]] = "tf.IsFinite"([[VAL_295]]) : (tensor<*xf32>) -> tensor<*xi1> -// CHECK: return [[VAL_296]] : tensor<*xi1> -// CHECK: } - -// CHECK-LABEL: func @log( -// CHECK-SAME: [[VAL_297:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_298:%.*]] = "tf.Log"([[VAL_297]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_298]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @log_dynamic( -// CHECK-SAME: [[VAL_299:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_300:%.*]] = "tf.Log"([[VAL_299]]) : (tensor) -> tensor -// CHECK: return [[VAL_300]] : tensor -// CHECK: } - -// CHECK-LABEL: func @log_unranked( -// CHECK-SAME: [[VAL_301:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_302:%.*]] = "tf.Log"([[VAL_301]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_302]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @log1p( -// CHECK-SAME: [[VAL_303:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_304:%.*]] = "tf.Log1p"([[VAL_303]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_304]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @log1p_dynamic( -// CHECK-SAME: [[VAL_305:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_306:%.*]] = "tf.Log1p"([[VAL_305]]) : (tensor) -> tensor -// CHECK: return [[VAL_306]] : tensor -// CHECK: } - -// CHECK-LABEL: func @log1p_unranked( -// CHECK-SAME: [[VAL_307:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_308:%.*]] = "tf.Log1p"([[VAL_307]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_308]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @neg( -// CHECK-SAME: [[VAL_309:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_310:%.*]] = "tf.Neg"([[VAL_309]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_310]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @neg_dynamic( -// CHECK-SAME: [[VAL_311:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_312:%.*]] = "tf.Neg"([[VAL_311]]) : (tensor) -> tensor -// CHECK: return [[VAL_312]] : tensor -// CHECK: } - -// CHECK-LABEL: func @neg_unranked( -// CHECK-SAME: [[VAL_313:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_314:%.*]] = "tf.Neg"([[VAL_313]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_314]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @sigmoid( -// CHECK-SAME: [[VAL_315:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_316:%.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} : () -> tensor -// CHECK: [[VAL_317:%.*]] = "tf.Const"() {value = dense<2> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_318:%.*]] = "tf.Const"() {value = dense<5.000000e-01> : tensor<2xf32>} : () -> tensor<2xf32> -// CHECK: [[VAL_319:%.*]] = "tf.Mul"([[VAL_315]], [[VAL_318]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_320:%.*]] = 
"tf.Tanh"([[VAL_319]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_321:%.*]] = "tf.Mul"([[VAL_320]], [[VAL_318]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: [[VAL_322:%.*]] = "tf.AddV2"([[VAL_321]], [[VAL_318]]) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_322]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @sin( -// CHECK-SAME: [[VAL_323:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_324:%.*]] = "tf.Sin"([[VAL_323]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_324]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @sin_dynamic( -// CHECK-SAME: [[VAL_325:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_326:%.*]] = "tf.Sin"([[VAL_325]]) : (tensor) -> tensor -// CHECK: return [[VAL_326]] : tensor -// CHECK: } - -// CHECK-LABEL: func @sin_unranked( -// CHECK-SAME: [[VAL_327:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_328:%.*]] = "tf.Sin"([[VAL_327]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_328]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @rsqrt( -// CHECK-SAME: [[VAL_329:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_330:%.*]] = "tf.Rsqrt"([[VAL_329]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_330]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @rsqrt_dynamic( -// CHECK-SAME: [[VAL_331:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_332:%.*]] = "tf.Rsqrt"([[VAL_331]]) : (tensor) -> tensor -// CHECK: return [[VAL_332]] : tensor -// CHECK: } - -// CHECK-LABEL: func @rsqrt_unranked( -// CHECK-SAME: [[VAL_333:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_334:%.*]] = "tf.Rsqrt"([[VAL_333]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_334]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @sqrt( -// CHECK-SAME: [[VAL_335:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_336:%.*]] = "tf.Sqrt"([[VAL_335]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_336]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @sqrt_dynamic( -// CHECK-SAME: [[VAL_337:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_338:%.*]] = "tf.Sqrt"([[VAL_337]]) : (tensor) -> tensor -// CHECK: return [[VAL_338]] : tensor -// CHECK: } - -// CHECK-LABEL: func @sqrt_unranked( -// CHECK-SAME: [[VAL_339:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_340:%.*]] = "tf.Sqrt"([[VAL_339]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_340]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @tanh( -// CHECK-SAME: [[VAL_341:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_342:%.*]] = "tf.Tanh"([[VAL_341]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_342]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @tanh_dynamic( -// CHECK-SAME: [[VAL_343:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_344:%.*]] = "tf.Tanh"([[VAL_343]]) : (tensor) -> tensor -// CHECK: return [[VAL_344]] : tensor -// CHECK: } - -// CHECK-LABEL: func @tanh_unranked( -// CHECK-SAME: [[VAL_345:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_346:%.*]] = "tf.Tanh"([[VAL_345]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_346]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @bitcast( -// CHECK-SAME: [[VAL_347:%.*]]: tensor<2xf32>) -> tensor<2xf32> { -// CHECK: [[VAL_348:%.*]] = "tf.Bitcast"([[VAL_347]]) : (tensor<2xf32>) -> tensor<2xf32> -// CHECK: return [[VAL_348]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @bitcast_dynamic( -// CHECK-SAME: [[VAL_349:%.*]]: tensor) -> tensor { -// 
CHECK: [[VAL_350:%.*]] = "tf.Bitcast"([[VAL_349]]) : (tensor) -> tensor -// CHECK: return [[VAL_350]] : tensor -// CHECK: } - -// CHECK-LABEL: func @bitcast_unranked( -// CHECK-SAME: [[VAL_351:%.*]]: tensor<*xf32>) -> tensor<*xf32> { -// CHECK: [[VAL_352:%.*]] = "tf.Bitcast"([[VAL_351]]) : (tensor<*xf32>) -> tensor<*xf32> -// CHECK: return [[VAL_352]] : tensor<*xf32> -// CHECK: } - -// CHECK-LABEL: func @bitcast_same_widths( -// CHECK-SAME: [[VAL_353:%.*]]: tensor<2xf32>) -> tensor<2xi32> { -// CHECK: [[VAL_354:%.*]] = "tf.Bitcast"([[VAL_353]]) : (tensor<2xf32>) -> tensor<2xi32> -// CHECK: return [[VAL_354]] : tensor<2xi32> -// CHECK: } - -// CHECK-LABEL: func @sign( -// CHECK-SAME: [[VAL_355:%.*]]: tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { -// CHECK: [[VAL_356:%.*]] = "tf.NotEqual"([[VAL_355]], [[VAL_355]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> -// CHECK: [[VAL_357:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_358:%.*]] = "tf.NotEqual"([[VAL_355]], [[VAL_355]]) {incompatible_shape_error = true} : (tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xi1> -// CHECK: [[VAL_359:%.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<1x2x3x4xf32>} : () -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_360:%.*]] = "tf.Sign"([[VAL_355]]) : (tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_361:%.*]] = "tf.Select"([[VAL_358]], [[VAL_359]], [[VAL_360]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> -// CHECK: [[VAL_362:%.*]] = "tf.Select"([[VAL_356]], [[VAL_357]], [[VAL_361]]) : (tensor<1x2x3x4xi1>, tensor<1x2x3x4xf32>, tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> -// CHECK: return [[VAL_362]] : tensor<1x2x3x4xf32> -// CHECK: } - -// CHECK-LABEL: func @size_rank_one_i32( -// CHECK-SAME: [[VAL_363:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_364:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: return [[VAL_364]] : tensor -// CHECK: } - -// CHECK-LABEL: func @size_rank_one_i64( -// CHECK-SAME: [[VAL_365:%.*]]: tensor) -> tensor { -// CHECK: [[VAL_366:%.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: return [[VAL_366]] : tensor -// CHECK: } - -// CHECK-LABEL: func @complex( -// CHECK-SAME: [[VAL_367:%.*]]: tensor<3xf32>, [[VAL_368:%.*]]: tensor<3xf32>) -> tensor<3xcomplex> { -// CHECK: [[VAL_369:%.*]] = "tf.Complex"([[VAL_367]], [[VAL_368]]) : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xcomplex> -// CHECK: return [[VAL_369]] : tensor<3xcomplex> -// CHECK: } - -// CHECK-LABEL: func @convert_i32_f32( -// CHECK-SAME: [[VAL_370:%.*]]: tensor<2xi32>) -> tensor<2xf32> { -// CHECK: [[VAL_371:%.*]] = "tf.Cast"([[VAL_370]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> -// CHECK: return [[VAL_371]] : tensor<2xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_slice( -// CHECK-SAME: [[VAL_372:%.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { -// CHECK: [[VAL_373:%.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_374:%.*]] = "tf.Const"() {value = dense<[1, 519]> : tensor<2xi64>} : () -> tensor<2xi64> -// CHECK: [[VAL_375:%.*]] = "tf.Slice"([[VAL_372]], [[VAL_373]], [[VAL_374]]) : (tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> -// CHECK: return [[VAL_375]] : tensor<1x519xf32> -// CHECK: } - -// CHECK-LABEL: func @reshape( -// CHECK-SAME: [[VAL_372:%.*]]: tensor<4x6xf32>) -> tensor<2x2x6xf32> { 
-// CHECK: [[VAL_373:%.*]] = constant dense<[2, 2, 6]> : tensor<3xi64> -// CHECK: [[VAL_374:%.*]] = "tf.Reshape"([[VAL_372]], [[VAL_373]]) : (tensor<4x6xf32>, tensor<3xi64>) -> tensor<2x2x6xf32> -// CHECK: return [[VAL_374]] : tensor<2x2x6xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_1d_2d( -// CHECK-SAME: [[VAL_376:%.*]]: tensor<256xf32>, [[VAL_377:%.*]]: tensor<256x1xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_378:%.*]] = "tf.Reshape"([[VAL_376]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK: [[VAL_379:%.*]] = "tf.MatMul"([[VAL_378]], [[VAL_377]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> -// CHECK: [[VAL_380:%.*]] = "tf.Reshape"([[VAL_379]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_380]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_2d_1d( -// CHECK-SAME: [[VAL_381:%.*]]: tensor<1x256xf32>, [[VAL_382:%.*]]: tensor<256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_383:%.*]] = "tf.Reshape"([[VAL_382]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK: [[VAL_384:%.*]] = "tf.MatMul"([[VAL_381]], [[VAL_383]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> -// CHECK: [[VAL_385:%.*]] = "tf.Reshape"([[VAL_384]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_385]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_1d_1d( -// CHECK-SAME: [[VAL_386:%.*]]: tensor<256xf32>, [[VAL_387:%.*]]: tensor<256xf32>) -> tensor { -// CHECK-DAG: [[VAL_388:%.*]] = "tf.Reshape"([[VAL_386]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK-DAG: [[VAL_389:%.*]] = "tf.Reshape"([[VAL_387]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> -// CHECK: [[VAL_390:%.*]] = "tf.MatMul"([[VAL_388]], [[VAL_389]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> -// CHECK: [[VAL_391:%.*]] = "tf.Reshape"([[VAL_390]], {{.*}}) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor -// CHECK: return [[VAL_391]] : tensor -// CHECK: } - -// CHECK-LABEL: func @convert_dot_2d_2d( -// CHECK-SAME: [[VAL_392:%.*]]: tensor<1x256xf32>, [[VAL_393:%.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { -// CHECK: [[VAL_394:%.*]] = "tf.MatMul"([[VAL_392]], [[VAL_393]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> -// CHECK: return [[VAL_394]] : tensor<1x1xf32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_in_dim_tf_style( -// CHECK-SAME: [[VAL_395:%.*]]: tensor<8x1x16xf32>) -> tensor<3x8x8x16xf32> { -// CHECK: [[VAL_396:%.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> -// CHECK: [[VAL_397:%.*]] = "tf.BroadcastTo"([[VAL_395]], [[VAL_396]]) : (tensor<8x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> -// CHECK: return [[VAL_397]] : tensor<3x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @broadcast_in_dim_general_case( -// CHECK-SAME: [[VAL_398:%.*]]: tensor<3x1x16xf32>) -> tensor<3x8x8x16xf32> { -// CHECK: [[VAL_399:%.*]] = constant dense<[3, 1, 1, 16]> : tensor<4xi64> -// CHECK: [[VAL_400:%.*]] = "tf.Reshape"([[VAL_398]], [[VAL_399]]) : (tensor<3x1x16xf32>, tensor<4xi64>) -> tensor<3x1x1x16xf32> -// CHECK: [[VAL_401:%.*]] = constant dense<[3, 8, 8, 16]> : tensor<4xi64> -// CHECK: [[VAL_402:%.*]] = "tf.BroadcastTo"([[VAL_400]], [[VAL_401]]) : (tensor<3x1x1x16xf32>, tensor<4xi64>) -> tensor<3x8x8x16xf32> -// CHECK: return 
[[VAL_402]] : tensor<3x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_dot_general( -// CHECK-SAME: [[VAL_396:%.*]]: tensor<3x2x6x5x1xf32>, [[VAL_397:%.*]]: tensor<3x2x4x6xf32>) -> tensor<3x5x1x4xf32> { -// CHECK: [[VAL_398:%.*]] = "tf.Transpose"([[VAL_396]], {{.*}}) : (tensor<3x2x6x5x1xf32>, tensor<5xi64>) -> tensor<3x5x1x2x6xf32> -// CHECK: [[VAL_399:%.*]] = "tf.Transpose"([[VAL_397]], {{.*}}) : (tensor<3x2x4x6xf32>, tensor<4xi64>) -> tensor<3x2x6x4xf32> -// CHECK: [[VAL_400:%.*]] = "tf.Reshape"([[VAL_398]], {{.*}}) : (tensor<3x5x1x2x6xf32>, tensor<3xi64>) -> tensor<3x5x12xf32> -// CHECK: [[VAL_401:%.*]] = "tf.Reshape"([[VAL_399]], {{.*}}) : (tensor<3x2x6x4xf32>, tensor<3xi64>) -> tensor<3x12x4xf32> -// CHECK: [[VAL_402:%.*]] = "tf.BatchMatMulV2"([[VAL_400]], [[VAL_401]]) {adj_x = false, adj_y = false} : (tensor<3x5x12xf32>, tensor<3x12x4xf32>) -> tensor<3x5x4xf32> -// CHECK: [[VAL_403:%.*]] = "tf.Reshape"([[VAL_402]], {{.*}}) : (tensor<3x5x4xf32>, tensor<4xi64>) -> tensor<3x5x1x4xf32> -// CHECK: return [[VAL_403]] : tensor<3x5x1x4xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_conv2d( -// CHECK-SAME: [[VAL_404:%.*]]: tensor<1x8x8x207xf32>, [[VAL_405:%.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: [[VAL_406:%.*]] = "tf.Conv2D"([[VAL_404]], [[VAL_405]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: return [[VAL_406]] : tensor<1x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_depthwise_conv2d( -// CHECK-SAME: [[VAL_407:%.*]]: tensor<1x8x8x207xf32>, [[VAL_408:%.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: [[VAL_409:%.*]] = "tf.DepthwiseConv2dNative"([[VAL_407]], [[VAL_408]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: return [[VAL_409]] : tensor<1x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_conv2d_valid_padding( -// CHECK-SAME: [[VAL_410:%.*]]: tensor<1x8x8x207xf32>, [[VAL_411:%.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { -// CHECK: [[VAL_412:%.*]] = "tf.Conv2D"([[VAL_410]], [[VAL_411]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x8x8x207xf32>, tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> -// CHECK: return [[VAL_412]] : tensor<1x8x8x16xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_reduce_to_sum( -// CHECK-SAME: [[VAL_413:%.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_414:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_415:%.*]] = "tf.Sum"([[VAL_413:%.*]], [[VAL_414:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_415]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_reduce_to_max( -// CHECK-SAME: [[VAL_416:%.*]]: tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_417:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_418:%.*]] = "tf.Max"([[VAL_416:%.*]], [[VAL_417:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_418]] : tensor<1xf32> -// CHECK: } - -// CHECK-LABEL: func @convert_reduce_to_min( -// CHECK-SAME: [[VAL_419:%.*]]: 
tensor<1x256xf32>) -> tensor<1xf32> { -// CHECK: [[VAL_420:%.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64> -// CHECK: [[VAL_421:%.*]] = "tf.Min"([[VAL_419:%.*]], [[VAL_420:%.*]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32> -// CHECK: return [[VAL_421]] : tensor<1xf32> -// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index ea55e50db30..b1787546d67 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -86,10 +86,19 @@ func @mul_no_nan(%arg0: tensor<2x3xf32>, %arg1: tensor<3xf32>) -> tensor<2x3xf32 return %0 : tensor<2x3xf32> } +// CHECK-LABEL: @is_inf +func @is_inf(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { + // CHECK: %[[INF:.*]] = "tf.Const"() {value = dense<0x7F800000> : tensor} : () -> tensor + // CHECK: %[[ABS:.*]] = "tf.Abs"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xf32> + // CHECK: %[[RESULT:.*]] = "tf.Equal"(%[[ABS]], %[[INF]]) {incompatible_shape_error = true} : (tensor<3x4xf32>, tensor) -> tensor<3x4xi1> + %0 = "tf.IsInf"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xi1> + // CHECK: return %[[RESULT]] + return %0 : tensor<3x4xi1> +} + // CHECK-LABEL: @is_nan func @is_nan(%arg0: tensor<3x4xf32>) -> tensor<3x4xi1> { - // CHECK: %[[NAN:.*]] = "tf.Const"() {value = dense<0x7FC00000> : tensor} : () -> tensor - // CHECK: %[[RESULT:.*]] = "tf.Equal"(%arg0, %[[NAN]]) {incompatible_shape_error = true} : (tensor<3x4xf32>, tensor) -> tensor<3x4xi1> + // CHECK: %[[RESULT:.*]] = "tf.NotEqual"(%arg0, %arg0) {incompatible_shape_error = true} : (tensor<3x4xf32>, tensor<3x4xf32>) -> tensor<3x4xi1> %0 = "tf.IsNan"(%arg0) : (tensor<3x4xf32>) -> tensor<3x4xi1> // CHECK: return %[[RESULT]] return %0 : tensor<3x4xi1> @@ -215,6 +224,112 @@ func @rsqrt_grad_unranked(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor< return %0 : tensor<*xf32> } +// %input has 1 batch dimension then 2 block dimensions then 1 remainder +// dimension. 
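The fourdim test below pins down the PadV2 -> Reshape -> Transpose -> Reshape sequence that the SpaceToBatchND lowering emits. As a rough, hypothetical NumPy sketch of the same decomposition for a static [batch, s0, s1, depth] input (the shapes and helper name here are illustrative and not taken from the test):

import numpy as np

def space_to_batch_nd_4d(x, block_shape, paddings):
    # Reference decomposition mirroring the lowering checked below:
    # pad the spatial dims, split each into (outer, block), move the block
    # dims in front of batch, then fold them into batch. Illustrative only.
    batch, _, _, depth = x.shape
    b0, b1 = block_shape
    padded = np.pad(x, [(0, 0), tuple(paddings[0]), tuple(paddings[1]), (0, 0)])
    p0, p1 = padded.shape[1], padded.shape[2]
    reshaped = padded.reshape(batch, p0 // b0, b0, p1 // b1, b1, depth)
    # Permutation [2, 4, 0, 1, 3, 5], as in the tf.Transpose check below.
    permuted = reshaped.transpose(2, 4, 0, 1, 3, 5)
    return permuted.reshape(batch * b0 * b1, p0 // b0, p1 // b1, depth)

x = np.arange(3 * 5 * 8 * 10, dtype=np.float32).reshape(3, 5, 8, 10)
print(space_to_batch_nd_4d(x, (1, 2), [(0, 0), (0, 0)]).shape)  # (6, 5, 4, 10)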
+// CHECK-LABEL: fourdim_SpaceToBatchND +func @fourdim_SpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2x2xi64>) -> tensor { + // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() {value = dense<0> : tensor<1x2xi64>} + // CHECK-DAG: [[ZERO_I32:%.+]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK-DAG: [[ZERO_I64:%.+]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK-DAG: [[ONE_I64:%.+]] = "tf.Const"() {value = dense<1> : tensor} + // CHECK-DAG: [[FULL_PADDINGS:%.+]] = "tf.ConcatV2"([[PAD00]], %arg2, [[PAD00]], [[ZERO_I64]]) + // CHECK-DAG: [[PAD_DEFAULT:%.+]] = "tf.Const"() {value = dense<0.000000e+00> : tensor} + // CHECK-DAG: [[PADDED:%.+]] = "tf.PadV2"(%arg0, [[FULL_PADDINGS]], [[PAD_DEFAULT]]) + // CHECK-DAG: [[PADDINGS_SUM:%.+]] = "tf.Sum"([[FULL_PADDINGS]], [[ONE_I64]]) + // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() {value = dense<[3, 5, 7, 10]> : tensor<4xi64>} + // CHECK-DAG: [[PADDED_SHAPE:%.+]] = "tf.Add"([[PADDINGS_SUM]], [[INPUT_SHAPE]]) + // CHECK-DAG: [[PADDED_SHAPE_SPLITS:%.+]]:4 = "tf.Split"([[ZERO_I32]], [[PADDED_SHAPE]]) + // CHECK-DAG: [[BLOCK_SHAPE_SPLITS:%.+]]:2 = "tf.Split"([[ZERO_I32]], %arg1) + // CHECK-DAG: [[OUTER_SHAPE_0:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#1, [[BLOCK_SHAPE_SPLITS]]#0) + // CHECK-DAG: [[OUTER_SHAPE_1:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#2, [[BLOCK_SHAPE_SPLITS]]#1) + // CHECK-DAG: [[RESHAPED_SHAPE:%.+]] = "tf.ConcatV2"([[PADDED_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_0]], [[BLOCK_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_1]], [[BLOCK_SHAPE_SPLITS]]#1, [[PADDED_SHAPE_SPLITS]]#3, [[ZERO_I64]]) + // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() {value = dense<[2, 4, 0, 1, 3, 5]> : tensor<6xi64>} + // CHECK-DAG: [[OUTPUT_BATCH_PART:%.+]] = "tf.Mul"([[PADDED_SHAPE_SPLITS]]#0, [[BLOCK_SHAPE_SPLITS]]#0) + // CHECK-DAG: [[OUTPUT_BATCH:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART]], [[BLOCK_SHAPE_SPLITS]]#1) + // CHECK-DAG: [[OUTPUT_SHAPE:%.+]] = "tf.ConcatV2"([[OUTPUT_BATCH]], [[OUTER_SHAPE_0]], [[OUTER_SHAPE_1]], [[PADDED_SHAPE_SPLITS]]#3, [[ZERO_I64]]) + // CHECK-DAG: [[RESHAPED:%.+]] = "tf.Reshape"([[PADDED]], [[RESHAPED_SHAPE]]) + // CHECK-DAG: [[PERMUTED:%.+]] = "tf.Transpose"([[RESHAPED]], [[PERMUTATION]]) + // CHECK-DAG: [[RESULT:%.+]] = "tf.Reshape"([[PERMUTED]], [[OUTPUT_SHAPE]]) + // CHECK-DAG: return [[RESULT]] + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor + return %0 : tensor +} + +// %input has 1 batch dimension then 3 block dimensions then 2 remainder +// dimensions. This checks only ops that are specific to the case with 3 block +// dimension and 2 remainder dimensions. 
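The sixdim test that follows only spot-checks the pieces that change when there are 3 block dimensions and 2 remainder dimensions. The shape bookkeeping it verifies (reshaped shape, permutation, output batch) generalizes as in this hypothetical Python helper, which is not part of the pass or the tests:

def space_to_batch_shapes(input_shape, block_shape, paddings):
    # Computes the values the CHECK-DAG lines derive with tf.Split, tf.Div,
    # tf.Mul and tf.ConcatV2.
    batch, rest = input_shape[0], input_shape[1:]
    m = len(block_shape)
    spatial, remainder = rest[:m], rest[m:]
    padded = [s + lo + hi for s, (lo, hi) in zip(spatial, paddings)]
    outer = [p // b for p, b in zip(padded, block_shape)]
    reshaped = [batch]
    for o, b in zip(outer, block_shape):   # interleave (outer_i, block_i)
        reshaped += [o, b]
    reshaped += list(remainder)
    # Block axes come first in the transpose, then batch, outer and remainder axes.
    permutation = ([2 + 2 * i for i in range(m)] + [0]
                   + [1 + 2 * i for i in range(m)]
                   + list(range(1 + 2 * m, 1 + 2 * m + len(remainder))))
    out_batch = batch
    for b in block_shape:                  # the chain of tf.Mul ops
        out_batch *= b
    return reshaped, permutation, [out_batch] + outer + list(remainder)

# For the 6-D test below: permutation == [2, 4, 6, 0, 1, 3, 5, 7, 8].
print(space_to_batch_shapes([3, 5, 7, 9, 10, 11], [1, 1, 3], [(0, 0)] * 3))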
+// CHECK-LABEL: sixdim_SpaceToBatchND +func @sixdim_SpaceToBatchND(%input: tensor<3x5x7x9x10x11xf32>, %block_shape: tensor<3xi64>, %paddings: tensor<3x2xi64>) -> tensor { + // CHECK-DAG: [[PAD00:%.+]] = "tf.Const"() + // CHECK-DAG: [[FULL_PADDINGS:%.+]] = "tf.ConcatV2"([[PAD00]], %arg2, [[PAD00]], [[PAD00]], {{.+}}) + // CHECK-DAG: [[INPUT_SHAPE:%.+]] = "tf.Const"() {value = dense<[3, 5, 7, 9, 10, 11]> : tensor<6xi64>} + // CHECK-DAG: [[PADDED_SHAPE_SPLITS:%.+]]:6 = "tf.Split" + // CHECK-DAG: [[BLOCK_SHAPE_SPLITS:%.+]]:3 = "tf.Split" + // CHECK-DAG: [[OUTER_SHAPE_0:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#1, [[BLOCK_SHAPE_SPLITS]]#0) + // CHECK-DAG: [[OUTER_SHAPE_1:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#2, [[BLOCK_SHAPE_SPLITS]]#1) + // CHECK-DAG: [[OUTER_SHAPE_2:%.+]] = "tf.Div"([[PADDED_SHAPE_SPLITS]]#3, [[BLOCK_SHAPE_SPLITS]]#2) + // CHECK-DAG: [[RESHAPED_SHAPE:%.+]] = "tf.ConcatV2"([[PADDED_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_0]], [[BLOCK_SHAPE_SPLITS]]#0, [[OUTER_SHAPE_1]], [[BLOCK_SHAPE_SPLITS]]#1, [[OUTER_SHAPE_2]], [[BLOCK_SHAPE_SPLITS]]#2, [[PADDED_SHAPE_SPLITS]]#4, [[PADDED_SHAPE_SPLITS]]#5, {{.+}}) + // CHECK-DAG: [[PERMUTATION:%.+]] = "tf.Const"() {value = dense<[2, 4, 6, 0, 1, 3, 5, 7, 8]> : tensor<9xi64>} + // CHECK-DAG: [[OUTPUT_BATCH_PART1:%.+]] = "tf.Mul"([[PADDED_SHAPE_SPLITS]]#0, [[BLOCK_SHAPE_SPLITS]]#0) + // CHECK-DAG: [[OUTPUT_BATCH_PART2:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART1]], [[BLOCK_SHAPE_SPLITS]]#1) + // CHECK-DAG: [[OUTPUT_BATCH:%.+]] = "tf.Mul"([[OUTPUT_BATCH_PART2]], [[BLOCK_SHAPE_SPLITS]]#2) + // CHECK-DAG: [[OUTPUT_SHAPE:%.+]] = "tf.ConcatV2"([[OUTPUT_BATCH]], [[OUTER_SHAPE_0]], [[OUTER_SHAPE_1]], [[OUTER_SHAPE_2]], [[PADDED_SHAPE_SPLITS]]#4, [[PADDED_SHAPE_SPLITS]]#5, {{.+}}) + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x9x10x11xf32>, tensor<3xi64>, tensor<3x2xi64>) -> tensor + return %0 : tensor +} + +func @fake_quant_with_min_max_args(%arg0 : tensor) -> tensor { + // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<1.275000e+02> : tensor} + // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<1.00392163> : tensor} + // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<-0.996078491> : tensor} + // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<0.00784313772> : tensor} + // CHECK-DAG: [[VAL4:%.+]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} + // CHECK-DAG: [[VAL5:%.+]] = "tf.ClipByValue"(%arg0, [[VAL2]], [[VAL1]]) + // CHECK-DAG: [[VAL6:%.+]] = "tf.Sub"([[VAL5]], [[VAL2]]) + // CHECK-DAG: [[VAL7:%.+]] = "tf.Mul"([[VAL6]], [[VAL0]]) + // CHECK-DAG: [[VAL8:%.+]] = "tf.Add"([[VAL7]], [[VAL4]]) + // CHECK-DAG: [[VAL9:%.+]] = "tf.Floor"([[VAL8]]) + // CHECK-DAG: [[VAL10:%.+]] = "tf.Mul"([[VAL9]], [[VAL3]]) + // CHECK-DAG: [[VAL11:%.+]] = "tf.Add"([[VAL10]], [[VAL2]]) + %0 = "tf.FakeQuantWithMinMaxArgs"(%arg0) {max = 1.0 : f32, min = -1.0 : f32, narrow_range = false, num_bits = 8 : i64} : (tensor) -> tensor + + // CHECK: return [[VAL11]] + return %0 : tensor +} + +func @fake_quant_with_min_max_vars(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { + // CHECK-DAG: [[VAL0:%.+]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK-DAG: [[VAL1:%.+]] = "tf.Const"() {value = dense<2.550000e+02> + // CHECK-DAG: [[VAL2:%.+]] = "tf.Const"() {value = dense<1.000000e+00> + // CHECK-DAG: [[VAL3:%.+]] = "tf.Const"() {value = dense<5.000000e-01> + // CHECK-DAG: [[VAL4:%.+]] = "tf.Sub"(%arg2, %arg1) + // CHECK-DAG: [[VAL5:%.+]] = "tf.Div"([[VAL4]], [[VAL1]]) + // CHECK-DAG: [[VAL6:%.+]] = "tf.Div"([[VAL1]], 
[[VAL4]]) + // CHECK-DAG: [[VAL7:%.+]] = "tf.Div"(%arg1, [[VAL5]]) + // CHECK-DAG: [[VAL8:%.+]] = "tf.Sub"([[VAL0]], [[VAL7]]) + // CHECK-DAG: [[VAL9:%.+]] = "tf.Floor"([[VAL8]]) + // CHECK-DAG: [[VAL10:%.+]] = "tf.Sub"([[VAL8]], [[VAL9]]) + // CHECK-DAG: [[VAL11:%.+]] = "tf.Less"([[VAL10]], [[VAL3]]) + // CHECK-DAG: [[VAL12:%.+]] = "tf.Add"([[VAL2]], [[VAL9]]) + // CHECK-DAG: [[VAL13:%.+]] = "tf.Select"([[VAL11]], [[VAL9]], [[VAL12]]) + // CHECK-DAG: [[VAL14:%.+]] = "tf.ClipByValue"([[VAL13]], [[VAL0]], [[VAL1]]) : + // CHECK-DAG: [[VAL15:%.+]] = "tf.Sub"([[VAL0]], [[VAL14]]) + // CHECK-DAG: [[VAL16:%.+]] = "tf.Sub"([[VAL1]], [[VAL14]]) + // CHECK-DAG: [[VAL17:%.+]] = "tf.Mul"([[VAL15]], [[VAL5]]) + // CHECK-DAG: [[VAL18:%.+]] = "tf.Mul"([[VAL16]], [[VAL5]]) + // CHECK-DAG: [[VAL19:%.+]] = "tf.ClipByValue"(%arg0, [[VAL17]], [[VAL18]]) + // CHECK-DAG: [[VAL20:%.+]] = "tf.Sub"([[VAL19]], [[VAL17]]) + // CHECK-DAG: [[VAL21:%.+]] = "tf.Mul"([[VAL20]], [[VAL6]]) + // CHECK-DAG: [[VAL22:%.+]] = "tf.Add"([[VAL21]], [[VAL3]]) + // CHECK-DAG: [[VAL23:%.+]] = "tf.Floor"([[VAL22]]) + // CHECK-DAG: [[VAL24:%.+]] = "tf.Mul"([[VAL23]], [[VAL5]]) + // CHECK-DAG: [[VAL25:%.+]] = "tf.Add"([[VAL24]], [[VAL17]]) + %0 = "tf.FakeQuantWithMinMaxVars"(%arg0, %arg1, %arg2) {narrow_range = false, num_bits = 8 : i64} : (tensor, tensor, tensor) -> tensor + + // CHECK: return [[VAL25]] + return %0 : tensor +} + // CHECK-LABEL: SoftmaxCrossEntropyWithLogits // CHECK-SAME: %[[FEATURES:.*]]: tensor<2x3xf32>, %[[LABELS:.*]]: tensor<2x3xf32> func @SoftmaxCrossEntropyWithLogits(%features: tensor<2x3xf32>, %labels: tensor<2x3xf32>) -> (tensor<2xf32>, tensor<2x3xf32>) { @@ -533,3 +648,59 @@ func @_UnaryOpsComposition(%arg0: tensor<4xf32>) -> tensor<4xf32> { %0 = "tf._UnaryOpsComposition"(%arg0) {op_names = ["Asin", "Abs", "Log"]} : (tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } + + +// CHECK-LABEL: @round_int +func @round_int(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: [[IDENTITY:%.+]] = "tf.Identity"(%arg0) + %0 = "tf.Round"(%arg0) : (tensor<2xi32>) -> tensor<2xi32> + // CHECK: return [[IDENTITY]] + return %0 : tensor<2xi32> +} + +// CHECK-LABEL: @round +func @round(%arg0: tensor<2xf32>) -> tensor<2xf32> { + // CHECK-DAG: [[FLOOR:%.+]] = "tf.Floor"(%arg0) + // CHECK-DAG: [[SUB:%.+]] = "tf.Sub"(%arg0, [[FLOOR]]) + // CHECK-DAG: [[HALF:%.+]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} + // CHECK-DAG: [[CMP:%.+]] = "tf.Less"([[SUB]], [[HALF]]) + // CHECK-DAG: [[ONE:%.+]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-DAG: [[ADD:%.+]] = "tf.Add"([[ONE]], [[FLOOR]]) + // CHECK-DAG: [[SELECT:%.+]] = "tf.Select"([[CMP]], [[FLOOR]], [[ADD]]) + %0 = "tf.Round"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + + // CHECK: return [[SELECT]] + return %0 : tensor<2xf32> +} + +// CHECK-LABEL: func @round_dynamic +func @round_dynamic(%arg0: tensor) -> tensor { + // CHECK-DAG: [[FLOOR:%.+]] = "tf.Floor"(%arg0) + // CHECK-DAG: [[SUB:%.+]] = "tf.Sub"(%arg0, [[FLOOR]]) + // CHECK-DAG: [[HALF:%.+]] = "tf.Const"() {value = dense<5.000000e-01> : tensor} + // CHECK-DAG: [[CMP:%.+]] = "tf.Less"([[SUB]], [[HALF]]) + // CHECK-DAG: [[ONE:%.+]] = "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK-DAG: [[ADD:%.+]] = "tf.Add"([[ONE]], [[FLOOR]]) + // CHECK-DAG: [[SELECT:%.+]] = "tf.Select"([[CMP]], [[FLOOR]], [[ADD]]) + %0 = "tf.Round"(%arg0) : (tensor) -> tensor + + // CHECK: return [[SELECT]] + return %0 : tensor +} + +// CHECK-LABEL: func @round_unranked +func @round_unranked(%arg0: 
tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Round"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @lgamma +func @lgamma(%arg0: tensor<4xf32>) -> tensor<4xf32> { + // The lowering for lgamma is complicated, which makes it awkward to write a + // complete test for it here. Instead we test that Lgamma is at least being + // lowered here and rely on UnaryOpsTest.testFloatOps and other TensorFlow + // tests to check it is lowered correctly and with sufficient precision. + // CHECK-NOT: tf.Lgamma + %0 = "tf.Lgamma"(%arg0) : (tensor<4xf32>) -> tensor<4xf32> + return %0 : tensor<4xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir index dc99d9d6343..c8a6d5489c3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mark_ops_for_outside_compilation.mlir @@ -74,6 +74,17 @@ func @ignore_embedding_ops() -> () { return } +// CHECK-LABEL: func @ignore_stack_ops +func @ignore_stack_ops(%arg0: tensor) -> () { + "tf_device.cluster"() ( { + // CHECK: "tf.StackV2" + // CHECK-NOT: _xla_outside_compilation + %0 = "tf.StackV2"(%arg0) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + tf_device.return + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () + return +} + // CHECK-LABEL: func @op_string_result func @op_string_result() -> tensor { %0 = "tf_device.cluster"() ( { @@ -127,17 +138,17 @@ func @op_string_operand_string_result(%arg0: tensor) -> tensor return %0 : tensor } -// Test that a tf.IfRegion op with a captured string operand is marked for outside compilation. +// Test that operations inside tf.IfRegion op are corrected marked for outside +// compilation. -// CHECK-LABEL: func @if_region_captured_string -func @if_region_captured_string(%arg0: tensor, %arg1: tensor) -> tensor { +// CHECK-LABEL: func @ops_inside_tf_if_outside_compiled +func @ops_inside_tf_if_outside_compiled(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ( { - // CHECK: "tf.Const"() {value = dense<1> : tensor} - // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf.IfRegion" - // CHECK: "tf.StringToNumber" - // CHECK-NOT: _xla_outside_compilation - // CHECK: _xla_outside_compilation = "auto1", is_stateless = true + // CHECK: "tf.Const"() {value = dense<1> : tensor} + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.IfRegion" + // CHECK: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor %2 = "tf.IfRegion"(%arg0) ( { %3 = "tf.StringToNumber"(%arg1) {out_type = f32} : (tensor) -> tensor @@ -152,7 +163,8 @@ func @if_region_captured_string(%arg0: tensor, %arg1: tensor) -> return %0 : tensor } -// Test that ops with string results/operands inside a tf.IfRegion branch are marked for outside compilation. +// Test that ops with string results/operands inside a tf.IfRegion branch are +// marked for outside compilation. // CHECK-LABEL: func @if_region_string_op func @if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { @@ -180,7 +192,8 @@ func @if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor } -// Test that ops with string results/operands inside a nested tf.IfRegion branch are marked for outside compilation. 
+// Test that ops with string results/operands inside a nested tf.IfRegion branch +// are marked for outside compilation. // CHECK-LABEL: func @nested_if_region_string_op func @nested_if_region_string_op(%arg0: tensor, %arg1: tensor) -> tensor { @@ -220,16 +233,17 @@ func @nested_if_region_string_op(%arg0: tensor, %arg1: tensor) -> ten return %0 : tensor } -// Test that a tf.WhileRegion op with a captured string operand is marked for outside compilation. +// Test that ops inside tf.WhileRegion op are correct marked for outside +// compilation. -// CHECK-LABEL: func @while_region_captured_string -func @while_region_captured_string(%arg0: tensor, %arg1: tensor) -> tensor { +// CHECK-LABEL: func @ops_inside_while_outside_compiled +func @ops_inside_while_outside_compiled(%arg0: tensor, %arg1: tensor) -> tensor { %0 = "tf_device.cluster"() ( { - // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} + // CHECK: "tf.Const"() {value = dense<1.000000e+00> : tensor} // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf.WhileRegion" - // CHECK: "tf.StringToNumber" - // CHECK: _xla_outside_compilation = "auto1", is_stateless = true + // CHECK: "tf.WhileRegion" + // CHECK: "tf.StringToNumber" + // CHECK-SAME: _xla_outside_compilation %1 = "tf.Const"() {value = dense<1.0> : tensor} : () -> tensor %2:2 = "tf.WhileRegion"(%1, %arg0) ( { ^bb0(%carg0: tensor, %carg1: tensor): @@ -284,3 +298,31 @@ func @while_region_unsupported_op(%arg0: tensor, %arg1: tensor) }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor return %0 : tensor } + +// Checks that ops with inputs and outputs with string subtypes are marked +// for outside compilation. + +// CHECK-LABEL: func @check_op_with_variant_string_subtypes_outside_compiled +func @check_op_with_variant_string_subtypes_outside_compiled(%arg0: tensor, %arg1: tensor, %arg2: tensor<3xi32>) -> () { + "tf_device.cluster"() ( { + // CHECK: "tf.TensorListReserve" + // CHECK-SAME: _xla_outside_compilation + // CHECK: "tf.TensorListGetItem" + // CHECK-SAME: _xla_outside_compilation + %0 = "tf.TensorListReserve"(%arg0, %arg1) : (tensor, tensor) -> tensor>> + "tf.TensorListGetItem"(%0, %arg1, %arg2) : (tensor>>, tensor, tensor<3xi32>) -> tensor<24x24x64xui8> + tf_device.return + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () + return +} +// CHECK-LABEL: func @check_op_with_resource_string_subtypes_outside_compiled +func @check_op_with_resource_string_subtypes_outside_compiled(%arg0: tensor, %arg1: tensor, %arg2: tensor>>) -> () { + "tf_device.cluster"() ( { + // CHECK: "tf.VarHandleOp" + // CHECK-SAME: _xla_outside_compilation + "tf.VarHandleOp"() {allowed_devices = [], container = "", device = "", shared_name = ""} : () -> tensor>> + tf_device.return + }) {allow_soft_placement = true, num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> () + return +} + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD index cbdf5d96d0e..b98ed445e86 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir 
b/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir index 52dc06cd393..03cac7dbd33 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/parallel_execute_to_islands.mlir @@ -1,13 +1,12 @@ // RUN: tf-opt %s -tf-parallel-execute-to-islands | FILECHECK_OPTS="" FileCheck %s -// CHECK-LABEL: func @check_regions_to_islands -func @check_regions_to_islands() { +// CHECK-LABEL: func @testEmptyRegions +func @testEmptyRegions() { tf_executor.graph { tf_executor.island() { "tf_device.parallel_execute"() ({ tf_device.return - }, - { + }, { tf_device.return }) {} : () -> () tf_executor.yield @@ -17,210 +16,133 @@ func @check_regions_to_islands() { return } -// CHECK: %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island { +// CHECK: [[ISLAND_0_CTRL:%.+]] = tf_executor.island { // CHECK: tf_executor.yield -// CHECK: %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island { +// CHECK: [[ISLAND_1_CTRL:%.+]] = tf_executor.island { // CHECK: tf_executor.yield -// CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) { -// CHECK-NEXT: tf_executor.yield +// CHECK: tf_executor.fetch [[ISLAND_0_CTRL]], [[ISLAND_1_CTRL]] : -// CHECK-LABEL: func @check_regions_to_islands_with_inputs -// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) -func @check_regions_to_islands_with_inputs(%arg0 : tensor) { - tf_executor.graph { +// CHECK-LABEL: func @testDataOperandsAndResults +// CHECK-SAME: ([[ARG_0:%.+]]: tensor) +func @testDataOperandsAndResults(%arg0 : tensor) { + %0:2 = tf_executor.graph { %1:2 = tf_executor.island { %2 = "tf.opA"(%arg0) : (tensor) -> tensor tf_executor.yield %2 : tensor } - tf_executor.island() { - "tf_device.parallel_execute"() ({ - %3 = "tf.opB"(%1#0) : (tensor) -> tensor - tf_device.return %3 : tensor - }, - { + %3:3 = tf_executor.island() { + %4:2 = "tf_device.parallel_execute"() ({ + %5 = "tf.opB"(%1#0) : (tensor) -> tensor + tf_device.return %5 : tensor + }, { %5 = "tf.opC"(%1#0) : (tensor) -> tensor tf_device.return %5 : tensor }) {} : () -> (tensor, tensor) - tf_executor.yield + tf_executor.yield %4#0, %4#1 : tensor, tensor } - tf_executor.fetch + tf_executor.fetch %3#0, %3#1 : tensor, tensor } return } -// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island { -// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor) -> tensor -// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor -// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CONTROL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor -// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"(%[[INPUT_0]]) : (tensor) -> tensor -// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor -// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"(%outputs_0) : (tensor) -> tensor -// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor -// CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) { -// CHECK-NEXT: tf_executor.yield +// CHECK: [[INPUT_A:%.+]], {{%.+}} = tf_executor.island { +// CHECK-NEXT: [[OP_A_OUTPUT:%.+]] = "tf.opA"([[ARG_0]]) +// CHECK-NEXT: tf_executor.yield [[OP_A_OUTPUT]] : +// CHECK: [[ISLAND_0_OUTPUT:%.+]], {{%.+}} = tf_executor.island { +// CHECK-NEXT: [[OP_B_OUTPUT:%.+]] = "tf.opB"([[INPUT_A]]) +// CHECK: tf_executor.yield [[OP_B_OUTPUT]] : +// 
CHECK: [[ISLAND_1_OUTPUT:%.+]], {{%.+}} = tf_executor.island { +// CHECK-NEXT: [[OP_C_OUTPUT:%.+]] = "tf.opC"([[INPUT_A]]) +// CHECK: tf_executor.yield [[OP_C_OUTPUT]] : +// CHECK: tf_executor.fetch [[ISLAND_0_OUTPUT]], [[ISLAND_1_OUTPUT]] : -// CHECK-LABEL: func @check_input_sink_island_forwards_control_inputs -// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) -func @check_input_sink_island_forwards_control_inputs(%arg0 : tensor) { - tf_executor.graph { - %1:2 = tf_executor.island { - %2 = "tf.opA"(%arg0) : (tensor) -> tensor - tf_executor.yield %2 : tensor - } - %7 = tf_executor.ControlTrigger {} - %8 = tf_executor.ControlTrigger {} - tf_executor.island(%7, %8) { - "tf_device.parallel_execute"() ({ - %3 = "tf.opB"(%1#0) : (tensor) -> tensor - tf_device.return %3 : tensor - }, - { - %5 = "tf.opC"() : () -> tensor - tf_device.return %5 : tensor - }) {} : () -> (tensor, tensor) +// CHECK-LABEL: func @testControlOperands +func @testControlOperands() { + %0:2 = tf_executor.graph { + %1 = tf_executor.island { tf_executor.yield } - tf_executor.fetch - } - return -} - -// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island { -// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor) -> tensor -// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor -// CHECK: %[[CT_0:[0-9]*]] = tf_executor.ControlTrigger -// CHECK: %[[CT_1:[0-9]*]] = tf_executor.ControlTrigger -// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CONTROL:[a-z_0-9]*]] = tf_executor.island(%[[CT_0]], %[[CT_1]]) { -// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor -// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %[[ISLAND_1_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"(%[[INPUT_0]]) : (tensor) -> tensor -// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor -// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %[[ISLAND_2_CTL:[a-z_0-9]*]] = tf_executor.island(%[[INPUT_CONTROL]]) { -// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"() : () -> tensor -// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor -// CHECK: %{{.*}} = tf_executor.island(%[[ISLAND_1_CTL]], %[[ISLAND_2_CTL]]) { -// CHECK-NEXT: tf_executor.yield - - -// CHECK-LABEL: func @check_control_dep_added_when_region_does_not_have_inputs -// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) -func @check_control_dep_added_when_region_does_not_have_inputs(%arg0 : tensor) { - tf_executor.graph { - %1:2 = tf_executor.island { - %2 = "tf.opA"(%arg0) : (tensor) -> tensor - tf_executor.yield %2 : tensor - } - %7:3 = tf_executor.island() { - %8:2 = "tf_device.parallel_execute"() ( - { - %3 = "tf.opB"() : () -> tensor - tf_device.return %3 : tensor - }, - { - %5 = "tf.opC"(%1#0) : (tensor) -> tensor - tf_device.return %5 : tensor - } - ) {} : () -> (tensor, tensor) - - tf_executor.yield %8#0, %8#1 : tensor, tensor - } - - tf_executor.island { - "tf.opD"(%7#0, %7#1) : (tensor, tensor) -> () - tf_executor.yield - } - tf_executor.fetch - } - return -} - -// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island { -// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor) -> tensor -// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor -// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor -// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island(%[[INPUT_CTL]]) { -// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"() : () -> tensor -// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor -// CHECK: 
%[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island { -// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"(%outputs_0) : (tensor) -> tensor -// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor -// CHECK: %{{.*}} = tf_executor.island { -// CHECK-NEXT: tf_executor.yield %[[ISLAND_1_OUTPUT]], %[[ISLAND_2_OUTPUT]] - - -// CHECK-LABEL: func @check_output_barrier_correctly_forwards_outputs -func @check_output_barrier_correctly_forwards_outputs(%arg0 : tensor) -> tensor { - %0 = tf_executor.graph { - %1:2 = tf_executor.island { - %2 = "tf.opA"(%arg0) : (tensor) -> tensor - tf_executor.yield %2 : tensor - } - %8:3 = tf_executor.island() { - %7:2 = "tf_device.parallel_execute"() ({ - %3 = "tf.opB"() : () -> tensor - tf_device.return %3 : tensor - }, - { - %5 = "tf.opC"(%1#0) : (tensor) -> tensor - tf_device.return %5 : tensor - }) {} : () -> (tensor, tensor) - tf_executor.yield %7#0, %7#1 : tensor, tensor - } - tf_executor.fetch %8#0 : tensor - } - return %0 : tensor -} - -// CHECK: %[[INPUT_A:[a-z_0-9]*]], %{{.*}} = tf_executor.island { -// CHECK-NEXT: %[[OP_A_OUTPUT:[a-z_0-9]*]] = "tf.opA"(%[[ARG_0]]) : (tensor) -> tensor -// CHECK-NEXT: tf_executor.yield %[[OP_A_OUTPUT]] : tensor -// CHECK: %[[INPUT_0:[a-z_0-9]*]], %[[INPUT_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: tf_executor.yield %[[INPUT_A]] : tensor -// CHECK: %[[ISLAND_1_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island(%[[INPUT_CTL]]) { -// CHECK-NEXT: %[[OP_B_OUTPUT:[a-z_0-9]*]] = "tf.opB"() : () -> tensor -// CHECK: tf_executor.yield %[[OP_B_OUTPUT]] : tensor -// CHECK: %[[ISLAND_2_OUTPUT:[a-z_0-9]*]], %{{.*}} = tf_executor.island { -// CHECK-NEXT: %[[OP_C_OUTPUT:[a-z_0-9]*]] = "tf.opC"(%[[INPUT_0]]) : (tensor) -> tensor -// CHECK: tf_executor.yield %[[OP_C_OUTPUT]] : tensor -// CHECK: %[[OUTPUT_SINK_OUTPUT:[a-z_0-9]*]]:2, %[[OUTPUT_SINK_CTL:[a-z_0-9]*]] = tf_executor.island { -// CHECK-NEXT: tf_executor.yield %[[ISLAND_1_OUTPUT]], %[[ISLAND_2_OUTPUT]] : tensor, tensor - -// CHECK-LABEL: func @check_parallel_execute_using_args -// CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor) -func @check_parallel_execute_using_args(%arg0 : tensor) { - tf_executor.graph { - %1:2 = tf_executor.island { - %2 = "tf.opA"(%arg0) : (tensor) -> tensor - tf_executor.yield %2 : tensor - } - %2:2 = tf_executor.island { - %3 = "tf.opB"(%arg0) : (tensor) -> tensor - tf_executor.yield %3 : tensor - } - tf_executor.island() { - "tf_device.parallel_execute"() ({ - %4 = "tf.opC"(%arg0, %1#0) : (tensor, tensor) -> tensor + %2:3 = tf_executor.island(%1) { + %3:2 = "tf_device.parallel_execute"() ({ + %4 = "tf.opA"() : () -> tensor tf_device.return %4 : tensor - }, - { - %5 = "tf.opD"(%arg0, %2#0) : (tensor, tensor) -> tensor - tf_device.return %5 : tensor + }, { + %4 = "tf.opB"() : () -> tensor + tf_device.return %4 : tensor }) {} : () -> (tensor, tensor) - tf_executor.yield + tf_executor.yield %3#0, %3#1 : tensor, tensor } - tf_executor.fetch + tf_executor.fetch %2#0, %2#1 : tensor, tensor } return } -// Verify that args are directly accessed in newly created island without alias -// through entry barrier. 
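+// The lines that follow verify that the control operand of the original
+// island is forwarded directly to each per-region island (both islands take
+// [[INPUT_CTRL]] as an operand), rather than being routed through a separate
+// entry-barrier island as in the replaced test above.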
+// CHECK: [[INPUT_CTRL:%.+]] = tf_executor.island { +// CHECK: [[ISLAND_0_OUTPUT:%.+]], {{%.+}} = tf_executor.island([[INPUT_CTRL]]) { +// CHECK-NEXT: [[OP_A_OUTPUT:%.+]] = "tf.opA"() +// CHECK: tf_executor.yield [[OP_A_OUTPUT]] : +// CHECK: [[ISLAND_1_OUTPUT:%.+]], {{%.+}} = tf_executor.island([[INPUT_CTRL]]) { +// CHECK-NEXT: [[OP_B_OUTPUT:%.+]] = "tf.opB"() +// CHECK: tf_executor.yield [[OP_B_OUTPUT]] : +// CHECK: tf_executor.fetch [[ISLAND_0_OUTPUT]], [[ISLAND_1_OUTPUT]] : -// CHECK: "tf.opC"(%[[ARG_0]] -// CHECK: "tf.opD"(%[[ARG_0]] + +// CHECK-LABEL: func @testControlResults +func @testControlResults() { + tf_executor.graph { + %0:3 = tf_executor.island { + %1:2 = "tf_device.parallel_execute"() ({ + %2 = "tf.opA"() : () -> tensor + tf_device.return %2 : tensor + }, { + %2 = "tf.opB"() : () -> tensor + tf_device.return %2 : tensor + }) {} : () -> (tensor, tensor) + tf_executor.yield %1#0, %1#1 : tensor, tensor + } + %3 = tf_executor.island(%0#2) { + tf_executor.yield + } + tf_executor.fetch %3 : !tf_executor.control + } + return +} + +// CHECK: {{%.+}}, [[ISLAND_0_CTRL:%.+]] = tf_executor.island { +// CHECK-NEXT: [[OP_A_OUTPUT:%.+]] = "tf.opA"() +// CHECK: tf_executor.yield [[OP_A_OUTPUT]] : +// CHECK: {{%.+}}, [[ISLAND_1_CTRL:%.+]] = tf_executor.island { +// CHECK-NEXT: [[OP_B_OUTPUT:%.+]] = "tf.opB"() +// CHECK: tf_executor.yield [[OP_B_OUTPUT]] : +// CHECK: [[OUTPUT_CTRL:%.+]] = tf_executor.island([[ISLAND_0_CTRL]], [[ISLAND_1_CTRL]]) { +// CHECK: [[FETCH_ISLAND:%.+]] = tf_executor.island([[OUTPUT_CTRL]]) { +// CHECK: tf_executor.fetch [[FETCH_ISLAND]] : !tf_executor.control + + +// CHECK-LABEL: func @testSomeRegionNoUsers +func @testSomeRegionNoUsers() { + %0 = tf_executor.graph { + %1:3 = tf_executor.island { + %2:2 = "tf_device.parallel_execute"() ({ + %3 = "tf.opA"() : () -> tensor + tf_device.return %3 : tensor + }, { + %3 = "tf.opB"() : () -> tensor + tf_device.return %3 : tensor + }) {} : () -> (tensor, tensor) + tf_executor.yield %2#0, %2#1 : tensor, tensor + } + tf_executor.fetch %1#0 : tensor + } + return +} + +// CHECK: [[ISLAND_0_OUTPUT:%.+]], {{%.+}} = tf_executor.island { +// CHECK-NEXT: [[OP_A_OUTPUT:%.+]] = "tf.opA"() +// CHECK: tf_executor.yield [[OP_A_OUTPUT]] : +// CHECK: {{%.+}}, [[ISLAND_1_CTRL:%.+]] = tf_executor.island { +// CHECK-NEXT: [[OP_B_OUTPUT:%.+]] = "tf.opB"() +// CHECK: tf_executor.yield [[OP_B_OUTPUT]] : +// CHECK: tf_executor.fetch [[ISLAND_0_OUTPUT]], [[ISLAND_1_CTRL]] : diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index 3e6d4f37bac..0813ee8db90 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -258,6 +258,19 @@ func @main(%arg0: tensor>>, %arg1: tensor) { // ----- +// Tests removal of dead local variables. + +// CHECK-LABEL: func @main +func @main(%arg0: tensor<2xf32>) { + // CHECK-NOT: tf.MlirLocalVarOp + // CHECK-NOT: tf.AssignVariableOp + %0 = "tf.MlirLocalVarOp"() : () -> tensor>> + "tf.AssignVariableOp"(%0, %arg0) : (tensor>>, tensor<2xf32>) -> () + return +} + +// ----- + // Tests first read of one resource is used as a value to write to another // resource. 
@@ -272,6 +285,26 @@ func @main(%arg0: tensor>>, %arg1: tensor) -> tensor<2xf32> { + // CHECK-NOT: tf.MlirLocalVarOp + // CHECK-NOT: tf.AssignVariableOp + %0 = "tf.MlirLocalVarOp"() : () -> tensor>> + %1 = "tf._SomeOp"() : () -> tensor<2xf32> + "tf.AssignVariableOp"(%0, %1) : (tensor>>, tensor<2xf32>) -> () + %2 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor>>) -> tensor<2xf32> + return %2 : tensor<2xf32> +} +func @callee(%arg0: tensor>>) -> tensor<2xf32> attributes {sym_visibility = "private"} { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor<2xf32> + return %0 : tensor<2xf32> +} + + // ----- // Tests main function with multiple blocks. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir index da0a2df9e6a..e857831e6be 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-alias-analysis-test.mlir @@ -106,6 +106,52 @@ func @if_else(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { return %id0, %id0, %arg1 : !tf_res, !tf_res, !tf_res } +// ----- +// Test aliasing through CaseOp + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @case_op_aliasing +// expected-remark@below {{Region #0, Arg #0, ID 4 : 1, 4}} +// expected-remark@below {{Region #0, Arg #1, ID 5 : 1, 2, 3, 5}} +func @case_op_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor + // expected-remark@below {{Result #0, ID 1 : Unknown}} + // expected-remark@below {{Result #1, ID 2 : 1, 2, 3, 5}} + // expected-remark@below {{Result #2, ID 3 : 0, 1, 2, 3, 5}} + %if:3 = "tf.Case"(%read0, %arg1, %vh0) { + branches = [@case_branch0, @case_branch1, @case_branch2], + is_stateless = true + } : (tensor, !tf_res, !tf_res) -> (!tf_res, !tf_res, !tf_res) + return +} + +// expected-remark@below {{Region #0, Arg #0, ID 2 : 0, 1, 2}} +// expected-remark@below {{Region #0, Arg #1, ID 3 : 0, 3}} +func @case_branch0(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 1 : 0, 1, 2}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %u0, %id0, %id0 : !tf_res, !tf_res, !tf_res +} + +// expected-remark@below {{Region #0, Arg #0, ID 1 : 0, 1}} +// expected-remark@below {{Region #0, Arg #1, ID 2 : 2}} +func @case_branch1(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1}} + %id0 = "tf.Identity"(%arg0) : (!tf_res) -> !tf_res + return %id0, %id0, %arg1 : !tf_res, !tf_res, !tf_res +} + +// expected-remark@below {{Region #0, Arg #0, ID 0 : 0}} +// expected-remark@below {{Region #0, Arg #1, ID 1 : 1}} +func @case_branch2(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res, !tf_res) { + return %arg0, %arg0, %arg1 : !tf_res, !tf_res, !tf_res +} + // ----- // Test aliasing through WhileOp !tf_res = type tensor<*x!tf.resource>> @@ -199,6 +245,37 @@ func @if_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { return } +// ----- +// Test aliasing through CaseRegion + +!tf_res = type tensor<*x!tf.resource>> + +// CHECK-LABEL: func @case_region_aliasing 
+// expected-remark@below {{Region #0, Arg #0, ID 7 : 1, 4, 6, 7}} +// expected-remark@below {{Region #0, Arg #1, ID 8 : 1, 2, 4, 5, 6, 8}} +func @case_region_aliasing(%arg0: !tf_res, %arg1: !tf_res) { + // expected-remark@below {{Result #0, ID 0 : 0, 1, 3, 4, 5}} + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> !tf_res + %read0 = "tf.ReadVariableOp"(%vh0) : (!tf_res) -> tensor + // expected-remark@below {{Result #0, ID 4 : Unknown}} + // expected-remark@below {{Result #1, ID 5 : 0, 1, 2, 3, 4, 5, 6, 8}} + // expected-remark@below {{Result #2, ID 6 : 1, 2, 4, 5, 6, 7, 8}} + %if:3 = "tf.CaseRegion"(%read0) ({ + // expected-remark@below {{Result #0, ID 1 : Unknown}} + %u0 = "tf._UnknownSideEffectingOp_"() : () -> !tf_res + // expected-remark@below {{Result #0, ID 2 : 1, 2, 4, 5, 6, 8}} + %id0 = "tf.Identity"(%arg1) : (!tf_res) -> !tf_res + "tf.Yield"(%u0, %id0, %id0) : (!tf_res, !tf_res, !tf_res) -> () + }, { + // expected-remark@below {{Result #0, ID 3 : 0, 1, 3, 4, 5}} + %id0 = "tf.Identity"(%vh0) : (!tf_res) -> !tf_res + "tf.Yield"(%id0, %id0, %arg0) : (!tf_res, !tf_res, !tf_res) -> () + }, { + "tf.Yield"(%vh0, %arg1, %arg1) : (!tf_res, !tf_res, !tf_res) -> () + }) {is_stateless = true} : (tensor) -> (!tf_res, !tf_res, !tf_res) + return +} + // ----- // Test aliasing through WhileRegion !tf_res = type tensor<*x!tf.resource>> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 213ca402f56..79b90b67956 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -8,7 +8,7 @@ func @only_resource_load() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) // CHECK: "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]] @@ -39,7 +39,7 @@ func @only_resource_store() -> tensor<*xi32> { // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) {dtype = i32} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) %1 = "tf_device.cluster"() ( { %2 = "tf.SomeComputation"() : () -> (tensor<*xi32>) @@ -61,13 +61,13 @@ func @same_resource_load_and_store() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {cluster_attr = "cluster_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) {dtype = i32} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) %1 = "tf_device.cluster"() ( { %2 
= "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> @@ -308,6 +308,7 @@ func @while_cond1(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!t func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { + // expected-error@+1 {{result #0 not tied to function argument for branch @while_body}} %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) @@ -317,7 +318,6 @@ func @cluster_with_loop() -> () { } func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> - // expected-error @+1 {{resource used in while loop is only supported when the resource input and output alias each other in the loop body}} return %0 : tensor<*x!tf.resource>> } func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { @@ -332,6 +332,7 @@ func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { func @cluster_with_loop() -> () { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> "tf_device.cluster"() ( { + // expected-error@+1 {{found resource write in loop condition.}} %1 = "tf.While"(%0) { body = @while_body, cond = @while_cond, device = "", is_stateless = false} : (tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) @@ -347,7 +348,6 @@ func @while_body(%arg0: tensor<*x!tf.resource>>) -> (tensor<*x!tf.re func @while_cond(%arg0: tensor<*x!tf.resource>>) -> tensor { %read = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor %constant = "tf.Const"() {value = dense<0.0> : tensor} : () -> tensor - // expected-error @+1 {{found resource write in loop condition.}} "tf.AssignVariableOp"(%arg0, %constant) : (tensor<*x!tf.resource>>, tensor) -> () return %read : tensor } @@ -527,7 +527,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> %2 = "tf_device.cluster"() ( { - // expected-error @+1 {{unsupported output: resource does not alias a single input}} + // expected-error @+1 {{result #0 is not tied to the same argument across all branches}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) @@ -554,7 +554,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> %2 = "tf_device.cluster"() ( { - // expected-error @+1 {{unsupported output: resource does not alias input}} + // expected-error @+1 {{result #0 not tied to function argument for branch @if_then}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) @@ -713,3 +713,507 @@ func @callee(%arg0: tensor<*x!tf.resource>>) -> tensor { // CHECK: func @callee_resource_lifted(%[[A0:.*]]: tensor) -> tensor // CHECK-NEXT: return %[[A0]] + +// ----- + +// Test that the pass can lift resources out of IfRegion +// CHECK: func @cluster_with_ifregion(%[[ARG0:.*]]: tensor) -> tensor<4xf32> +func 
@cluster_with_ifregion(%arg0: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + // CHECK: %[[IF:.*]]:2 = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + %3:2 = "tf.IfRegion"(%arg0) ({ + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + // CHECK-NEXT: "tf.Yield"(%[[CONST]], %[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: "tf.Yield"(%[[READ1]], %[[READ1]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> (tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) + // CHECK-NEXT: tf_device.return %[[ADD]], %[[IF]]#1 + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + tf_device.return %5 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} + +// Test that the pass can lift resources out of CaseRegion +// CHECK: func @cluster_with_caseregion(%[[ARG0:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_caseregion(%arg0: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + // CHECK: %[[CASE:.*]]:2 = "tf.CaseRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + %3:2 = "tf.CaseRegion"(%arg0) ({ + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + // CHECK-NEXT: "tf.Yield"(%[[CONST]], %[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: "tf.Yield"(%[[READ1]], %[[READ1]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: %[[CONST1:.*]] = "tf.Const" + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[READ1]], %[[CONST1]]) + // CHECK: "tf.Yield"(%[[READ1]], %[[SUB]]) + %id = 
"tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %constant = "tf.Const"() {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> + %sub = "tf.Sub"(%read, %constant) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %sub) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> (tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CASE]]#1, %[[CASE]]#0) + // CHECK-NEXT: tf_device.return %[[ADD]], %[[CASE]]#1 + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + tf_device.return %5 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} + +// ----- + +// Test that the pass can lift resources out of WhileRegion +// CHECK-LABEL: func @cluster_with_whileregion +func @cluster_with_whileregion() -> () { + // CHECK: %[[COUNT:.*]] = "tf.Const"() {value = dense<10> : tensor} + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + // CHECK: %[[WHILE:.*]]:2 = "tf.WhileRegion"(%[[COUNT]], %[[READ]]) + %0 = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %unused = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + "tf_device.cluster"() ( { + %2:3 = "tf.WhileRegion"(%0, %1, %unused) ({ + // CHECK: (%[[CARG0:.+]]: tensor, %[[CARG1:.+]]: tensor): + // CHECK: %[[CAST:.+]] = "tf.Cast"(%[[CARG1]]) + // CHECK: "tf.Less"(%[[CARG0]], %[[CAST]]) + // CHECK: "tf.Yield" + ^bb0(%carg0: tensor, %carg1:tensor<*x!tf.resource>>, %carg2: tensor<*x!tf.resource>>): + %read0 = "tf.ReadVariableOp"(%carg1) : (tensor<*x!tf.resource>>) -> tensor + %cast = "tf.Cast"(%read0) : (tensor) -> tensor + %cond = "tf.Less"(%carg0, %cast) : (tensor, tensor) -> tensor + "tf.Yield"(%cond) : (tensor) -> () + }, { + // CHECK: (%[[BARG0:.+]]: tensor, %[[BARG1:.+]]: tensor): + // CHECK: %[[ADD0:.*]] = "tf.AddV2"(%[[BARG1]], %[[BARG1]]) + // CHECK-NEXT: %[[ADD1:.*]] = "tf.AddV2"(%[[ADD0]], %[[ADD0]]) + // CHECK-NEXT: %[[DELTA:.*]] = "tf.Const"() {value = dense<-1> : tensor} + // CHECK-NEXT: %[[ADD2:.*]] = "tf.AddV2"(%[[BARG0]], %[[DELTA]]) + // CHECK-NEXT: "tf.Yield"(%[[ADD2]], %[[ADD1]]) + ^bb1(%barg0: tensor, %barg1:tensor<*x!tf.resource>>, %barg2: tensor<*x!tf.resource>>): + %read0 = "tf.ReadVariableOp"(%barg1) : (tensor<*x!tf.resource>>) -> tensor + %add0 = "tf.AddV2"(%read0, %read0) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%barg1, %add0) : (tensor<*x!tf.resource>>, tensor) -> () + %read1 = "tf.ReadVariableOp"(%barg1) : (tensor<*x!tf.resource>>) -> tensor + %add1 = "tf.AddV2"(%read1, %read1) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%barg1, %add1) : (tensor<*x!tf.resource>>, tensor) -> () + %constant = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %add2 = "tf.AddV2"(%barg0, %constant) : (tensor, tensor) -> tensor + %id = "tf.Identity"(%barg2) : (tensor<*x!tf.resource>>) -> 
tensor<*x!tf.resource>> + "tf.Yield"(%add2, %barg1, %id) : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> () + }) {device = "", is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + // CHECK: tf_device.return %[[WHILE]]#1 : tensor + // CHECK: {cluster_attr = "cluster_attr"} : () -> tensor + // CHECK: "tf.AssignVariableOp"(%[[VH]], %[[CLUSTER]]) + // CHECK: return + return +} + +// ----- + +// Test that the pass can lift out recursively (If with another if it its body) +// CHECK: func @cluster_with_if_within_if(%[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> tensor<4xf32> +func @cluster_with_if_within_if(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { + // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"() + // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]]) + // CHECK: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]]) + // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"() + // CHECK: %[[IF:.*]]:2 = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource>> + %2 = "tf_device.cluster"() ( { + %3:2 = "tf.IfRegion"(%arg0) ({ + // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"() + // CHECK-NEXT: "tf.Yield"(%[[CONST]], %[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }, { + // CHECK: %[[IF1:.*]] = "tf.IfRegion" + // CHECK: "tf.Yield"(%[[READ1]]) + // CHECK: "tf.Yield"(%[[READ0]]) + // CHECK: "tf.Yield"(%[[IF1]], %[[IF1]]) + %id = "tf.Identity"(%1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %read = "tf.IfRegion"(%arg1) ({ + %read_then = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.Yield"(%read_then) : (tensor<4xf32>) -> () + }, { + %read_else = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + "tf.Yield"(%read_else) : (tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"(%0, %read) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + }) {is_stateless = false} : (tensor) -> (tensor<*x!tf.resource>>, tensor<4xf32>) + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) + // CHECK-NEXT: tf_device.return %[[ADD]], %[[IF]]#1 + %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + tf_device.return %5 : tensor<4xf32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32> + // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>) + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1) + // CHECK: return %[[CLUSTER]]#0 + return %2 : tensor<4xf32> +} + +// ----- + +// IfRegion with store in just one branch + +// CHECK: func @if_region_with_store_in_then(%[[ARG0:.*]]: tensor) +func @if_region_with_store_in_then(%arg0: tensor) { + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VH]]) + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + // CHECK: %[[IF:.*]] = 
"tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + "tf_device.cluster"() ({ + "tf.IfRegion"(%arg0) ({ + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK: "tf.Yield"(%[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"() : () -> () + }, { + // CHECK: "tf.Yield"(%[[READ]]) + "tf.Yield"() : () -> () + }) { is_stateless = true} : (tensor) -> () + tf_device.return + }) { cluster_attr = "cluster_attr" } : () -> () + // CHECK: tf_device.return %[[IF]] + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]) + return +} + +// ----- + +// IfRegion with store in both branches + +// CHECK: func @if_region_with_store_in_both(%[[ARG0:.*]]: tensor) +func @if_region_with_store_in_both(%arg0: tensor) { + // CHECK: %[[VH:.*]] = "tf.VarHandleOp"() + // CHECK: %[[CLUSTER:.*]] = "tf_device.cluster"() + // CHECK: %[[IF:.*]] = "tf.IfRegion"(%[[ARG0]]) + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + "tf_device.cluster"() ({ + "tf.IfRegion"(%arg0) ({ + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<0.000000e+00> + // CHECK: "tf.Yield"(%[[CONST]]) + %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"() : () -> () + }, { + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<1.000000e+00> + // CHECK: "tf.Yield"(%[[CONST]]) + %constant = "tf.Const"() {value = dense<1.0> : tensor<4xf32>} : () -> tensor<4xf32> + "tf.AssignVariableOp"(%0, %constant) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + "tf.Yield"() : () -> () + }) { is_stateless = true} : (tensor) -> () + tf_device.return + }) { cluster_attr = "cluster_attr" } : () -> () + // CHECK: tf_device.return %[[IF]] + // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]) + return +} + + +// Make sure unsupported resources are handled correctly. If a resource is used +// in an unsupported op, resource op lifting should skip lifting that resource. +// So for the below test, the IR should stay unchanged. +// CHECK-LABEL: func @test_unsupported_resource_op +func @test_unsupported_resource_op() -> tensor<*xi32> { + // CHECK: "tf.VarHandleOp" + // CHECK: "tf_device.cluster"() ( { + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.SomeResourceOperation" + // CHECK: "tf.SomeComputation" + // CHECK: tf_device.return + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK: return + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf_device.cluster"() ( { + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> + "tf.SomeResourceOperation"(%0) : (tensor<*x!tf.resource>) -> () + %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) + tf_device.return %3 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + + return %1 : tensor<*xi32> +} + +// Test unsupported use of resource ops in functional control flow. In the test +// below, arg0 has an unsupported use whereas arg1 does not. So we expect arg0 +// to not be lifted and arg1 to be lifted. 
+// CHECK-LABEL: func @test_unsupported_resource_op_in_if +func @test_unsupported_resource_op_in_if(%arg0: tensor) -> tensor<*xi32> { + // CHECK: [[VH0:%.*]] = "tf.VarHandleOp"() {container = "c", shared_name = "v"} + // CHECK: [[VH1:%.*]] = "tf.VarHandleOp"() {container = "d", shared_name = "w"} + // CHECK-NOT: "tf.ReadVariableOp"([[VH0]]) + // CHECK: [[READ1:%.*]] = "tf.ReadVariableOp"([[VH1]]) + // CHECK-NOT: "tf.ReadVariableOp"([[VH0]]) + // CHECK: "tf_device.cluster"() ( { + // CHECK: "tf.If"({{%.*}}, [[VH0]], [[READ1]]) + // CHECK-SAME: else_branch = @else_fn, is_stateless = true, then_branch = @then_fn + // CHECK: tf_device.return + // CHECK: return + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> + %1 = "tf.VarHandleOp"() {container = "d", shared_name = "w"} : () -> tensor<*x!tf.resource> + %2 = "tf_device.cluster"() ( { + %3 = "tf.If"(%arg0, %0, %1) + { else_branch = @else_fn, then_branch = @then_fn, is_stateless = true} + : (tensor, tensor<*x!tf.resource>, tensor<*x!tf.resource>) -> tensor<*xi32> + tf_device.return %3 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + return %2 : tensor<*xi32> +} + +// CHECK-LABEL: func @else_fn +// CHECK-SAME: (%{{.*}}: tensor<*x!tf.resource>, %{{.*}}: tensor<*xi32>) +func @else_fn(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>) -> tensor<*xi32> { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + return %2 : tensor<*xi32> +} + +// CHECK-LABEL: func @then_fn +// CHECK-SAME: (%{{.*}}: tensor<*x!tf.resource>, %{{.*}}: tensor<*xi32>) +func @then_fn(%arg0: tensor<*x!tf.resource>, %arg1: tensor<*x!tf.resource>) -> tensor<*xi32> { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + "tf.UnsupportedResourceOp"(%arg0) : (tensor<*x!tf.resource>) -> () + return %2 : tensor<*xi32> +} + +// Test type refinement. If the resource has a single subtype, check that that +// type gets used when hoisting the read. None of the result types will change. 
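+// For example (illustrative value names, assuming a handle whose single
+// subtype is tensor<4xi32>), the hoisted read is expected to be emitted as
+//   %v = "tf.ReadVariableOp"(%handle) : (tensor<*x!tf.resource<tensor<4xi32>>>) -> tensor<4xi32>
+// even though the read inside the cluster produced tensor<*xi32>.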
+// CHECK-LABEL: func @type_refinement_use_subtype +func @type_refinement_use_subtype() -> tensor<*xi32> { + + // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) + // CHECK-SAME: -> tensor<4xi32> + // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" + // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) : (tensor<4xi32>) -> tensor<*xi32> + // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] + // CHECK-SAME: tensor<*xi32>, tensor<*xi32> + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) + + %1 = "tf_device.cluster"() ( { + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>>) -> tensor<*xi32> + %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) + "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>>, tensor<*xi32>) -> () + tf_device.return %3 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + + // CHECK: return %[[CLUSTER_RES]]#0 + // CHECK-SAME: tensor<*xi32> + return %1 : tensor<*xi32> +} + +// If multiple types are used across reads and writes, check that the read uses +// the most refined type. The first ReadVariable should refine the type from +// *xi32 to ?xi32 and the assign should refine it further to 4xi32. +// CHECK-LABEL: func @type_refinement_use_refined_type +func @type_refinement_use_refined_type() -> tensor<4xi32> { + + // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" + %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource>> + + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) + // CHECK-SAME: -> tensor<4xi32> + // CHECK: %[[CLUSTER_RES:[0-9]*]]:2 = "tf_device.cluster" + // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) : (tensor<4xi32>) -> tensor<4xi32> + // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] + // CHECK-SAME: tensor<4xi32>, tensor<4xi32> + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK-SAME: () -> (tensor<4xi32>, tensor<4xi32>) + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[CLUSTER_RES]]#1) + + %1 = "tf_device.cluster"() ( { + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>>) -> tensor + %3 = "tf.SomeComputation"(%2) : (tensor) -> (tensor<4xi32>) + "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>>, tensor<4xi32>) -> () + tf_device.return %3 : tensor<4xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<4xi32> + + // CHECK: return %[[CLUSTER_RES]]#0 + // CHECK-SAME: tensor<4xi32> + return %1 : tensor<4xi32> +} + +// ----- + +!tf_res = type tensor<*x!tf.resource>> + +// Test all tf.VarIsInitializedOp's are set to true. 
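+// The pass is expected to fold every tf.VarIsInitializedOp used inside the
+// cluster, including uses nested in Case/If/While regions and in called
+// functions, into a constant true value, e.g. (value name illustrative)
+//   %init = "tf.Const"() {value = dense<true> : tensor<i1>} : () -> tensor<i1>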
+// CHECK-LABEL: func @tpu_computation +func @tpu_computation(%arg0: !tf_res, %arg1: tensor, %arg2: tensor) { + %0 = "tf_device.cluster"() ( { + %1 = "tf.Case"(%arg2, %arg0) {branches = [@case_branch], is_stateless = false} : (tensor, !tf_res) -> tensor + + // CHECK: "tf.CaseRegion" + %2 = "tf.CaseRegion"(%arg2) ( { + // CHECK-NEXT: [[CASE_REGION_BRANCH:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %3 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: "tf.Yield"([[CASE_REGION_BRANCH]]) + "tf.Yield"(%3) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + + %4 = "tf.If"(%arg1, %arg0) {then_branch = @if_then, else_branch = @if_else, is_stateless = false} : (tensor, !tf_res) -> tensor + + // CHECK: "tf.IfRegion" + %5 = "tf.IfRegion"(%arg1) ( { + // CHECK-NEXT: [[IF_REGION_THEN:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %6 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: "tf.Yield"([[IF_REGION_THEN]]) + "tf.Yield"(%6) : (tensor) -> () + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: [[IF_REGION_ELSE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %7 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: "tf.Yield"([[IF_REGION_ELSE]]) + "tf.Yield"(%7) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + + %8:2 = "tf.While"(%arg0, %arg1) {body = @while_body, cond = @while_cond, is_stateless = false} : (!tf_res, tensor) -> (!tf_res, tensor) + + // CHECK: "tf.WhileRegion" + %9 = "tf.WhileRegion"(%arg1) ( { + // CHECK-NEXT: ^{{.+}}({{.+}}: tensor): + ^cond(%carg0: tensor): + // CHECK-NEXT: [[WHILE_REGION_COND:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %10 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: "tf.Yield"([[WHILE_REGION_COND]]) + "tf.Yield"(%10) : (tensor) -> () + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: ^{{.+}}({{.+}}: tensor): + ^body(%barg0: tensor): + // CHECK-NEXT: [[WHILE_REGION_BODY:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %11 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: "tf.Yield"([[WHILE_REGION_BODY]]) + "tf.Yield"(%11) : (tensor) -> () + }) {is_stateless = false} : (tensor) -> tensor + + %12 = "tf.StatefulPartitionedCall"(%arg0) {f = @callee, config = "", config_proto = "", executor_type = ""} : (!tf_res) -> tensor + + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %13 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + + // CHECK: tf_device.return [[TRUE]] : + tf_device.return %13 : tensor + }) : () -> tensor + return +} + +// CHECK-LABEL: func @case_branch +func @case_branch(%arg0: !tf_res) -> tensor { + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %0 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: return [[TRUE]] : + return %0 : tensor +} + +// CHECK-LABEL: func @if_then +func @if_then(%arg0: !tf_res) -> tensor { + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %0 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: return [[TRUE]] : + return %0 : tensor +} + +// CHECK-LABEL: func @if_else +func @if_else(%arg0: !tf_res) -> tensor { + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %0 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: return [[TRUE]] : + return %0 : tensor +} + +// CHECK-LABEL: func @while_cond +// CHECK-SAME: ({{.+}}: tensor) +func 
@while_cond(%arg0: !tf_res, %arg1: tensor) -> tensor { + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %0 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: return [[TRUE]] : + return %0 : tensor +} + +// CHECK-LABEL: func @while_body +// CHECK-SAME: ({{.+}}: tensor) +func @while_body(%arg0: !tf_res, %arg1: tensor) -> (!tf_res, tensor) { + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %0 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: return [[TRUE]] : + return %arg0, %0 : !tf_res, tensor +} + +// CHECK-LABEL: func @callee +func @callee(%arg0: !tf_res) -> tensor { + // CHECK: [[TRUE:%.+]] = "tf.Const" + // CHECK-SAME: value = dense : tensor + %0 = "tf.VarIsInitializedOp"(%arg0) : (!tf_res) -> tensor + // CHECK-NEXT: return [[TRUE]] : + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 3e613573d42..428af91f155 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -530,6 +530,21 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr return %3#0, %3#1 : tensor<*xf32>, tensor<*xf32> } + // CHECK-LABEL: infer_device_cluster + func @infer_device_cluster(%arg0: tensor<1x8x2xi32>) -> (tensor<*xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %1 = "tf_device.cluster"() ({ + %2 = "tf.Cast"(%arg0) {Truncate = false} : (tensor<1x8x2xi32>) -> tensor<1x8x2xf32> + tf_device.return %2 : tensor<1x8x2xf32> + // CHECK: () -> tensor<1x8x2xf32> + }) : () -> tensor<*xf32> + // CHECK: "tf.Cast"(%{{.*}}) {Truncate = false} : (tensor<1x8x2xf32>) -> tensor<*xf32> + // CHECK: (tensor, tensor<1x8x2xf32>) -> (tensor<1x8x1xf32>, tensor<1x8x1xf32>) + %3:2 = "tf.Split"(%0, %1) {device = ""} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xf32>) + %4 = addf %1, %1 : tensor<*xf32> + return %3#0, %3#1 : tensor<*xf32>, tensor<*xf32> + } + // CHECK-LABEL: func @tensor_cast(%arg0: tensor<1xi32>) -> tensor<1xi32> func @tensor_cast(%arg0: tensor<1xi32>) -> tensor<*xi32> { // CHECK: %[[RESULT:.*]] = tensor_cast @@ -560,4 +575,15 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func @pcall_resource_result_func(%arg0: tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> { return %arg0 : tensor<*x!tf.resource>> } + + // Check that the fold for tf.Size does not crash with unranked output type. + // CHECK-LABEL: func @unranked_tf_size + func @unranked_tf_size() -> tensor<*xi32> { + %0 = "tf.Const"() {value = dense<[-1, 26]> : tensor<2xi32>} : () -> tensor<2xi32> + %add = "tf.AddV2"(%0, %0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<*xi32> + // CHECK: "tf.Size" + // CHECK-SAME: (tensor<2xi32>) -> tensor + %size = "tf.Size"(%add) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + return %size : tensor<*xi32> + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir index e4fdad2eddb..17329050f3e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/stack_ops_decomposition.mlir @@ -122,6 +122,108 @@ func @while_cond(%arg0: tensor, %arg1: tensor) -> tensor // ----- +// Tests WhileRegion Op. 
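+// A rough sketch of the decomposition exercised below (variable names are
+// illustrative): each tf.StackPushV2 is rewritten to read the buffer and size
+// local variables, write the element with tf.XlaDynamicUpdateSlice, and assign
+// the updated buffer and size back; tf.StackPopV2 becomes a tf.Slice of the
+// buffer plus an update of the size variable, and tf.StackCloseV2 is erased.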
+ +// CHECK-LABEL: func @main() +func @main() -> () { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.Stack + // CHECK: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK: tf.AssignVariableOp + // CHECK: tf.AssignVariableOp + %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + // CHECK: tf.WhileRegion + %while = "tf.WhileRegion"(%max_size) ({ + // CHECK: ^bb0(%[[BARG0:.*]]: tensor + ^bb0(%barg0: tensor): + // CHECK: "tf._SomeOp"(%[[BARG0]]) + %pred = "tf._SomeOp"(%barg0) : (tensor) -> tensor + "tf.Yield"(%pred) : (tensor) -> () + }, { + // CHECK: ^bb0(%[[BARG0:.*]]: tensor + ^bb0(%barg0: tensor): + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BARG0]], %[[CONST1]]) + %sub = "tf.Sub"(%barg0, %const1) : (tensor, tensor) -> tensor + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NOT: "tf.StackPushV2" + // CHECK: %[[BUFFER_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) + // CHECK: %[[SIZE_VAL:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BUFFER_VAL]] + // CHECK: "tf.AssignVariableOp"(%[[BUFFER]], %[[UPDATE]]) + // CHECK: "tf.AssignVariableOp"(%[[SIZE]] + // CHECK-NOT: "tf.StackPushV2" + %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + // CHECK: "tf.Yield"(%[[SUB]]) + "tf.Yield"(%sub) : (tensor) -> () + }) {is_stateless = false} + : (tensor) -> tensor + // CHECK-NOT: tf.StackPopV2 + // CHECK: %[[BUFFER_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) + // CHECK: %[[SIZE_VAL:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) + // CHECK: %[[POP_VAL:.*]] = "tf.Slice"(%[[BUFFER_VAL]] + // CHECK: "tf.AssignVariableOp"(%[[SIZE]] + %pop = "tf.StackPopV2"(%stack) : (tensor) -> tensor + // CHECK-NOT: tf.StackCloseV2 + "tf.StackCloseV2"(%stack) : (tensor) -> () + return +} + +// ----- + +// Test CaseRegionOp + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[BRANCH_INDEX:.*]]: tensor +func @main(%arg0: tensor) -> () { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.StackV2 + // CHECK: %[[BUFFER:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK: %[[SIZE:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + // CHECK: tf.AssignVariableOp + // CHECK: tf.AssignVariableOp + %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + // CHECK: %[[CASE_OUTPUT:.*]] = "tf.CaseRegion"(%[[BRANCH_INDEX]]) ( { + %case_op = "tf.CaseRegion"(%arg0) ({ + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NOT: tf.StackPushV2 + // CHECK: %[[BUFFER_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) + // CHECK: %[[SIZE_VAL:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BUFFER_VAL]] + // CHECK: "tf.AssignVariableOp"(%[[BUFFER]], %[[UPDATE]]) + // CHECK: "tf.AssignVariableOp"(%[[SIZE]] + %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + "tf.Yield"(%elem) : (tensor) -> () + }, { + %elem = "tf._SomeOtherOp"() : () -> tensor + // CHECK-NOT: tf.StackPushV2 + // CHECK: %[[BUFFER_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) + // CHECK: %[[SIZE_VAL:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[BUFFER_VAL]] + // CHECK: "tf.AssignVariableOp"(%[[BUFFER]], 
%[[UPDATE]]) + // CHECK: "tf.AssignVariableOp"(%[[SIZE]] + %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + "tf.Yield"(%elem) : (tensor) -> () + }, { + // CHECK-NOT: tf.StackPopV2 + // CHECK: %[[BUFFER_VAL:.*]] = "tf.ReadVariableOp"(%[[BUFFER]]) + // CHECK: %[[SIZE_VAL:.*]] = "tf.ReadVariableOp"(%[[SIZE]]) + // CHECK: %[[POP_VAL:.*]] = "tf.Slice"(%[[BUFFER_VAL]] + // CHECK: "tf.AssignVariableOp"(%[[SIZE]] + %pop = "tf.StackPopV2"(%stack) : (tensor) -> tensor + "tf.Yield"(%pop) : (tensor) -> () + }) {is_stateless = false} + : (tensor) -> tensor + // CHECK-NOT: tf.StackPopV2 + %pop = "tf.StackPopV2"(%stack) : (tensor) -> tensor + // CHECK-NOT: tf.StackCloseV2 + "tf.StackCloseV2"(%stack) : (tensor) -> () + return +} + +// ----- // Tests IfOp. // CHECK-LABEL: func @main @@ -308,3 +410,53 @@ func @if_else(%arg0: tensor, %arg1: tensor) -> tenso %push = "tf.StackPushV2"(%arg1, %elem) {swap_memory = false} : (tensor, tensor) -> tensor return %arg1 : tensor } + +// ----- + +// Tests that the pass returns meaningful error message when WhileRegion op has +// resource arguments. +func @main() -> () { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + %elem = "tf._SomeOp"() : () -> tensor + %push_0 = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + // expected-error @+1 {{found unexpected type 'tensor>>' of operand #0, resource type operands are expected to have been canonicalized away for region based control flow ops}} + %1:2 = "tf.WhileRegion"(%stack, %max_size) ({ + ^bb0 (%carg0: tensor, %carg1: tensor): + %pred = "tf._SomeOp"(%carg1) : (tensor) -> tensor + "tf.Yield"(%pred) : (tensor) -> () + }, { + ^bb0 (%carg0: tensor, %carg1: tensor): + %const1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %sub = "tf.Sub"(%carg1, %const1) : (tensor, tensor) -> tensor + %push_1 = "tf.StackPushV2"(%carg0, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + "tf.Yield"(%carg0, %sub) : (tensor, tensor) -> () + }) {is_stateless = false} + : (tensor, tensor) -> (tensor, tensor) + %pop = "tf.StackPopV2"(%1#0) : (tensor) -> tensor + "tf.StackCloseV2"(%stack) : (tensor) -> () + return +} + +// ----- + +// Tests that the pass returns meaningful error message when IfRegion op has +// resource returns. 
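The pass expects resource operands and results of region-based control flow to have been canonicalized away before it runs, so yielding the stack handle out of a tf.IfRegion is rejected with the error matched below. For contrast, a canonical form captures the resource implicitly from the enclosing scope and yields only non-resource values; a rough sketch (hypothetical names, assuming an untyped !tf.resource handle):

%elem = "tf.IfRegion"(%pred) ({
  %e = "tf._SomeOp"() : () -> tensor<f32>
  %flow = "tf.StackPushV2"(%stack, %e) {swap_memory = false} : (tensor<!tf.resource>, tensor<f32>) -> tensor<f32>
  "tf.Yield"(%e) : (tensor<f32>) -> ()
}, {
  %p = "tf.StackPopV2"(%stack) : (tensor<!tf.resource>) -> tensor<f32>
  "tf.Yield"(%p) : (tensor<f32>) -> ()
}) {is_stateless = false} : (tensor<i1>) -> tensor<f32>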
+ +func @main(%arg0: tensor) -> () { + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %stack = "tf.StackV2"(%max_size) {elem_type = f32, stack_name = "s"} : (tensor) -> tensor + // expected-error @+1 {{found unexpected type 'tensor' of result #0, resource type results are expected to have been canonicalized away for region based control flow ops}} + %if_op = "tf.IfRegion"(%arg0) ({ + %elem = "tf._SomeOp"() : () -> tensor + %push = "tf.StackPushV2"(%stack, %elem) {swap_memory = false} : (tensor, tensor) -> tensor + "tf.Yield"(%stack) : (tensor) -> () + }, { + %pop = "tf.StackPopV2"(%stack) : (tensor) -> tensor + "tf.Yield"(%stack) : (tensor) -> () + }) {is_stateless = false} + : (tensor) -> tensor + %pop = "tf.StackPopV2"(%if_op) : (tensor) -> tensor + "tf.StackCloseV2"(%stack) : (tensor) -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir index b65e88c589a..0c4dc77cf69 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_array_ops_decomposition.mlir @@ -54,6 +54,22 @@ func @main() -> tensor { // ----- +// Test inferring shape from the first scatter. + +// CHECK-LABEL: func @main +func @main() -> tensor { + %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<*>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor, tensor) + %indices = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + %values = "tf.Const"() {value = dense<[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %write = "tf.TensorArrayScatterV3"(%ta#0, %indices, %values, %ta#1) : (tensor, tensor<2xi32>, tensor<2x3xf32>, tensor) -> tensor + %size_out = "tf.TensorArraySizeV3"(%ta#0, %write) : (tensor, tensor) -> tensor + return %size_out : tensor +} + +// ----- + // Test tensor array concat and split. 
// CHECK-LABEL: func @main @@ -259,6 +275,13 @@ func @main() -> () { // CHECK: %[[READ:.*]] = "tf.ReadVariableOp"(%[[VAR]]) : (tensor>>) -> tensor<5x3xf32> // CHECK: "tf.Slice"(%[[READ]], %read = "tf.TensorArrayReadV3"(%1, %index, %ta#1) : (tensor, tensor, tensor) -> tensor<3xf32> + // CHECK: %[[READ_GVAR1:.*]] = "tf.ReadVariableOp"(%[[GVAR1]]) + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[READ_GVAR1]], + // CHECK: "tf.AssignVariableOp"(%[[GVAR1]], %[[UPDATE]]) + %const = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %elem = "tf._SomeOp"() : () -> tensor<3xf32> + %grad:2 = "tf.TensorArrayGradV3"(%ta#0, %ta#1) {source = "a"} : (tensor, tensor) -> (tensor, tensor) + %gwrite = "tf.TensorArrayWriteV3"(%grad#0, %const, %elem, %grad#1) : (tensor, tensor, tensor<3xf32>, tensor) -> tensor return } // CHECK: func @then_branch(%[[TARG0:.*]]: tensor>>, %[[TARG1:.*]]: tensor>>, %[[TARG2:.*]]: tensor>>) @@ -412,6 +435,32 @@ func @callee() -> tensor attributes {sym_visibility = "public"} { // ----- +// CHECK-LABEL: func @main +func @main() -> () { + // CHECK: "tf.PartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @callee} : () -> tensor<*xf32> + %call = "tf.PartitionedCall"() {config = "", config_proto = "", executor_type = "", f = @callee} : () -> (tensor<*xf32>) + return +} +func @callee() -> (tensor<*xf32>) attributes {sym_visibility = "private"} { + %size = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + // CHECK: %[[LOCAL_VAR:.*]] = "tf.MlirLocalVarOp"() : () -> tensor>> + %ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<*>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor) -> (tensor>>, tensor) + %index = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %value = "tf.Const"() {value = dense<[1.0, 2.0, 3.0]> : tensor<3xf32>} : () -> tensor<3xf32> + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: "tf.AssignVariableOp"(%[[LOCAL_VAR]], %[[UPDATE]]) : (tensor>>, tensor<5x3xf32>) -> () + %flow = "tf.TensorArrayWriteV3"(%ta#0, %index, %value, %ta#1) : (tensor>>, tensor, tensor<3xf32>, tensor) -> tensor + // CHECK: %[[SLICE:.*]] = "tf.Slice" + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<3> : tensor<1xi32>} + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) + %val = "tf.TensorArrayReadV3"(%ta#0, %index, %ta#1) : (tensor>>, tensor, tensor) -> tensor<*xf32> + // CHECK: %[[CAST:.*]] = tensor_cast %[[ELEM]] : tensor<3xf32> to tensor<*xf32> + // CHECK: return %[[CAST]] : tensor<*xf32> + return %val : tensor<*xf32> +} + +// ----- + // Test the pass reports failure on unknown size. 
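The decomposition materializes a fixed-size backing buffer for the array, so it needs the size operand to be a compile-time constant; when the size arrives as a runtime tensor argument there is nothing to allocate and the pass is expected to fail, which is what the case below checks. For contrast, a constant-size array like the following is decomposable (illustrative sketch; the element_shape value and result types are assumptions):

%size = "tf.Const"() {value = dense<5> : tensor<i32>} : () -> tensor<i32>
%ta:2 = "tf.TensorArrayV3"(%size) {dtype = f32, element_shape = #tf.shape<3>, dynamic_size = false, clear_after_read = true, identical_element_shapes = true, tensor_array_name = "ta"} : (tensor<i32>) -> (tensor<!tf.resource>, tensor<f32>)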
func @main(%arg0: tensor) -> () { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir index 92cb0458bf9..09a2dcb6713 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tensor_list_ops_decomposition.mlir @@ -34,7 +34,7 @@ func @main() -> (tensor, tensor) { // CHECK-NEXT: %[[SCALAR_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} // CHECK-NEXT: %[[LENGTH:.*]] = "tf.Reshape"(%[[NEW_SIZE]], %[[SCALAR_SHAPE]]) %length = "tf.TensorListLength"(%push) : (tensor>>) -> tensor - // CHECK-NEXT: return %[[ELEM]], %[[LENGTH]] : tensor, tensor + // CHECK-NEXT: return %[[ELEM]], %[[LENGTH]] : tensor, tensor return %pop#1, %length: tensor, tensor } @@ -81,7 +81,7 @@ func @main(%arg0: tensor) -> (tensor, tensor<10xf32>, tensor) { %stack = "tf.TensorListStack"(%addn2, %elem_shape) : (tensor>>, tensor<0xi32>) -> tensor<10xf32> // CHECK-NEXT: %[[LEN:.*]] = "tf.Const"() {value = dense<10> : tensor} : () -> tensor %length = "tf.TensorListLength"(%addn2) : (tensor>>) -> tensor - // CHECK-NEXT: return %[[ELEM]], %[[ADDN2]], %[[LEN]] : tensor, tensor<10xf32>, tensor + // CHECK-NEXT: return %[[ELEM]], %[[ADDN2]], %[[LEN]] : tensor, tensor<10xf32>, tensor return %get, %stack, %length : tensor, tensor<10xf32>, tensor } @@ -104,7 +104,7 @@ func @main(%arg0: tensor, %arg1: tensor<10xf32>) -> tensor { // CHECK-NEXT: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> // CHECK-NEXT: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) : (tensor<1xf32>, tensor<0xi32>) -> tensor %get = "tf.TensorListGetItem"(%tl, %arg0, %elem_shape) : (tensor>>, tensor, tensor<0xi32>) -> tensor - // CHECK-NEXT: return %[[ELEM]] : tensor + // CHECK-NEXT: return %[[ELEM]] : tensor return %get: tensor } @@ -118,7 +118,7 @@ func @main(%arg0: tensor<10x8x9xf32>) -> tensor<2xi64> { %tl = "tf.TensorListFromTensor"(%arg0, %elem_shape) : (tensor<10x8x9xf32>, tensor<2xi32>) -> tensor>> // CHECK: %[[SHAPE:.*]] = "tf.Const"() {value = dense<[8, 9]> : tensor<2xi64>} : () -> tensor<2xi64> %shape = "tf.TensorListElementShape"(%tl) : (tensor>>) -> tensor<2xi64> - // CHECK-NEXT: return %[[SHAPE]] : tensor<2xi64> + // CHECK-NEXT: return %[[SHAPE]] : tensor<2xi64> return %shape: tensor<2xi64> } @@ -135,7 +135,7 @@ func @main(%arg0: tensor<10x8x9xf32>, %arg1: tensor<3xi32>) -> tensor<3x8x9xf32> // CHECK: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor // CHECK: %[[GATHER:.*]] = "tf.GatherV2"(%[[BUFFER]], %[[ARG1]], %[[AXIS]]) : (tensor<10x8x9xf32>, tensor<3xi32>, tensor) -> tensor<3x8x9xf32> %gather = "tf.TensorListGather"(%tl, %arg1, %elem_shape) : (tensor>>, tensor<3xi32>, tensor<2xi32>) -> tensor<3x8x9xf32> - // CHECK-NEXT: return %[[GATHER]] : tensor<3x8x9xf32> + // CHECK-NEXT: return %[[GATHER]] : tensor<3x8x9xf32> return %gather: tensor<3x8x9xf32> } @@ -173,7 +173,7 @@ func @main() -> () { : (tensor>>, tensor) -> (tensor>>, tensor) // CHECK: "tf.Slice" %pop:2 = "tf.TensorListPopBack"(%1#0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) - // CHECK-NOT: tf.EmptyTensorList + // CHECK-NOT: tf.TensorListPopBack // CHECK: return return } @@ -242,7 +242,7 @@ func @if_else(%arg0: tensor>>) -> tensor, tensor<0xi32>) -> tensor // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) - // 
CHECK: return %[[COPY]], %[[SUB]] + // CHECK: return %[[COPY]], %[[SUB]] return %pop#0 : tensor>> } @@ -289,7 +289,7 @@ func @branch_1(%arg0: tensor>>) -> tensor, tensor<0xi32>) -> tensor // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) - // CHECK: return %[[COPY]], %[[SUB]] + // CHECK: return %[[COPY]], %[[SUB]] return %pop#0 : tensor>> } // CHECK: func @branch_2(%[[EARG0:.*]]: tensor<10xf32>, %[[EARG1:.*]]: tensor<1xi32>) -> (tensor<10xf32>, tensor<1xi32>) @@ -305,9 +305,145 @@ func @branch_2(%arg0: tensor>>) -> tensor, tensor<0xi32>) -> tensor // CHECK-NOT: "tf.TensorListPopBack" %pop:2 = "tf.TensorListPopBack"(%arg0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) - // CHECK: return %[[COPY]], %[[SUB]] + // CHECK: return %[[COPY]], %[[SUB]] return %pop#0 : tensor>> } + +// ----- + +// CHECK-LABEL: func @main +func @main() -> tensor { + %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK-NOT: tf.EmptyTensorList + %tl = "tf.EmptyTensorList"(%elem_shape, %size) : (tensor<0xi32>, tensor) -> tensor>> + %while_op:2 = "tf.WhileRegion"(%tl, %size) ( { + // CHECK: ^bb0(%[[CARG0:.*]]: tensor<10xf32>, %[[CARG1:.*]]: tensor, %[[CARG2:.*]]: tensor<1xi32>): + ^bb0(%arg0: tensor>>, %arg1: tensor): // no predecessors + // CHECK: %[[PRED:.*]] = "tf._SomeOp"() + // CHECK: "tf.Yield"(%[[PRED]]) + %pred = "tf._SomeOp"() : () -> tensor + "tf.Yield"(%pred) : (tensor) -> () + }, { + // CHECK: ^bb0(%[[CARG0:.*]]: tensor<10xf32>, %[[CARG1:.*]]: tensor, %[[CARG2:.*]]: tensor<1xi32>): + ^bb0(%arg0: tensor>>, %arg1: tensor): // no predecessors + // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[CARG1]], %[[CST]]) + // CHECK: %[[ELEM:.*]] = "tf._SomeOp"() : () -> tensor + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %sub = "tf.Sub"(%arg1, %cst) : (tensor, tensor) -> tensor + %elem = "tf._SomeOp"() : () -> tensor + // CHECK-NOT: "tf.TensorListPushBack" + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice"(%[[CARG0]] + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[CARG2]], %[[ONE]]) + // CHECK-NOT: "tf.TensorListPushBack" + // CHECK: "tf.Yield"(%[[UPDATE]], %[[SUB]], %[[ADD]]) + // CHECK: }) {is_stateless = false} + %push = "tf.TensorListPushBack"(%arg0, %elem) : (tensor>>, tensor) -> tensor>> + "tf.Yield"(%push, %sub) : (tensor>>, tensor) -> () + }) {is_stateless = false} : (tensor>>, tensor) -> (tensor>>, tensor) + // CHECK: "tf.Slice" + // CHECK-NOT: tf.TensorListPopBack + %pop:2 = "tf.TensorListPopBack"(%while_op#0, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: return + return %pop#1 : tensor +} +// ----- + +// CHECK-LABEL: func @main +func @main(%arg0: tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[ZERO_F32:.*]] = "tf.Cast"(%[[ZERO]]) + // CHECK: %[[MAX_SIZE:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} + // CHECK: %[[BUFFER:.*]] = "tf.BroadcastTo"(%[[ZERO_F32]], %[[MAX_SIZE]]) + // CHECK: %[[BUFFER_SIZE:.*]] = 
"tf.Const"() {value = dense<0> : tensor<1xi32>} + // CHECK-NOT: tf.EmptyTensorList + %if_op = "tf.IfRegion"(%arg0) ({ + %elem = "tf._SomeOp"() : () -> tensor + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[BUFFER_SIZE]], %[[ONE]]) + // CHECK-NOT: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%tl, %elem) : (tensor>>, tensor) -> tensor>> + "tf.Yield" (%push) : (tensor>>) -> () + }, { + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[BUFFER]]) + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BUFFER_SIZE]], %[[ONE]]) + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) + // CHECK-NOT: "tf.TensorListPopBack" + %pop:2 = "tf.TensorListPopBack"(%tl, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: "tf.Yield"(%[[COPY]], %[[SUB]]) + "tf.Yield" (%pop#0) : (tensor>>) -> () + }) + {is_stateless = false} + : (tensor) -> tensor>> + // CHECK: "tf.Slice" + // CHECK-NOT: tf.TensorListPopBack + %pop:2 = "tf.TensorListPopBack"(%if_op, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + return +} + +// ----- + +// CHECK-LABEL: func @main +func @main(%arg0: tensor) -> () { + %elem_shape = "tf.Const"() {value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %max_size = "tf.Const"() {value = dense<10> : tensor} : () -> tensor + // CHECK: %[[ZERO:.*]] = "tf.Const"() {value = dense<0> : tensor} + // CHECK: %[[ZERO_F32:.*]] = "tf.Cast"(%[[ZERO]]) + // CHECK: %[[MAX_SIZE:.*]] = "tf.Const"() {value = dense<10> : tensor<1xi32>} + // CHECK: %[[BUFFER:.*]] = "tf.BroadcastTo"(%[[ZERO_F32]], %[[MAX_SIZE]]) + // CHECK: %[[BUFFER_SIZE:.*]] = "tf.Const"() {value = dense<0> : tensor<1xi32>} + // CHECK-NOT: tf.EmptyTensorList + %tl = "tf.EmptyTensorList"(%elem_shape, %max_size) : (tensor<0xi32>, tensor) -> tensor>> + %case_op = "tf.CaseRegion"(%arg0) ({ + %elem = "tf._SomeOp"() : () -> tensor + // CHECK: %[[UPDATE:.*]] = "tf.XlaDynamicUpdateSlice" + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ADD:.*]] = "tf.AddV2"(%[[BUFFER_SIZE]], %[[ONE]]) + // CHECK-NOT: "tf.TensorListPushBack" + %push = "tf.TensorListPushBack"(%tl, %elem) : (tensor>>, tensor) -> tensor>> + "tf.Yield" (%push) : (tensor>>) -> () + }, { + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[BUFFER]]) + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SUB:.*]] = "tf.Sub"(%[[BUFFER_SIZE]], %[[ONE]]) + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) + // CHECK-NOT: "tf.TensorListPopBack" + %pop:2 = "tf.TensorListPopBack"(%tl, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: "tf.Yield"(%[[COPY]], %[[SUB]]) + "tf.Yield" (%pop#0) : (tensor>>) -> () + }, { + // CHECK: %[[COPY:.*]] = "tf.Identity"(%[[BUFFER]]) + // CHECK: %[[ONE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: 
%[[SUB:.*]] = "tf.Sub"(%[[BUFFER_SIZE]], %[[ONE]]) + // CHECK: %[[SLICE_SIZE:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi32>} + // CHECK: %[[SLICE:.*]] = "tf.Slice"(%[[COPY]], %[[SUB]], %[[SLICE_SIZE]]) + // CHECK: %[[ELEM_SHAPE:.*]] = "tf.Const"() {value = dense<> : tensor<0xi32>} + // CHECK: %[[ELEM:.*]] = "tf.Reshape"(%[[SLICE]], %[[ELEM_SHAPE]]) + // CHECK-NOT: "tf.TensorListPopBack" + %pop:2 = "tf.TensorListPopBack"(%tl, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + // CHECK: "tf.Yield"(%[[COPY]], %[[SUB]]) + "tf.Yield" (%pop#0) : (tensor>>) -> () + }) {is_stateless = false} + : (tensor) -> tensor>> + // CHECK: "tf.Slice" + // CHECK-NOT: tf.TensorListPopBack + %pop:2 = "tf.TensorListPopBack"(%case_op, %elem_shape) : (tensor>>, tensor<0xi32>) -> (tensor>>, tensor) + return +} + // ----- // Tests PartitionedCall/StatefulPartitionedCall. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 9a8d97eddf1..8b97bfdad6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -183,6 +183,20 @@ func @testLeakyWrongAlphaType(tensor<16xf32>) -> tensor<16xf32> { // ----- +// Test tf.Min with complex numbers. +// Previous versions of tensorflow said complex numbers were allowed with +// tf.Min even though it doesn't make sense. The legalization of tf to xla +// requires that complex types are not allowed in tf.Min, so we have an +// explicit unit here to make sure that invariant is enforced. +func @testMinComplex(%arg0: tensor<4x8xcomplex>) -> tensor<4x1xcomplex> { + %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64> + // expected-error@below {{'tf.Min' op operand #0 must be tensor of}} + %0 = "tf.Min"(%arg0, %dimension) { keep_dims = true }: (tensor<4x8xcomplex>, tensor<1xi64>) -> tensor<4x1xcomplex> + return %0 : tensor<4x1xcomplex> +} + +// ----- + // CHECK-LABEL: func @testMul func @testMul(%arg0: tensor<2xui16>) -> (tensor<2xui16>) { %0 = "tf.Mul"(%arg0, %arg0) {T = "tfdtype$DT_UINT16", device = "/device:CPU:0", name = "Mul"} : (tensor<2xui16>, tensor<2xui16>) -> tensor<2xui16> @@ -210,17 +224,17 @@ func @testIncompatibleElementTypes(%arg0: tensor<3x2xf32>, %arg1: tensor<3x2xf32 // ----- // CHECK-LABEL: func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) -func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) -> (tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32>) { +func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<10000xf32>, %arg3: tensor<*xi32>) -> (tensor<100x100xf32>, tensor<*xf32>, tensor<100x100xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32>) { %shape1 = constant dense<100> : tensor<2xi32> - %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<*xf32>, tensor<2xi32>) -> (tensor<100x100xf32>) - %shape2 = "tf.Shape"(%arg0) {device = "", name = "Shape", T = "tfdtype$DT_FLOAT", out_type = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> (tensor) - %r2 = "tf.Reshape"(%arg1, %shape2) {device = "", name = "Reshape_1", T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>) - %r3 = "tf.Reshape"(%arg2, %shape1) {device = "", name = "Reshape_1", T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<10000xf32>, tensor<2xi32>) -> (tensor<10000xf32>) + %r1 = 
"tf.Reshape" (%arg0, %shape1) : (tensor<*xf32>, tensor<2xi32>) -> tensor<100x100xf32> + %shape2 = "tf.Shape"(%arg0) : (tensor<*xf32>) -> tensor + %r2 = "tf.Reshape"(%arg1, %shape2) : (tensor<*xf32>, tensor) -> tensor<*xf32> + %r3 = "tf.Reshape"(%arg2, %shape1) : (tensor<10000xf32>, tensor<2xi32>) -> tensor<100x100xf32> %shape3 = constant dense<[-1, 100]> : tensor<2xi32> - %r4 = "tf.Reshape"(%arg2, %shape3) {device = "", name = "Reshape_1", T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<10000xf32>, tensor<2xi32>) -> (tensor<100x100xf32>) - %r5 = "tf.Reshape"(%arg0, %arg3) {T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<*xf32>, tensor<*xi32>) -> (tensor<*xf32>) - %r6 = "tf.Reshape"(%arg2, %arg3) {T = "tfdtype$DT_FLOAT", Tshape = "tfdtype$DT_INT32"} : (tensor<10000xf32>, tensor<*xi32>) -> (tensor<*xf32>) - return %r1, %r2, %r3, %r4, %r5, %r6: tensor<100x100xf32>, tensor<*xf32>, tensor<10000xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32> + %r4 = "tf.Reshape"(%arg2, %shape3) : (tensor<10000xf32>, tensor<2xi32>) -> tensor<100x100xf32> + %r5 = "tf.Reshape"(%arg0, %arg3) : (tensor<*xf32>, tensor<*xi32>) -> tensor<*xf32> + %r6 = "tf.Reshape"(%arg2, %arg3) : (tensor<10000xf32>, tensor<*xi32>) -> tensor<*xf32> + return %r1, %r2, %r3, %r4, %r5, %r6: tensor<100x100xf32>, tensor<*xf32>, tensor<100x100xf32>, tensor<100x100xf32>, tensor<*xf32>, tensor<*xf32> } // ----- @@ -228,26 +242,42 @@ func @testReshape(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<1000 func @testReshape(tensor<*xf32>, tensor<*xf32>) -> (tensor<100x100xf32>) { ^bb0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>): %shape1 = constant dense<100.> : tensor<2xf32> - // expected-error @+1 {{must be tensor of 32/64-bit signless integer values}} - %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<*xf32>, tensor<2xf32>) -> (tensor<100x100xf32>) + // expected-error @+1 {{must be tensor of 32/64-bit signed integer values}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<*xf32>, tensor<2xf32>) -> tensor<100x100xf32> return %r1 : tensor<100x100xf32> } // ----- // tf.Reshape with incorrect element number. -func @testReshape(%arg0: tensor<10x10x10xf32>) -> tensor<100x100xf32> { - %shape1 = constant dense<100> : tensor<2xi32> - // expected-error @+1 {{number of output elements (10000) does not match expected number of elements (1000)}} - %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10xf32>, tensor<2xi32>) -> (tensor<100x100xf32>) +func @testReshape(%arg0: tensor<10x10x10xf32>, %shape1: tensor<2xi32>) -> tensor<100x100xf32> { + // expected-error @+1 {{requires 'output' number of elements to match 'tensor' number of elements, but got 10000 and 1000}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10xf32>, tensor<2xi32>) -> tensor<100x100xf32> return %r1 : tensor<100x100xf32> } +// ----- +// tf.Reshape with incorrect shape operand rank. +func @testReshape(%arg0: tensor<10x10x10xf32>, %shape1: tensor<2x2xi32>) -> tensor<*xf32> { + // expected-error @+1 {{requires 'shape' to be rank 1, but got 2}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10xf32>, tensor<2x2xi32>) -> tensor<*xf32> + return %r1 : tensor<*xf32> +} + // ----- // tf.Reshape with more than one -1 in the shape. 
func @testReshape(%arg0: tensor<10x10x10x10xf32>) -> tensor<100x100xf32> { %shape1 = constant dense<-1> : tensor<2xi32> - // expected-error @+1 {{more than one component of shape are -1}} - %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10x10xf32>, tensor<2xi32>) -> (tensor<100x100xf32>) + // expected-error @+1 {{requires 'shape' to have at most one dynamic dimension, but got multiple dynamic dimensions at indices 0 and 1}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10x10xf32>, tensor<2xi32>) -> tensor<100x100xf32> + return %r1 : tensor<100x100xf32> +} + +// ----- +// tf.Reshape with shape operand element < -1. +func @testReshape(%arg0: tensor<10x10x10x10xf32>) -> tensor<100x100xf32> { + %shape1 = constant dense<[100, -2]> : tensor<2xi32> + // expected-error @+1 {{requires 'shape' to have dimensions greater than -1, but got -2 at index 1}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10x10xf32>, tensor<2xi32>) -> tensor<100x100xf32> return %r1 : tensor<100x100xf32> } @@ -255,19 +285,68 @@ func @testReshape(%arg0: tensor<10x10x10x10xf32>) -> tensor<100x100xf32> { // tf.Reshape with -1 in the shape can't infer the dimension. func @testReshape(%arg0: tensor<10x10x10x10xf32>) -> tensor<100x100xf32> { %shape1 = constant dense<[101, -1]> : tensor<2xi32> - // expected-error @+1 {{one component of shape is -1 but couldn't infer the dimension}} - %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10x10xf32>, tensor<2xi32>) -> (tensor<100x100xf32>) + // expected-error @+1 {{requires 'tensor' number of elements be a multiple of 101, but got 10000}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10x10xf32>, tensor<2xi32>) -> tensor<100x100xf32> return %r1 : tensor<100x100xf32> } // ----- -// tf.Reshape with a first operand that has non-static shape. +// tf.Reshape with incorrect output rank. +func @testReshape(%arg0: tensor<10x10xf32>) -> tensor { + %shape1 = constant dense<[100]> : tensor<1xi32> + // expected-error @+1 {{requires 'output' type 'tensor' to be cast compatible with expected type 'tensor<100xf32>'}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10xf32>, tensor<1xi32>) -> tensor + return %r1 : tensor +} + +// ----- +// tf.Reshape with incorrect output dimension. +func @testReshape(%arg0: tensor<1000xf32>) -> tensor { + %shape1 = constant dense<[10, 10, 10]> : tensor<3xi32> + // expected-error @+1 {{requires 'output' type 'tensor' to be cast compatible with expected type 'tensor<10x10x10xf32>'}} + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<1000xf32>, tensor<3xi32>) -> tensor + return %r1 : tensor +} + +// ----- +// tf.Reshape with a shape operand that has 0 for one of its elements. +func @testReshape(%arg0: tensor<10x10x10xf32>) -> tensor { + %shape1 = constant dense<[-1, 0]> : tensor<2xi32> + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10xf32>, tensor<2xi32>) -> tensor + return %r1 : tensor +} + +// ----- +// tf.Reshape with a tensor operand that has 0 for one of its elements. +func @testReshape(%arg0: tensor<10x10x0xf32>) -> tensor { + %shape1 = constant dense<[-1, 0]> : tensor<2xi32> + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x0xf32>, tensor<2xi32>) -> tensor + return %r1 : tensor +} + +// ----- +// tf.Reshape with a tensor operand that has non-static shape. 
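When the operand has a dynamic dimension, or the shape value is not a constant, the element-count comparison cannot be performed, so the verifier conservatively accepts the op; the remaining cases only check that nothing is rejected. A sketch of an accepted pattern (the operand %arg0 and the chosen target shape are illustrative):

// With 10 x 10 x ? elements the total count is unknown, so it cannot be
// checked against the 100 elements implied by the target shape [4, 25].
%shape = constant dense<[4, 25]> : tensor<2xi32>
%r = "tf.Reshape"(%arg0, %shape) : (tensor<10x10x?xf32>, tensor<2xi32>) -> tensor<4x25xf32>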
func @testReshape(%arg0: tensor<10x10x?xf32>) -> tensor<10x10xf32> { %shape1 = constant dense<[10, 10]> : tensor<2xi32> - %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x?xf32>, tensor<2xi32>) -> (tensor<10x10xf32>) + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x?xf32>, tensor<2xi32>) -> tensor<10x10xf32> return %r1 : tensor<10x10xf32> } +// ----- +// tf.Reshape with tensor operand that has non-static shape and shape operand +// with static shape. +func @testReshape(%arg0: tensor<10x10x?xf32>, %shape1: tensor<2xi32>) -> tensor<100x100xf32> { + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x?xf32>, tensor<2xi32>) -> tensor<100x100xf32> + return %r1 : tensor<100x100xf32> +} + +// ----- +// tf.Reshape with tensor and shape operands with static shape. +func @testReshape(%arg0: tensor<10x10x10x10xf32>, %shape1: tensor<2xi32>) -> tensor<100x100xf32> { + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x10x10xf32>, tensor<2xi32>) -> tensor<100x100xf32> + return %r1 : tensor<100x100xf32> +} + // ----- // CHECK-LABEL: func @testValidAvgPool @@ -780,7 +859,7 @@ func @testIfElse(tensor<2xf32>) -> tensor<2xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{expects all branches to have 1 input(s), but 'then_branch' has 2 input(s)}} + // expected-error @+1 {{'tf.If' op 'then_branch' inputs (size = 2) should have the same number of values as inputs (size = 1)}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -798,7 +877,7 @@ func @testIfElse(tensor<2xf32>) -> tensor<2xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{expects all branches to have 1 result(s), but 'then_branch' has 2 result(s)}} + // expected-error @+1 {{'tf.If' op 'then_branch' results (size = 2) should have the same number of values as results (size = 1)}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -816,7 +895,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<*xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{expects operand type 'tensor<2xf32>' to be cast compatible with 'then_branch' input type 'tensor<*xf16>' at index 0}} + // expected-error @+1 {{'tf.If' op 'then_branch' input type tensor<*xf16> is incompatible with input type tensor<2xf32> at index 0}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -852,7 +931,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<3xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): - // expected-error @+1 {{expects result type 'tensor<2xf32>' to be cast compatible with 'else_branch' result type 'tensor<3xf32>' at index 0}} + // expected-error @+1 {{'tf.If' op 'else_branch' result type tensor<3xf32> is incompatible with result type tensor<2xf32> at index 0}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse, @@ -1000,7 +1079,7 @@ func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> ten // tf.Region yield number of results should match op number of results func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 
{{'tf.IfRegion' op then should have same number (1) of results as tf.IfRegion but has 2 results}} + // expected-error @+1 {{'tf.IfRegion' op then results (size = 2) should have the same number of values as results (size = 1)}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () @@ -1015,7 +1094,7 @@ func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> te // ----- func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{tf.IfRegion' op else should have same number (1) of results as tf.IfRegion but has 2 results}} + // expected-error @+1 {{'tf.IfRegion' op else results (size = 2) should have the same number of values as results (size = 1)}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () @@ -1031,7 +1110,7 @@ func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> te // tf.IfRegion yield types should match op result types func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{then result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + // expected-error @+1 {{'tf.IfRegion' op then result type tensor is incompatible with result type tensor<2xf32> at index 0}} %0 = "tf.IfRegion"(%arg0) ({ "tf.Yield"(%arg0) : (tensor) -> () }, { @@ -1045,7 +1124,7 @@ func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) - // ----- func @testIfRegionOpYieldMismatchElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{else result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + // expected-error @+1 {{'tf.IfRegion' op else result type tensor is incompatible with result type tensor<2xf32> at index 0}} %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () @@ -1434,6 +1513,110 @@ func @testSoftmaxCrossEntropyWithLogits(%arg0: tensor<3xf32>, %arg1: tensor<3xf3 // ----- +//===--------------------------------------------------------------------===// +// tf.SpaceToBatchND +//===--------------------------------------------------------------------===// + +// Test valid tf.SpaceToBatchND +// CHECK-LABEL: func @testSpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2x2xi64>) -> tensor { + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// Test valid tf.SpaceToBatchND +// CHECK-LABEL: func @testSpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>) -> tensor<36x2x3x10xf32> { + %block_shape = "tf.Const"() {value = dense<[4, 3]> : tensor<2xi64>} : () -> tensor<2xi64> + %paddings = "tf.Const"() {value = dense<[[1, 2], [1, 1]]> : tensor<2x2xi64>} : () -> tensor<2x2xi64> + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor<36x2x3x10xf32> + return %0 : tensor<36x2x3x10xf32> +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2x2xi64>, %paddings: tensor<2x2xi64>) -> tensor { + // expected-error @+1 {{requires rank of block_shape = 1; got 2}} + %0 = 
"tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2x2xi64>, tensor<2x2xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2xi64>) -> tensor { + // expected-error @+1 {{requires rank of paddings = 2; got 1}} + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2x10xi64>) -> tensor { + // expected-error @+1 {{requires paddings.shape[1] to be 2; got 10}} + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x10xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<4xi64>, %paddings: tensor<2x2xi64>) -> tensor { + // expected-error @+1 {{requires block_shape.shape[0] must equal paddings.shape[0]}} + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<4xi64>, tensor<2x2xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5xf32>, %block_shape: tensor<2xi64>, %paddings: tensor<2x2xi64>) -> tensor { + // expected-error @+1 {{requires rank of input >= 1 + rank of block}} + %0 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor + return %0 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %paddings: tensor<2x2xi64>) -> tensor { + %block_shape = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> + // expected-error @+1 {{requires all values of block_shape to be >= 1; failed for dimension 1}} + %1 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor + return %1 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>, %block_shape: tensor<2xi64>) -> tensor { + %paddings = "tf.Const"() {value = dense<[[1, 0], [-1, 0]]> : tensor<2x2xi64>} : () -> tensor<2x2xi64> + // expected-error @+1 {{requires all values of paddings to be >= 0; failed for dimension 1}} + %1 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor + return %1 : tensor +} + +// ----- + +// Test invalid tf.SpaceToBatchND +func @testSpaceToBatchND(%input: tensor<3x5x7x10xf32>) -> tensor<36x2x3x10xf32> { + %block_shape = "tf.Const"() {value = dense<[4, 3]> : tensor<2xi64>} : () -> tensor<2xi64> + %paddings = "tf.Const"() {value = dense<[[1, 2], [1, 2]]> : tensor<2x2xi64>} : () -> tensor<2x2xi64> + // expected-error @+1 {{requires block_shape[i] divides input_shape[i + 1] + paddings[i, 0] + paddings[i, 1]; failed for i=1}} + %1 = "tf.SpaceToBatchND"(%input, %block_shape, %paddings) : (tensor<3x5x7x10xf32>, tensor<2xi64>, tensor<2x2xi64>) -> tensor<36x2x3x10xf32> + return %1 : tensor<36x2x3x10xf32> +} + +// ----- + +//===--------------------------------------------------------------------===// +// tf.SparseSoftmaxCrossEntropyWithLogits 
+//===--------------------------------------------------------------------===// + // Test valid tf.SparseSoftmaxCrossEntropyWithLogits // CHECK-LABEL: func @testSparseSoftmaxCrossEntropyWithLogits func @testSparseSoftmaxCrossEntropyWithLogits(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>) -> (tensor<3xf32>, tensor<2x3xf32>) { @@ -1527,7 +1710,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xi32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{operand type tensor<*xf32> is incompatible with result type}} + // expected-error @+1 {{'tf.While' op input type tensor<*xf32> is incompatible with result type tensor<*xi32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1545,7 +1728,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{operand type tensor<*xf32> is incompatible with cond function input type}} + // expected-error @+1 {{'tf.While' op input type tensor<*xf32> is incompatible with condition input type tensor<*xi32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1563,7 +1746,7 @@ func @testWhileBody(tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires the number of operands to be equal to the number of body function inputs. Found 1 and 2, respectively}} + // expected-error @+1 {{'tf.While' op inputs (size = 1) should have the same number of values as body inputs (size = 2)}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1581,7 +1764,7 @@ func @testWhileBody(tensor<*xf32>) -> (tensor<*xi32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{body function result type tensor<*xi32> is incompatible with result type}} + // expected-error @+1 {{'tf.While' op body result type tensor<*xi32> is incompatible with result type tensor<*xf32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1599,7 +1782,7 @@ func @testWhileBody(tensor<4xf32>) -> (tensor<*xf32>) // Test invalid 'While' operation func @testWhileResult(tensor<*xf32>) -> (tensor<*xf32>) { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{cond function input type tensor<3xf32> is incompatible with body function input type}} + // expected-error @+1 {{'tf.While' op condition input type tensor<3xf32> is incompatible with body input type tensor<4xf32> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1618,7 +1801,7 @@ func @testWhileBody(tensor<*x!tf.resource>>) -> (tensor>>) -> (tensor>>) { ^bb0(%arg0: tensor<*x!tf.resource>>): - // expected-error @+1 {{operand type tensor<*x!tf.resource>> is incompatible with result type}} + // expected-error @+1 {{'tf.While' op input type tensor<*x!tf.resource>> is incompatible with result type tensor>> at index 0}} %1 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, @@ -1714,48 +1897,71 @@ func @testValidWhileRegionNoInputs() -> () { } // ----- +// Invalid while tests. There are 5 sets of type matching that is required +// I = input, O = output, BI, BO = body input/output, CI = cond input. 
+// [I, O], [I, CI], [I, BI], [BO, BI], [BO, O]. +// Each check can fail due to number or type mismatch. However, these +// conditions are not all independent. So we just check I->{CI, BI}, O->BO, and +// in addition I->O. BO->BI mismatch cannot be independently created without +// breaking one of these mismatches. That gives us 4x2 tests. In addition +// condition result needs to be tensor, for which we have 3 +// additional validation tests. All these tests are based on the following +// valid while -func @testInvalidWhileRegionMismatchCondInputCount(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op condition should have same number of inputs (1) as tf.WhileRegion but has 0 inputs}} - %0 = "tf.WhileRegion"(%arg) ( - { - // ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - "tf.Yield"(%arg) : (tensor) -> () - } - ) : (tensor) -> (tensor) +func @testInvalidTestValidBase(%arg0 : tensor) -> (tensor) { + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0 : tensor +} +func @testInvalidWhileRegion_I_CI_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op inputs (size = 1) should have the same number of values as condition inputs (size = 0)}} + %0 = "tf.WhileRegion"(%arg0) ( + { + //^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) return %0 : tensor } // ----- -func @testInvalidWhileRegionMismatchCondInputType(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op condition input type tensor is incompatible with tf.WhileRegion input type tensor at index 0}} - %0 = "tf.WhileRegion"(%arg) ( - { - ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - "tf.Yield"(%barg) : (tensor) -> () - } - ) : (tensor) -> (tensor) - +func @testInvalidWhileRegion_I_CI_TypeMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op input type tensor is incompatible with condition input type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) return %0 : tensor } // ----- -func @testInvalidWhileRegionMismatchBodyInputCount(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op body should have same number of inputs (1) as tf.WhileRegion but has 2 inputs}} - %0 = "tf.WhileRegion"(%arg) ( +func @testInvalidWhileRegion_I_BI_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op inputs (size = 1) should have the same number of values as body inputs (size = 2)}} + %0 = "tf.WhileRegion"(%arg0) ( { ^bb0(%carg: tensor): %true = constant dense<1> : tensor @@ -1772,9 +1978,9 @@ func @testInvalidWhileRegionMismatchBodyInputCount(%arg : tensor) -> (tenso // ----- -func @testInvalidWhileRegionMismatchBodyInputType(%arg : tensor) -> (tensor) { - // expected-error @+1 {{body input type tensor is incompatible with 
tf.WhileRegion input type tensor at index 0}} - %0 = "tf.WhileRegion"(%arg) ( +func @testInvalidWhileRegion_I_BI_TypeMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op input type tensor is incompatible with body input type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( { ^bb0(%carg: tensor): %true = constant dense<1> : tensor @@ -1792,6 +1998,77 @@ func @testInvalidWhileRegionMismatchBodyInputType(%arg : tensor) -> (tensor // ----- +func @testInvalidWhileRegion_O_BO_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op body results (size = 2) should have the same number of values as results (size = 1)}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg, %barg) : (tensor, tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0#0 : tensor +} + +// ----- + +func @testInvalidWhileRegionMismatch_O_BO_TypeMismatch(%arg0 : tensor, %arg1: tensor) -> (tensor) { + // expected-error @+1 {{'tf.WhileRegion' op body result type tensor is incompatible with result type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%arg1) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0 : tensor +} + +// ----- + +func @testInvalidWhileRegion_I_O_CountMismatch(%arg0 : tensor) -> (tensor) { + // expected-error@+1 {{'tf.WhileRegion' op inputs (size = 1) should have the same number of values as results (size = 2)}} + %0:2 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%barg, %barg) : (tensor, tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor, tensor) + return %0#0 : tensor +} + +// ----- + +func @testInvalidWhileRegion_I_O_TypeMismatch(%arg0: tensor, %arg1 : tensor) -> (tensor) { + // expected-error@+1 {{'tf.WhileRegion' op input type tensor is incompatible with result type tensor at index 0}} + %0 = "tf.WhileRegion"(%arg0) ( + { + ^bb0(%carg: tensor): + %false = constant dense : tensor + "tf.Yield"(%false) : (tensor) -> () + }, + { + ^bb0(%barg: tensor): + "tf.Yield"(%arg1) : (tensor) -> () + } + ) { is_stateless = true } : (tensor) -> (tensor) + return %0 : tensor +} +// ----- + func @testInvalidWhileRegionConditionOutputCount2(%arg : tensor) -> (tensor) { // expected-error @+1 {{'tf.WhileRegion' op condition should have a single tensor result}} %0 = "tf.WhileRegion"(%arg) ( @@ -1845,45 +2122,6 @@ func @testInvalidWhileRegionConditionOutputType(%arg : tensor) -> (tensor } -// ----- - -func @testInvalidWhileRegionMismatchBodyOutputCount(%arg : tensor) -> (tensor) { - // expected-error @+1 {{'tf.WhileRegion' op body should have same number (1) of results as tf.WhileRegion but has 2 results}} - %0 = "tf.WhileRegion"(%arg) ( - { - ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - %false = constant dense<1> : tensor - "tf.Yield"(%barg, %false) : (tensor, tensor) -> () - } - ) : (tensor) -> (tensor) - - return %0 : tensor -} - -// ----- - -func @testInvalidWhileRegionMismatchBodyOutputType(%arg : tensor) -> (tensor) { - // expected-error @+1 {{body result type tensor is incompatible with 
tf.WhileRegion result type tensor at index 0}} - %0 = "tf.WhileRegion"(%arg) ( - { - ^bb0(%carg: tensor): - %true = constant dense<1> : tensor - "tf.Yield"(%true) : (tensor) -> () - }, - { - ^bb0(%barg: tensor): - %c = "tf.Cast"(%barg) : (tensor) -> tensor - "tf.Yield"(%c) : (tensor) -> () - } - ) : (tensor) -> (tensor) - - return %0 : tensor -} // ----- @@ -1898,7 +2136,7 @@ func @testValidShape(tensor<1x32x32x16xf32>, tensor<*xf32>) -> (tensor<4xi32>, t // ----- func @testShapeWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf32> { - // expected-error @+1 {{result #0 must be tensor of 32/64-bit signless integer values}} + // expected-error @+1 {{result #0 must be tensor of 32/64-bit signed integer values}} %0 = "tf.Shape"(%arg0) : (tensor<1x32x32x16xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -1942,7 +2180,7 @@ func @testValidShapeN(%arg0 : tensor<1x32x32x16xf32>, %arg1 : tensor<*xf32>) -> // ----- func @testShapeNWrongResultElemType(%arg0: tensor<1x32x32x16xf32>) -> tensor<4xf32> { - // expected-error @+1 {{result #1 must be tensor of 32/64-bit signless integer values}} + // expected-error @+1 {{result #1 must be tensor of 32/64-bit signed integer values}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<1x32x32x16xf32>, tensor<1x32x32x16xf32>) -> (tensor<4xi32>, tensor<4xf32>) return %0#1 : tensor<4xf32> } @@ -2003,7 +2241,7 @@ func @testVariableShapeMultipleSubtypes(%arg0: tensor<*x!tf.resource>>) -> tensor { - // expected-error @+1 {{result #0 must be tensor of 32/64-bit signless integer values}} + // expected-error @+1 {{result #0 must be tensor of 32/64-bit signed integer values}} %0 = "tf.VariableShape"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> return %0 : tensor<4xf32> } @@ -2139,7 +2377,7 @@ func @testTranspose(tensor<2x3x4xf32>) -> tensor<3x2x4xf32> { // Test invalid tf.Less func @testLess(tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> { ^bb0(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>): - // expected-error @+1 {{op result #0 must be tensor of 1-bit signless integer values}} + // expected-error @+1 {{op result #0 must be tensor of bool values}} %0 = "tf.Less"(%arg0, %arg1) : (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> return %0 : tensor<4xi32> } @@ -2156,7 +2394,7 @@ func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor // tf.ConcatV2 with wrong 'axis' element type func @testConcatV2(%arg: tensor<8x16xf32>, %axis: tensor) -> tensor { - // expected-error @+1 {{operand #2 must be tensor of 32/64-bit signless integer values}} + // expected-error @+1 {{operand #2 must be tensor of 32/64-bit signed integer values}} %0 = "tf.ConcatV2"(%arg, %arg, %axis) : (tensor<8x16xf32>, tensor<8x16xf32>, tensor) -> tensor return %0 : tensor } @@ -2189,7 +2427,7 @@ func @testAll64(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { // ----- func @testAllFloat(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { - // expected-error @+1 {{'tf.All' op operand #1 must be tensor of 32/64-bit signless integer values}} + // expected-error @+1 {{'tf.All' op operand #1 must be tensor of 32/64-bit signed integer values}} %0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi1>, tensor) -> tensor return %0 : tensor } @@ -2197,7 +2435,7 @@ func @testAllFloat(%arg0: tensor<2x2xi1>, %arg1: tensor) -> tensor { // ----- func @testAllI32(%arg0: tensor<2x2xi32>, %arg1: tensor) -> tensor { - // expected-error @+1 {{'tf.All' op operand #0 must be tensor of 1-bit signless integer values}} + // expected-error @+1 {{'tf.All' op operand #0 must be tensor of bool values}} 
%0 = "tf.All"(%arg0, %arg1) {keep_dims = false} : (tensor<2x2xi32>, tensor) -> tensor return %0 : tensor } @@ -2381,6 +2619,25 @@ func @testSlice_unknown_begin_in_bounds(%arg0: tensor<4xi32>, %begins: tensor<1x // ----- +func @testSlice_unequal_output_input_rank(%arg0: tensor<4xi32>, %begins: tensor<1xi64>) -> tensor { + %sizes = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) + // expected-error @+1 {{requires output to have the same rank as input, but got input rank 1 and output rank 0}} + %0 = "tf.Slice"(%arg0, %begins, %sizes) : (tensor<4xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor + return %0 : tensor +} + +// ----- + +func @testSlice_wrong_output_size(%arg0: tensor<4xi32>) -> tensor<1xi32> { + %begins = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) + %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi64>} : () -> (tensor<1xi64>) + // expected-error @+1 {{requires output size to have the same size of slice, got slice size 2 and output size 1}} + %0 = "tf.Slice"(%arg0, %begins, %sizes) : (tensor<4xi32>, tensor<1xi64>, tensor<1xi64>) -> tensor<1xi32> + return %0 : tensor<1xi32> +} + +// ----- + // Valid StridedSlice operation. func @testStridedSlice(%input: tensor<4x8xf32>, %begin: tensor<2xi64>, %end: tensor<2xi64>, %strides: tensor<2xi64>) -> tensor { %0 = "tf.StridedSlice"(%input, %begin, %end, %strides) : (tensor<4x8xf32>, tensor<2xi64>, tensor<2xi64>, tensor<2xi64>) -> tensor @@ -2660,6 +2917,13 @@ func @testSplitV2(%input: tensor<4x4xf32>) { // ----- +func @testSplitVDynamic(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor, tensor) { + %0:2 = "tf.SplitV"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> (tensor, tensor) + return %0#0, %0#1 : tensor, tensor +} + +// ----- + //===--------------------------------------------------------------------===// // tf.All //===--------------------------------------------------------------------===// @@ -3165,6 +3429,125 @@ func @testBatchMatMulV2(%lhs: tensor<10x10xf32>, %rhs: tensor) { // ----- +// CHECK-LABEL: func @testBatchMatMulV2NoBatchDimension +func @testBatchMatMulV2NoBatchDimension(%lhs: tensor<5x10xf32>, %rhs: tensor<10x10xf32>) -> (tensor<5x10xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<5x10xf32>, tensor<10x10xf32>) -> tensor<5x10xf32> + return %0 : tensor<5x10xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2ValidBroadcastingBatchDimension +func @testBatchMatMulV2ValidBroadcastingBatchDimension(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10xf32>) -> (tensor<10x2x5x10xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10xf32>) -> tensor<10x2x5x10xf32> + return %0 : tensor<10x2x5x10xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2ValidMultiBatchDimension +func @testBatchMatMulV2ValidMultiBatchDimension(%lhs: tensor<4x5x1x3x2xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x2x5xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<4x5x1x3x2xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x2x5xf32> + return %0 : tensor<4x5x1x2x5xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidBroadcastingBatchDimensionWithHigherXRank(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10x10xf32>) { + // expected-error @+1 {{found incompatible broadcast batch dimensions for lhs shape 'tensor<10x2x5x10xf32>' and rhs shape 'tensor<10x10x10xf32>'}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func 
@testBatchMatMulV2InvalidBroadcastingBatchDimensionWithSameRank(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10x10x10xf32>) { + // expected-error @+1 {{found incompatible broadcast batch dimensions for lhs shape 'tensor<10x2x5x10xf32>' and rhs shape 'tensor<10x10x10x10xf32>'}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidBroadcastingBatchDimensionWithHigherYRank(%lhs: tensor<2x5x10xf32>, %rhs: tensor<10x10x10x10xf32>) { + // expected-error @+1 {{found incompatible broadcast batch dimensions for lhs shape 'tensor<2x5x10xf32>' and rhs shape 'tensor<10x10x10x10xf32>'}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<2x5x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputBatchDimension(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<2x10x10xf32>) { + // expected-error @+1 {{has mismatching input batch dimension 2 and output batch dimension 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<2x10x10xf32>) -> tensor<10x3x10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputRank(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x1x10x10xf32>) { + // expected-error @+1 {{found invalid output rank, expected 4 but got 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x1x10x10xf32>) -> tensor<10x5x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputRowDim(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{found invalid output dimension on row, expected 5 but got 10}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10xf32>) -> tensor<10x2x10x10xf32> +} + +// ----- + +func @testBatchMatMulV2AdjXInvalidOutputRowDim(%lhs: tensor<10x2x10x5xf32>, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{found invalid output dimension on row, expected 5 but got 10}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<10x2x10x5xf32>, tensor<10x10xf32>) -> tensor<10x2x10x10xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidOutputColDim(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<10x10xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 10 but got 5}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<10x2x5x10xf32>, tensor<10x10xf32>) -> tensor<10x2x5x5xf32> +} + +// ----- + +func @testBatchMatMulV2AdjYInvalidOutputColDim(%lhs: tensor<10x2x5x10xf32>, %rhs: tensor<4x10xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 4 but got 10}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_y = true } : (tensor<10x2x5x10xf32>, tensor<4x10xf32>) -> tensor<10x2x5x10xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2PartiallyKnownInputBatchDim +func @testBatchMatMulV2PartiallyKnownInputBatchDim(%lhs: tensor<4x5x?x3x2xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x?x2x5xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<4x5x?x3x2xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x?x2x5xf32> + return %0 : tensor<4x5x?x2x5xf32> +} + +// ----- + +// CHECK-LABEL: func @testBatchMatMulV2PartiallyKnownMatmulDim +func @testBatchMatMulV2PartiallyKnownMatmulDim(%lhs: tensor<4x5x1x?x3xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x?x5xf32>) { + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<4x5x1x?x3xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x?x5xf32> + return %0 : tensor<4x5x1x?x5xf32> +} + +// ----- + +func @testBatchMatMulV2InvalidPartiallyKnownMatmulDim(%lhs: 
tensor<4x5x1x?x3xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x?x3xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 5 but got 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) : (tensor<4x5x1x?x3xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x?x3xf32> + return %0 : tensor<4x5x1x?x3xf32> +} + +// ----- + +func @testBatchMatMulV2AdjXInvalidPartiallyKnownMatmulDim(%lhs: tensor<4x5x1x3x?xf32>, %rhs: tensor<1x1x3x5xf32>) -> (tensor<4x5x1x?x3xf32>) { + // expected-error @+1 {{found invalid output dimension on col, expected 5 but got 3}} + %0 = "tf.BatchMatMulV2"(%lhs, %rhs) { adj_x = true } : (tensor<4x5x1x3x?xf32>, tensor<1x1x3x5xf32>) -> tensor<4x5x1x?x3xf32> + return %0 : tensor<4x5x1x?x3xf32> +} + +// ----- + func @testDataFormatVecPermuteInvalid1dInput(%x: tensor<5xi32>) { // expected-error @+1 {{requires 1D input of size 4}} %0 = "tf.DataFormatVecPermute"(%x): (tensor<5xi32>) -> tensor<5xi32> @@ -3357,7 +3740,7 @@ func @branch0(tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> func @branch1(tensor<2xf32>) -> tensor<2xf32> func @testCaseMismatchedNumOperands(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{expects all branches to have 1 input(s), but branch #0 has 2 input(s)}} + // expected-error @+1 {{'tf.Case' op branch #0 inputs (size = 2) should have the same number of values as inputs (size = 1)}} %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -3368,7 +3751,7 @@ func @branch0(tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>) func @branch1(tensor<2xf32>) -> tensor<2xf32> func @testCaseMismatchedNumResults(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{expects all branches to have 1 result(s), but branch #0 has 2 result(s)}} + // expected-error @+1 {{'tf.Case' op branch #0 results (size = 2) should have the same number of values as results (size = 1)}} %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -3379,7 +3762,7 @@ func @branch0(tensor<*xf16>) -> tensor<*xf32> func @branch1(tensor<*xf32>) -> tensor<*xf32> func @testCaseOperandNotCastCompatible(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - // expected-error @+1 {{expects operand type 'tensor<2xf32>' to be cast compatible with branch #0 input type 'tensor<*xf16>' at index 0}} + // expected-error @+1 {{'tf.Case' op branch #0 input type tensor<*xf16> is incompatible with input type tensor<2xf32> at index 0}} %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -3401,7 +3784,7 @@ func @branch0(tensor<*xf32>) -> tensor<*xf32> func @branch1(tensor<*xf32>) -> tensor<3xf32> func @testCaseResultNotCastCompatible(%arg0: tensor, %arg1: tensor<*xf32>) -> tensor<2xf32> { - // expected-error @+1 {{expects result type 'tensor<2xf32>' to be cast compatible with branch #1 result type 'tensor<3xf32>' at index 0}} + // expected-error @+1 {{'tf.Case' op branch #1 result type tensor<3xf32> is incompatible with result type tensor<2xf32> at index 0}} %0 = "tf.Case"(%arg0, %arg1) {branches = [@branch0, @branch1], is_stateless = false} : (tensor, tensor<*xf32>) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -3427,7 +3810,7 @@ func @testCaseRegionBadBranchIndicesShape(%arg0: tensor<8xi32>) { // ----- func 
@testCaseRegionMismatchedNumResults(%arg0: tensor) { - // expected-error @+1 {{region #0 should have same number (1) of results as tf.CaseRegion but has 0 results}} + // expected-error @+1 {{'tf.CaseRegion' op branch #0 results (size = 0) should have the same number of values as results (size = 1)}} %1 = "tf.CaseRegion"(%arg0) ( { "tf.Yield"() : () -> () }) {is_stateless = false} : (tensor) -> tensor @@ -3437,7 +3820,7 @@ func @testCaseRegionMismatchedNumResults(%arg0: tensor) { // ----- func @testCaseRegionMismatchedResultTypes(%arg0: tensor, %arg1: tensor) { - // expected-error @+1 {{region #0 result type tensor is incompatible with tf.CaseRegion result type tensor at index 0}} + // expected-error @+1 {{'tf.CaseRegion' op branch #0 result type tensor is incompatible with result type tensor at index 0}} %1 = "tf.CaseRegion"(%arg0) ( { "tf.Yield"(%arg1) : (tensor) -> () }) {is_stateless = false} : (tensor) -> tensor @@ -3468,3 +3851,92 @@ func @testCumprod(%arg: tensor<8x16xf32>) -> tensor<8x16xf32> { %0 = "tf.Cumprod"(%arg, %axis) : (tensor<8x16xf32>, tensor) -> tensor<8x16xf32> return %0 : tensor<8x16xf32> } + +// ----- + +func @testTile(%arg0: tensor<2x3x?xf32>) { + %cst = constant dense <[2, 3, 4]> : tensor<3xi32> + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3x?xf32>, tensor<3xi32>) -> tensor<4x9x?xf32> + return +} + +// ----- + +func @testTileMultipleNotRank1(%arg0: tensor<2x3xf32>, %arg1: tensor<1x1xi32>) { + // expected-error @+1 {{expected multiples to be rank 1, got rank = 2}} + %0 = "tf.Tile"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<1x1xi32>) -> tensor<2x3xf32> + return +} + +// ----- + +func @testTileInputRankNotEqualToMultiplesSize(%arg0: tensor<2x3xf32>, %arg1: tensor<3xi32>) { + // expected-error @+1 {{expected size of multiples equal to rank of input, got multiples of size 3, and input of rank 2}} + %0 = "tf.Tile"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<3xi32>) -> tensor<2x3xf32> + return +} + +// ----- + +func @testTileInputRankNotEqualToOutputRank(%arg0: tensor<2x3xf32>, %arg1: tensor<2xi32>) { + // expected-error @+1 {{expected rank of input to equal to rank of output, got input of rank 2, and output of rank 3}} + %0 = "tf.Tile"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3x1xf32> + return +} + +// ----- + +func @testTileNegativeMultiples(%arg0: tensor<2x3xf32>) { + %cst = constant dense <[-1, 1]> : tensor<2xi32> + // expected-error @+1 {{expected multiples to be non-negative, got multiples[0] = -1}} + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<2x3xf32> + return +} + +// ----- + +func @testTileInvalidOutputShape(%arg0: tensor<2x3xf32>) { + %cst = constant dense <[2, 3]> : tensor<2xi32> + // expected-error @+1 {{requires input.shape[1] (3) * 3 to be equal to output.shape[1] (6)}} + %0 = "tf.Tile"(%arg0, %cst) : (tensor<2x3xf32>, tensor<2xi32>) -> tensor<4x6xf32> + return +} + +// ----- + +// Test reference variable support for some ops (no errors expected) + +// CHECK-LABEL: @testMaximumWithRef +func @testMaximumWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.Maximum + %0 = "tf.Maximum"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testAddV2WithRef +func @testAddV2WithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.AddV2 + %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testRealDivWithRef +func @testRealDivWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.RealDivOp + %0 = 
"tf.RealDivOp"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testDivNoNanWithRef +func @testDivNoNanWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.DivNoNanOp + %0 = "tf.DivNoNanOp"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} + +// CHECK-LABEL: @testAddWithRef +func @testAddWithRef(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: tf.Add + %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir index 745cf72f959..f6f14c5be61 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir @@ -40,6 +40,19 @@ func @empty_replicate() { // CHECK-NEXT: tf_device.return } +// CHECK-LABEL: func @no_operand_replicate +func @no_operand_replicate() { + tf_device.replicate {n = 2 : i32} { + %0 = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %1 = "tf.Const"() { value = dense<1> : tensor } : () -> tensor + tf_device.return %0, %1 : tensor, tensor + } + return + // CHECK: tf_device.replicate + // CHECK-SAME: n = 2 + // CHECK: tf_device.return +} + // CHECK-LABEL: func @replicate_with_multiple_operands func @replicate_with_multiple_operands() { %0 = "tf.opA"() : () -> tensor<*xi1> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir index 1e537880620..23a8e904ad9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_executor_ops.mlir @@ -433,7 +433,7 @@ func @nextiteration(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<*xf32> { %1:3 = tf_executor.NextIteration.Source : tensor<*xf32> tf_executor.NextIteration.Sink[%1#1] %1#0 : tensor<*xf32> // CHECK: tf_executor.NextIteration.Source : tensor<*xf32> -// CHECK: tf_executor.NextIteration.Sink [%{{.*}}] %{{.*}} : tensor<*xf32> +// CHECK: tf_executor.NextIteration.Sink[%{{.*}}] %{{.*}} : tensor<*xf32> tf_executor.fetch %1#0 : tensor<*xf32> } return %0 : tensor<*xf32> @@ -445,7 +445,7 @@ func @nextiteration_with_attributes(%arg0: tensor<*xf32>, %arg1: i1) -> tensor<* %1:3 = tf_executor.NextIteration.Source : tensor<*xf32> {attr3 = 32 : i64, tf_executor.attr_fetch = "some_value"} tf_executor.NextIteration.Sink[%1#1] %1#0 : tensor<*xf32> {attr4 = 42 : i64, tf_executor.attr_push = "other_value"} // CHECK: tf_executor.NextIteration.Source : tensor<*xf32> {attr3 = 32 : i64, tf_executor.attr_fetch = "some_value"} -// CHECK: tf_executor.NextIteration.Sink [%{{.*}}] %{{.*}} : tensor<*xf32> {attr4 = 42 : i64, tf_executor.attr_push = "other_value"} +// CHECK: tf_executor.NextIteration.Sink[%{{.*}}] %{{.*}} : tensor<*xf32> {attr4 = 42 : i64, tf_executor.attr_push = "other_value"} tf_executor.fetch %1#0 : tensor<*xf32> } return %0 : tensor<*xf32> @@ -457,9 +457,9 @@ func @nextiteration_control(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor<* %1:3 = tf_executor.Switch %arg0, %arg1 : tensor<*xf32> %2:2 = tf_executor.Enter %arg0, %1#2, %1#2 frame "some/frame" : tensor<*xf32> %3:3 = tf_executor.NextIteration.Source : tensor<*xf32> - tf_executor.NextIteration.Sink [%3#1] %3#0, %1#2 : tensor<*xf32> + tf_executor.NextIteration.Sink[%3#1] %3#0, %1#2 : tensor<*xf32> // CHECK: tf_executor.NextIteration.Source : tensor<*xf32> -// CHECK: tf_executor.NextIteration.Sink [%{{.*}}] %{{.*}}, %{{.*}} : 
tensor<*xf32> +// CHECK: tf_executor.NextIteration.Sink[%{{.*}}] %{{.*}}, %{{.*}} : tensor<*xf32> tf_executor.fetch %3#0 : tensor<*xf32> } return %0 : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD index 318f0422231..8ba18215ab5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model:build_defs.bzl", "tf_saved_model_test") package( diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_output.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_output.py new file mode 100644 index 00000000000..a6d78d4693b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_output.py @@ -0,0 +1,125 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/structured_output | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v2 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common + + +class TestModule(tf.Module): + # The fNNNN name prefixes in this file are such that the sorted order of the + # functions in the resulting MLIR output match the order in the source file, + # allowing us to conveniently co-locate the CHECK's with the code they are + # checking. + # + # Note: CHECK-DAG doesn't work with CHECK-SAME/CHECK-NEXT. + + # Check index paths for results. + # + # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = []}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0000_single_return"] + @tf.function(input_signature=[]) + def f0000_single_return(self): + return tf.constant(1.0, shape=[1]) + + # Check index paths for results with multiple return values. + # Note that semantically in Python, multiple return values are equivalent + # to returning a tuple/list. + # + # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0]}, + # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [1]}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0001_multiple_results_no_punctuation"] + @tf.function(input_signature=[]) + def f0001_multiple_results_no_punctuation(self): + return tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2]) + + # Check index paths for results written explicitly with parentheses. 
+ # This is semantically equivalent to the earlier test without parentheses, + # but this test serves as documentation of this behavior for the purposes + # of tf_saved_model users. + # + # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0]}, + # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [1]}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0002_multiple_results_parentheses"] + @tf.function(input_signature=[]) + def f0002_multiple_results_parentheses(self): + return (tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2])) + + # Check index paths for results written explicitly with brackets. + # This is semantically equivalent to the earlier test without parentheses, + # but this test serves as documentation of this behavior for the purposes + # of tf_saved_model users. + # + # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0]}, + # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [1]}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0003_multiple_results_brackets"] + @tf.function(input_signature=[]) + def f0003_multiple_results_brackets(self): + return [tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2])] + + # Check index paths for lists. + # + # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = [0, 0]}, + # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = [0, 1]}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0004_list_2_elements"] + @tf.function(input_signature=[]) + def f0004_list_2_elements(self): + return [[tf.constant(1.0, shape=[1]), tf.constant(1.0, shape=[2])]] + + # Check index paths for dicts. + # Keys are linearized in sorted order, matching `tf.nest.flatten`. + # More thorough testing of this is in structured_input.py. The underlying code + # path for linearization is shared, so no need to replicate that testing here. + # + # CHECK: func {{@[a-zA-Z_0-9]+}}() -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = ["x"]}, + # CHECK-SAME: tensor<2xf32> {tf_saved_model.index_path = ["y"]}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0005_dict_2_keys"] + @tf.function(input_signature=[]) + def f0005_dict_2_keys(self): + return { + 'x': tf.constant(1.0, shape=[1]), + 'y': tf.constant(1.0, shape=[2]), + } + + # Check index paths for outputs are correctly handled in the presence of + # multiple return statements. 
+ # + # CHECK: func {{@[a-zA-Z_0-9]+}}( + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]} + # CHECK-SAME: ) -> ( + # CHECK-SAME: tensor<1xf32> {tf_saved_model.index_path = ["x"]}) + # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0006_multiple_return_statements"] + @tf.function(input_signature=[tf.TensorSpec([], tf.float32)]) + def f0006_multiple_return_statements(self, x): + if x > 3.: + return {'x': tf.constant(1.0, shape=[1])} + else: + return {'x': tf.constant(1.0, shape=[1])} + + +if __name__ == '__main__': + common.do_test(TestModule) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir index 84b4f97d4eb..ea2ebc64a29 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_lift_variables.mlir @@ -59,3 +59,25 @@ module attributes {tf_saved_model.semantics, tf_saved_model.under_construction} // CHECK: %arg0: tensor>> {tf_saved_model.bound_input = @"dense/kernel"}, // CHECK: %arg1: tensor>> {tf_saved_model.bound_input = @"dense/bias"}) } + +// ----- + +module attributes {tf_saved_model.semantics, tf_saved_model.under_construction} { + + // Test case: Fix bound_inputs' types. + + func @serving_default(%arg0: tensor>> {tf.resource_name = "dense/kernel"}, %arg1: tensor>> {tf.resource_name = "dense/bias"}) -> (tensor<*xf32> {tf_saved_model.index_path = ["dense_2"]}) + attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "dense_2/Add:0"}, tf_saved_model.exported_names = ["serving_default"]} { + %0 = "tf.ReadVariableOp"(%arg0) {device = ""} : (tensor>>) -> tensor<*xf32> + %1 = "tf.ReadVariableOp"(%arg1) {device = ""} : (tensor>>) -> tensor<*xf32> + %2 = "tf.Add"(%0, %1) {device = ""} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %2 : tensor<*xf32> + } + // CHECK: "tf_saved_model.global_tensor"() + // CHECK: sym_name = "dense/kernel" + // CHECK: "tf_saved_model.global_tensor"() + // CHECK: sym_name = "dense/bias" + // CHECK: func @serving_default( + // CHECK: %arg0: tensor>> {tf_saved_model.bound_input = @"dense/kernel"}, + // CHECK: %arg1: tensor>> {tf_saved_model.bound_input = @"dense/bias"}) +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-cluster-cleanup-attributes.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-cluster-cleanup-attributes.mlir new file mode 100644 index 00000000000..6399d7d6fb0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-cluster-cleanup-attributes.mlir @@ -0,0 +1,24 @@ +// RUN: tf-opt %s -tf-tpu-cleanup-cluster-attributes | FileCheck %s + +func @test(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: "tf_device.cluster" + // CHECK-NOT: _tpu_replicate = + // CHECK-NOT: device = + %1 = "tf_device.cluster"() ( { + %2 = "tf.Add"(%arg1, %arg1) : (tensor, tensor) -> tensor + %3 = "tf.IfRegion"(%arg0) ({ + %4 = "tf.Mul" (%arg1, %2) {device = "y"}: (tensor, tensor) -> tensor + "tf.Yield"(%4) : (tensor) -> () + }, { + %5 = "tf.Div" (%arg1, %2) : (tensor, tensor) -> tensor + "tf.Yield"(%5) : (tensor) -> () + }) {is_stateless = true, _tpu_replicate = "x" } : (tensor) -> (tensor) + tf_device.return %3 : tensor + // CHECK: {_tpu_replicate = "x", cluster_attr = "cluster_attr", device = "y"} + }) {cluster_attr = "cluster_attr", _tpu_replicate = "x", device = "y"} : () -> tensor + // CHECK: "tf.Add" + // CHECK-SAME: 
{_tpu_replicate = "x", device = "y"} + %2 = "tf.Add"(%arg2, %1) {_tpu_replicate = "x", device = "y"} : (tensor, tensor) -> tensor + // CHECK: return + return %2 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir new file mode 100644 index 00000000000..a505a4e3269 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-resource-read-for-write.mlir @@ -0,0 +1,64 @@ +// RUN: tf-opt -tf-tpu-resource-read-for-write %s | FileCheck %s --dump-input=always + +// CHECK-LABEL: func @write_only_resource +// CHECK-SAME: ([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor<*x!tf.resource>>) +func @write_only_resource(%arg0: tensor, %arg1: tensor, %arg2: tensor<*x!tf.resource>>) { + // CHECK-NEXT: [[READ:%.*]] = "tf.ReadVariableOp"([[ARG2]]) + // CHECK-NEXT: [[CLUSTER:%.*]]:2 = "tf_device.cluster_func"([[ARG0]], [[ARG1]], [[READ]]) + // CHECK-SAME: _tpu_replicate = "write" + %0:2 = "tf_device.cluster_func"(%arg0, %arg1) {_tpu_replicate = "write", func = @write_func} : (tensor, tensor) -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG2]], [[CLUSTER]]#1) + "tf.AssignVariableOp"(%arg2, %0#1) : (tensor<*x!tf.resource>>, tensor) -> () + // CHECK-NEXT: return + return +} + +// CHECK-LABEL: func @write_func +// CHECK-SAME: ({{%.*}}: tensor, {{%.*}}: tensor, {{%.*}}: tensor) -> (tensor, tensor) +func @write_func(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + return %arg1, %arg0 : tensor, tensor +} + +// CHECK-LABEL: func @read_write_resource +func @read_write_resource(%arg0: tensor, %arg1: tensor, %arg2: tensor<*x!tf.resource>>) { + // CHECK-COUNT-1: tf.ReadVariableOp + %0 = "tf.ReadVariableOp"(%arg2) : (tensor<*x!tf.resource>>) -> tensor + %1:2 = "tf_device.cluster_func"(%arg0, %arg1, %0) {_tpu_replicate = "read_write", func = @read_write_func} : (tensor, tensor, tensor) -> (tensor, tensor) + "tf.AssignVariableOp"(%arg2, %1#1) : (tensor<*x!tf.resource>>, tensor) -> () + return +} + +// CHECK-LABEL: func @read_write_func +// CHECK-SAME: ({{%.*}}: tensor, {{%.*}}: tensor) -> (tensor, tensor) +func @read_write_func(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + return %arg1, %arg0 : tensor, tensor +} + +// CHECK-LABEL: func @multiple_write_resource +func @multiple_write_resource(%arg0: tensor, %arg1: tensor<*x!tf.resource>>) { + // CHECK-NOT: tf.ReadVariableOp + %0:2 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "multiple_write", func = @multiple_write_func} : (tensor) -> (tensor, tensor) + "tf.AssignVariableOp"(%arg1, %0#0) : (tensor<*x!tf.resource>>, tensor) -> () + "tf.AssignVariableOp"(%arg1, %0#1) : (tensor<*x!tf.resource>>, tensor) -> () + return +} + +// CHECK-LABEL: func @multiple_write_func +// CHECK-SAME: ({{%.*}}: tensor) -> (tensor, tensor) +func @multiple_write_func(%arg0: tensor) -> (tensor, tensor) { + return %arg0, %arg0 : tensor, tensor +} + +// CHECK-LABEL: func @multiple_result_user +func @multiple_result_user(%arg0: tensor, %arg1: tensor<*x!tf.resource>>) -> tensor { + // CHECK-NOT: tf.ReadVariableOp + %0 = "tf_device.cluster_func"(%arg0) {_tpu_replicate = "multiple_uses", func = @multiple_result_user_func} : (tensor) -> tensor + "tf.AssignVariableOp"(%arg1, %0) : (tensor<*x!tf.resource>>, tensor) -> () + return %0 : tensor +} + +// CHECK-LABEL: func @multiple_result_user_func +// CHECK-SAME: ({{%.*}}: tensor) -> tensor +func @multiple_result_user_func(%arg0: tensor) -> tensor { + return %arg0 : tensor +} diff 
--git a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir index 277e4a8415e..e87b83b0cdf 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu-variable-runtime-reformatting.mlir @@ -2,29 +2,81 @@ // Tests that the pass can correctly transform a training loop with 2 replicas. +!tf_res_f32 = type tensor<*x!tf.resource>> +!tf_res_md_f32 = type tensor<*x!tf.resource>> // Multi-dim f32 + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { // CHECK-LABEL: func @main - func @main(%arg0: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) { + // CHECK-SAME: %[[ARG0:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + // CHECK-SAME: %[[ARG1:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + // CHECK-SAME: %[[ARG2:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + // CHECK-SAME: %[[ARG3:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) + func @main(%arg0: !tf_res_f32 {tf.device = "/device:TPU:0"}, + %arg1: !tf_res_f32 {tf.device = "/device:TPU:1"}, + %arg2: !tf_res_md_f32 {tf.device = "/device:TPU:0"}, + %arg3: !tf_res_md_f32 {tf.device = "/device:TPU:1"}) { %0 = "tf.Const"() {value = dense<100> : tensor} : () -> tensor // CHECK: %[[STATE0:.*]] = "tf.VarHandleOp"() // CHECK-SAME: device = "/device:TPU:0" // CHECK: %[[STATE1:.*]] = "tf.VarHandleOp"() // CHECK-SAME: device = "/device:TPU:1" - // CHECK: %[[WHILE:.*]]:7 = "tf.While"( - // CHECK-SAME: %[[STATE0]], %[[STATE1]]) - %1:5 = "tf.While"(%0, %arg0, %arg1, %arg2, %arg3) - {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE"], body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + // CHECK: %[[WHILE:.*]] = "tf.WhileRegion"( + %1 = "tf.WhileRegion"(%0) ( { + // Condition region + // CHECK: ^bb + // CHECK: "tf.Yield" + ^bb0(%carg0: tensor): + %c0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %c1 = "tf.GreaterEqual"(%carg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + "tf.Yield"(%c1) : (tensor) -> () + }, { + // Body region + // CHECK: ^bb0 + ^bb0(%barg0: tensor): + %b0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %b1 = "tf.AddV2"(%barg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + // CHECK: %[[COMPILE:.*]]:2 = "tf_device.launch" + // CHECK-NEXT: "tf._TPUCompileMlir"() + %compile:2 = "tf_device.launch"() ( { + %b2:2 = "tf._TPUCompileMlir"() { + NumDynamicShapes = 0 : i64, + // The metadata encodes 2 parameter and two return values. 
+ metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %b2#0, %b2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () + tf_device.return + }) {device = "/device:CPU:0"} : () -> () + // CHECK: tf_device.replicate + // CHECK-SAME: [%[[ARG0]], %[[ARG1]]] as %[[R0:.*]]: tensor<*x!tf.resource>>, + // CHECK-SAME: [%[[ARG2]], %[[ARG3]]] as %[[R1:.*]]: tensor<*x!tf.resource>>, + // CHECK-SAME: [%[[STATE0]], %[[STATE1]]] as %[[R_STATE:.*]]: tensor>> + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"] + %rep:2 = tf_device.replicate([%arg0, %arg1] as %arg30: tensor<*x!tf.resource>>, + [%arg2, %arg3] as %arg31: tensor<*x!tf.resource>>) + {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { + // CHECK: %[[ID:.*]] = "tf.Identity"(%[[R0]]) + %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" + // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) + {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return %ret : tensor + } + // CHECK: "tf.Yield" + "tf.Yield"(%b1) : (tensor) -> () + }) {device = "", is_stateless = false} : (tensor) -> (tensor) // CHECK: %[[DEFAULT:.*]] = "tf.Const"() // CHECK: tf_device.replicate // CHECK-SAME: as %[[V0:.*]]: tensor<*x!tf.resource>>, @@ -37,165 +89,72 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" return } - // CHECK-LABEL: func @while_body_7560 - func @while_body_7560(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { - // CHECK-SAME: (%[[ITER:.*]]: tensor, - // CHECK-SAME: %[[BODY_ARG1:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - // CHECK-SAME: %[[BODY_ARG2:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - // CHECK-SAME: %[[BODY_ARG3:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - // CHECK-SAME: %[[BODY_ARG4:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - // CHECK-SAME: %[[STATE_ARG0:.*]]: tensor>> {tf.device = "/device:TPU:0"}, - // CHECK-SAME: %[[STATE_ARG1:.*]]: tensor>> {tf.device = "/device:TPU:1"}) - %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %1 = "tf.AddV2"(%arg0, %0) 
{T = i32, device = ""} : (tensor, tensor) -> tensor - // CHECK: %[[COMPILE:.*]]:2 = "tf_device.launch" - // CHECK-NEXT: "tf._TPUCompileMlir"() - %compile:2 = "tf_device.launch"() ( { - %2:2 = "tf._TPUCompileMlir"() { - NumDynamicShapes = 0 : i64, - // The metadata encodes 2 parameter and two return values. - metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) - tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> - }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) - "tf_device.launch"() ( { - "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () - tf_device.return - }) {device = "/device:CPU:0"} : () -> () - // CHECK: tf_device.replicate - // CHECK-SAME: [%[[BODY_ARG1]], %[[BODY_ARG2]]] as %[[R0:.*]]: tensor<*x!tf.resource>>, - // CHECK-SAME: [%[[BODY_ARG3]], %[[BODY_ARG4]]] as %[[R1:.*]]: tensor<*x!tf.resource>>, - // CHECK-SAME: [%[[STATE_ARG0]], %[[STATE_ARG1]]] as %[[R_STATE:.*]]: tensor>> - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"] - %rep:2 = tf_device.replicate([%arg1, %arg2] as %arg30: tensor<*x!tf.resource>>, - [%arg3, %arg4] as %arg31: tensor<*x!tf.resource>>) - {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { - // CHECK: %[[ID:.*]] = "tf.Identity"(%[[R0]]) - %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> - // CHECK: "tf_device.launch" - // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]]) - // CHECK-NEXT: tf_device.return - // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" - // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1) - "tf_device.launch"() ( { - "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) - {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () - tf_device.return - }) {device = "TPU_REPLICATED_CORE_0"} : () -> () - %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - tf_device.return %ret : tensor - } - return %1, %arg1, %arg2, %arg3, %arg4 : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>> - } - // CHECK-LABEL: func @while_cond_7550 - func @while_cond_7550(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) - -> tensor { - %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %1 = "tf.GreaterEqual"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - return %1 : tensor - } } + // ----- // Tests that the pass does not format variables with other uses. 
+!tf_res_f32 = type tensor<*x!tf.resource>> +!tf_res_md_f32 = type tensor<*x!tf.resource>> // Multi-dim f32 + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { // CHECK-LABEL: func @main // CHECK-NOT: TPUReshardVariables - func @main(%arg0: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg5: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) { + func @main(%arg0: !tf_res_f32 {tf.device = "/device:TPU:0"}, + %arg1: !tf_res_f32 {tf.device = "/device:TPU:1"}, + %arg2: !tf_res_md_f32 {tf.device = "/device:TPU:0"}, + %arg3: !tf_res_md_f32 {tf.device = "/device:TPU:1"}, + %arg4: !tf_res_f32 {tf.device = "/device:TPU:1"}) { + %0 = "tf.Const"() {value = dense<100> : tensor} : () -> tensor - %1:7 = "tf.While"(%0, %arg0, %arg1, %arg2, %arg3, %arg4, %arg5) - {body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) + %1 = "tf.WhileRegion"(%0) ( { + // Condition region + ^bb0(%carg0: tensor): + %c0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %c1 = "tf.GreaterEqual"(%carg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + "tf._UnknownOp1_"(%arg1) : (!tf_res_f32) -> () + "tf.Yield"(%c1) : (tensor) -> () + }, { + // Body region + ^bb0(%barg0: tensor): + %b0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %b1 = "tf.AddV2"(%barg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + %compile:2 = "tf_device.launch"() ( { + %b2:2 = "tf._TPUCompileMlir"() { + NumDynamicShapes = 0 : i64, + // The metadata encodes 2 parameter and two return values. + metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %b2#0, %b2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () + tf_device.return + }) {device = "/device:CPU:0"} : () -> () + %id0 = "tf.Identity"(%arg3) : (!tf_res_md_f32) -> !tf_res_md_f32 + "tf._Unknown_"(%id0) : (!tf_res_md_f32) -> () + %newvar = "tf._SomeOp"() : () -> !tf_res_f32 + %rep:2 = tf_device.replicate([%arg0, %arg1] as %arg30: !tf_res_f32, + [%arg2, %arg3] as %arg31: !tf_res_md_f32, + [%newvar, %arg4] as %arg32 : !tf_res_f32) + {_mirrored_variable_indices = [0, 1, 2], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { + // %arg30 is used in the cond function, %arg31 has other uses (%id0), and + // %arg32 is not a pass-through. 
+ "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg30, %arg31, %arg32, %compile#1) + {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} + : (!tf_res_f32, !tf_res_md_f32, !tf_res_f32, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return %ret : tensor + } + "tf.Yield"(%b1) : (tensor) -> () + }) {device = "", is_stateless = false} : (tensor) -> (tensor) return } - // CHECK-LABEL: func @while_body_7560 - // CHECK-NOT: TPUReshardVariables - func @while_body_7560(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg5: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg6: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { - %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %1 = "tf.AddV2"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - %compile:2 = "tf_device.launch"() ( { - %2:2 = "tf._TPUCompileMlir"() { - NumDynamicShapes = 0 : i64, - // The metadata encodes 2 parameter and two return values. - metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) - tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> - }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) - "tf_device.launch"() ( { - "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () - tf_device.return - }) {device = "/device:CPU:0"} : () -> () - %id0 = "tf.Identity"(%arg3) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> - "tf._Unknown_"(%id0) : (tensor<*x!tf.resource>>) -> () - %newvar = "tf._SomeOp"() : () -> tensor<*x!tf.resource>> - tf_device.replicate([%arg1, %arg2] as %arg30: tensor<*x!tf.resource>>, - [%arg3, %arg4] as %arg31: tensor<*x!tf.resource>>, - [%newvar, %arg6] as %arg32: tensor<*x!tf.resource>>) - {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { - // %arg30 is used in the cond function, %arg31 has other uses (%id0), and - // %arg32 is not a pass-through. 
- "tf_device.launch"() ( { - "tf.TPUExecuteAndUpdateVariables"(%arg30, %arg31, %arg32, %compile#1) - {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () - tf_device.return - }) {device = "TPU_REPLICATED_CORE_0"} : () -> () - tf_device.return - } - return %1, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6 : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<*x!tf.resource>> - } - // CHECK-LABEL: func @while_cond_7550 - func @while_cond_7550(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg5: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg6: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) - -> tensor { - %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %1 = "tf.GreaterEqual"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - "tf._UnknownOp1_"(%arg1) : (tensor<*x!tf.resource>>) -> () - return %1 : tensor - } } // ----- @@ -203,81 +162,62 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // Tests that the pass does not format variables when model parallelism is // present. +!tf_res_f32 = type tensor<*x!tf.resource>> +!tf_res_md_f32 = type tensor<*x!tf.resource>> // Multi-dim f32 + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { // CHECK-LABEL: func @main // CHECK-NOT: TPUReshardVariables - func @main(%arg0: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) { + func @main(%arg0: !tf_res_f32 {tf.device = "/device:TPU:0"}, + %arg1: !tf_res_f32 {tf.device = "/device:TPU:1"}, + %arg2: !tf_res_md_f32 {tf.device = "/device:TPU:0"}, + %arg3: !tf_res_md_f32 {tf.device = "/device:TPU:1"}) { %0 = "tf.Const"() {value = dense<100> : tensor} : () -> tensor - %1:5 = "tf.While"(%0, %arg0, %arg1, %arg2, %arg3) - {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE"], body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) - return - } - // CHECK-LABEL: func @while_body_7560 - // CHECK-NOT: TPUReshardVariables - func @while_body_7560(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) { - %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %1 = "tf.AddV2"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - %compile:2 = "tf_device.launch"() ( { - %2:2 = 
"tf._TPUCompileMlir"() { - NumDynamicShapes = 0 : i64, - // The metadata encodes 2 parameter and two return values. - metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) - tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> - }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) - "tf_device.launch"() ( { - "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () - tf_device.return - }) {device = "/device:CPU:0"} : () -> () - %rep:2 = tf_device.replicate([%arg1, %arg2] as %arg30: tensor<*x!tf.resource>>, - [%arg3, %arg4] as %arg31: tensor<*x!tf.resource>>) - {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { - %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> - "tf_device.parallel_execute"() ({ - "tf_device.launch"() ( { - "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) - {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () - tf_device.return - }) {device = "TPU_REPLICATED_CORE_0"} : () -> () - tf_device.return + %1 = "tf.WhileRegion"(%0) ( { + // Condition region + ^bb0(%carg0: tensor): + %c0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %c1 = "tf.GreaterEqual"(%carg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + "tf.Yield"(%c1) : (tensor) -> () }, { - tf_device.return - }) {} : () -> () - %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - tf_device.return %ret : tensor - } - return %1, %arg1, %arg2, %arg3, %arg4 : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>> - } - // CHECK-LABEL: func @while_cond_7550 - func @while_cond_7550(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg4: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}) - -> tensor { - %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %1 = "tf.GreaterEqual"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - return %1 : tensor + // Body region + ^bb0(%barg0: tensor): + %b0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %b1 = "tf.AddV2"(%barg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + %compile:2 = "tf_device.launch"() ( { + %b2:2 = "tf._TPUCompileMlir"() { + NumDynamicShapes = 0 : i64, + // The metadata encodes 2 parameter and two return values. 
+ metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %b2#0, %b2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () + tf_device.return + }) {device = "/device:CPU:0"} : () -> () + %rep:2 = tf_device.replicate([%arg0, %arg1] as %arg30: tensor<*x!tf.resource>>, + [%arg2, %arg3] as %arg31: tensor<*x!tf.resource>>) + {_mirrored_variable_indices = [0, 1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { + %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + "tf_device.parallel_execute"() ({ + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) + {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + }, { + tf_device.return + }) {} : () -> () + %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return %ret : tensor + } + "tf.Yield"(%b1) : (tensor) -> () + }) {device = "", is_stateless = false} : (tensor) -> (tensor) + return } } @@ -285,34 +225,83 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr // Tests that the pass can correctly transform a training loop with a packed // variable. +!tf_res_f32 = type tensor<*x!tf.resource>> +!tf_res_md_f32 = type tensor<*x!tf.resource>> // Multi-dim f32 module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { // CHECK-LABEL: func @main - func @main(%arg0: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:COMPOSITE:0"}) { - + // CHECK-SAME: %[[ARG0:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, + // CHECK-SAME: %[[ARG1:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, + // CHECK-SAME: %[[ARG2:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:COMPOSITE:0"}) + func @main(%arg0: !tf_res_f32 {tf.device = "/device:TPU:0"}, + %arg1: !tf_res_f32 {tf.device = "/device:TPU:1"}, + %arg2: !tf_res_md_f32 {tf.device = "/device:COMPOSITE:0"}) { %0 = "tf.Const"() {value = dense<100> : tensor} : () -> tensor // CHECK: %[[STATE0:.*]] = "tf.VarHandleOp"() // CHECK-SAME: device = "/device:TPU:0" // CHECK: %[[STATE1:.*]] = "tf.VarHandleOp"() // CHECK-SAME: device = "/device:TPU:1" - // CHECK: %[[WHILE:.*]]:6 = "tf.While"( - // CHECK-SAME: %[[STATE0]], %[[STATE1]]) - %1:4 = "tf.While"(%0, %arg0, %arg1, %arg2) - {T = ["tfdtype$DT_INT32", "tfdtype$DT_RESOURCE", - "tfdtype$DT_RESOURCE", "tfdtype$DT_RESOURCE"], - body = @while_body_7560, - cond = @while_cond_7550, device = "", is_stateless = false} - : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) + // CHECK: %[[WHILE:.*]] = "tf.WhileRegion"( + %1 = "tf.WhileRegion"(%0) ( { + // Condition region + // CHECK: ^bb + // CHECK: "tf.Yield" + ^bb0(%carg0: tensor): + 
%c0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %c1 = "tf.GreaterEqual"(%carg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + "tf.Yield"(%c1) : (tensor) -> () + }, { + // Body region + // CHECK: ^bb0 + ^bb0(%barg0: tensor): + %b0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %b1 = "tf.AddV2"(%barg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor + // CHECK: %[[COMPILE:.*]]:2 = "tf_device.launch" + // CHECK-NEXT: "tf._TPUCompileMlir"() + %compile:2 = "tf_device.launch"() ( { + %b2:2 = "tf._TPUCompileMlir"() { + NumDynamicShapes = 0 : i64, + // The metadata encodes 2 parameter and two return values. + metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", + mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) + tf_device.return %b2#0, %b2#1 : tensor, tensor<2x!tf.string> + }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () + tf_device.return + }) {device = "/device:CPU:0"} : () -> () + // CHECK: tf_device.replicate + // CHECK-SAME: [%[[ARG0]], %[[ARG1]]] as %[[R0:.*]]: tensor<*x!tf.resource>>, + // CHECK-SAME: [%[[STATE0]], %[[STATE1]]] as %[[R_STATE:.*]]: tensor>> + // CHECK-SAME: %[[ARG2]] as %[[R1:.*]]: tensor<*x!tf.resource>> + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"] + %rep:2 = tf_device.replicate([%arg0, %arg1] as %arg30: tensor<*x!tf.resource>>, + %arg2 as %arg31: tensor<*x!tf.resource>>) + {_mirrored_variable_indices = [0, 1], _packed_input_indices = [1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { + // CHECK: %[[ID:.*]] = "tf.Identity"(%[[R0]]) + %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" + // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) + {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} + : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return %ret : tensor + } + // CHECK: "tf.Yield" + "tf.Yield"(%b1) : (tensor) -> () + }) {device = "", is_stateless = false} : (tensor) -> (tensor) // CHECK: %[[DEFAULT:.*]] = "tf.Const"() // CHECK: tf_device.replicate - // CHECK-SAME: as %[[V0:.*]]: tensor<*x!tf.resource>>, - // CHECK-SAME: [%[[STATE0]], %[[STATE1]]] as %[[STATE:.*]]: tensor>>, - // CHECK-SAME: as %[[V1:.*]]: tensor<*x!tf.resource>> + // CHECK-SAME: [%[[ARG0]], %[[ARG1]]] as %[[V0:.*]]: tensor<*x!tf.resource>>, + // CHECK-SAME: [%[[STATE0]], %[[STATE1]]] as %[[STATE:.*]]: tensor>> + // CHECK-SAME: %[[ARG2]] as %[[V1:.*]]: tensor<*x!tf.resource>> // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"] // CHECK: "tf_device.launch" // CHECK-NEXT: "tf.TPUReshardVariables"(%[[V0]], %[[V1]], %[[DEFAULT]], %[[STATE]]) @@ -320,70 +309,4 @@ module attributes {tf.versions = 
{bad_consumers = [], min_consumer = 0 : i32, pr // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" return } - // CHECK-LABEL: func @while_body_7560 - func @while_body_7560(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:COMPOSITE:0"}) - -> (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>) { - // CHECK-SAME: (%[[ITER:.*]]: tensor, - // CHECK-SAME: %[[BODY_ARG1:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - // CHECK-SAME: %[[BODY_ARG2:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - // CHECK-SAME: %[[BODY_ARG3:.*]]: tensor<*x!tf.resource>> {tf.device = "/device:COMPOSITE:0"}, - // CHECK-SAME: %[[STATE_ARG0:.*]]: tensor>> {tf.device = "/device:TPU:0"}, - // CHECK-SAME: %[[STATE_ARG1:.*]]: tensor>> {tf.device = "/device:TPU:1"}) - %0 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor - %1 = "tf.AddV2"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - // CHECK: %[[COMPILE:.*]]:2 = "tf_device.launch" - // CHECK-NEXT: "tf._TPUCompileMlir"() - %compile:2 = "tf_device.launch"() ( { - %2:2 = "tf._TPUCompileMlir"() { - NumDynamicShapes = 0 : i64, - // The metadata encodes 2 parameter and two return values. - metadata = "\0A\0E\08\01\18\01\22\08\08\01\1A\01\01\22\01\00\0A \08\01\12\10\12\02\08\03\12\02\08\03\12\02\08\01\12\02\08 \18\01\22\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\12\0A\0A\08\08\01\1A\01\01\22\01\00\18\02 \01", - mlir_module = "..."} : () -> (tensor, tensor<2x!tf.string>) - tf_device.return %2#0, %2#1 : tensor, tensor<2x!tf.string> - }) {device = "/device:CPU:0"} : () -> (tensor, tensor<2x!tf.string>) - "tf_device.launch"() ( { - "tf.TPUCompileSucceededAssert"(%compile#0) : (tensor) -> () - tf_device.return - }) {device = "/device:CPU:0"} : () -> () - // CHECK: tf_device.replicate - // CHECK-SAME: [%[[BODY_ARG1]], %[[BODY_ARG2]]] as %[[R0:.*]]: tensor<*x!tf.resource>>, - // CHECK-SAME: [%[[STATE_ARG0]], %[[STATE_ARG1]]] as %[[R_STATE:.*]]: tensor>>, - // CHECK-SAME: %[[BODY_ARG3]] as %[[R1:.*]]: tensor<*x!tf.resource>> - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"] - %rep:2 = tf_device.replicate([%arg1, %arg2] as %arg30: tensor<*x!tf.resource>>, - %arg3 as %arg31: tensor<*x!tf.resource>>) - {_mirrored_variable_indices = [0, 1], _packed_input_indices = [1], devices = {TPU_REPLICATED_CORE_0 = ["/device:TPU:0", "/device:TPU:1"]}, n = 2 : i32} { - // CHECK: %[[ID:.*]] = "tf.Identity"(%[[R0]]) - %id = "tf.Identity"(%arg30) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> - // CHECK: "tf_device.launch" - // CHECK-NEXT: "tf.TPUReshardVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1, %[[R_STATE]]) - // CHECK-NEXT: tf_device.return - // CHECK-NEXT: device = "TPU_REPLICATED_CORE_0" - // CHECK: "tf.TPUExecuteAndUpdateVariables"(%[[ID]], %[[R1]], %[[COMPILE]]#1) - "tf_device.launch"() ( { - "tf.TPUExecuteAndUpdateVariables"(%id, %arg31, %compile#1) - {device_var_reads_indices = [0, 1], device_var_updates_indices = [0, 1]} - : (tensor<*x!tf.resource>>, tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () - tf_device.return - }) {device = "TPU_REPLICATED_CORE_0"} : () -> () - %ret = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - tf_device.return %ret : tensor - } - return %1, %arg1, %arg2, %arg3 : tensor, tensor<*x!tf.resource>>, - tensor<*x!tf.resource>>, tensor<*x!tf.resource>> - } - // 
CHECK-LABEL: func @while_cond_7550 - func @while_cond_7550(%arg0: tensor, - %arg1: tensor<*x!tf.resource>> {tf.device = "/device:TPU:0"}, - %arg2: tensor<*x!tf.resource>> {tf.device = "/device:TPU:1"}, - %arg3: tensor<*x!tf.resource>> {tf.device = "/device:COMPOSITE:0"}) - -> tensor { - %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor - %1 = "tf.GreaterEqual"(%arg0, %0) {T = i32, device = ""} : (tensor, tensor) -> tensor - return %1 : tensor - } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index 978f6e74aa8..3c2344be1e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -380,6 +380,153 @@ func @resource_before_cluster() { } +// Test cluster formation with ops with attached regions within a cluster. +// Nested op's that are moved should get their _tpu_replicate and device +// attributes cleared. +// CHECK-LABEL: func @cluster_ops_with_regions +func @cluster_ops_with_regions() { + %0 = "tf.opA"() ({ + %1 = "tf.opB"() {_tpu_replicate = "replicate", device = "device", name = "nameB"} : () -> (tensor) + }) {_tpu_replicate = "replicate", device = "device", name = "nameA"} : () -> tensor + "tf.TPUReplicateMetadata"() {_tpu_replicate = "replicate", device = "device", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: "tf.opA"() ( { +// CHECK-NEXT: "tf.opB" +// CHECK-NOT: _tpu_replicate = "replicate" +// CHECK-NOT: device = "device" +// CHECK-SAME: name = "nameB" +// CHECK: }) +// CHECK-NOT: _tpu_replicate = "replicate" +// CHECK-NOT: device = "device" +// CHECK: name = "nameA" +// CHECK: tf_device.return + +// A nested cluster op using result of another cluster op. In the below, opA and +// opB go in a cluster, and opD stays outside. +// CHECK-LABEL: func @cluster_nested_op_using_other_op +func @cluster_nested_op_using_other_op() { + %0 = "tf.opA"() { _tpu_replicate = "foo" } : () -> tensor + "tf.opB"() ({ + "tf.opC"(%0) : (tensor) -> () + }) { _tpu_replicate = "foo" } : () -> () + "tf.opD"(%0) : (tensor) -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "foo", device = "CPU", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: [[CLUSTER:%.*]] = "tf_device.cluster"() ( { +// CHECK: [[OPA:%.*]] = "tf.opA"() : () -> tensor +// CHECK: "tf.opB"() ( { +// CHECK: "tf.opC"([[OPA]]) +// CHECK: tf_device.return [[OPA]] +// CHECK: "tf.opD"([[CLUSTER]]) + +// Preceding user is using resource updated by a nested op. 
+!tf_res = type tensor<*x!tf.resource>> +// CHECK-LABEL: func @cluster_nested_op_updating_resource +func @cluster_nested_op_updating_resource() { + %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> !tf_res + + "tf.opA"() ({ + "tf.AssignAddVariableOp"(%1, %0) : (!tf_res, tensor) -> () + "tf.terminator"() : () -> () + }) { _tpu_replicate = "foo" } : () -> () + "tf.AssignAddVariableOp"(%1, %0) : (!tf_res, tensor) -> () + "tf.opB"() { _tpu_replicate = "foo" } : () -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "foo", device = "CPU", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: [[CONST:%.*]] = "tf.Const" +// CHECK: [[VAR:%.*]] = "tf.VarHandleOp" +// CHECK: "tf_device.cluster"() ( { +// CHECK: "tf.opA"() ( { +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) +// CHECK: }) +// CHECK: "tf.opB"() +// CHECK: tf_device.return +// CHECK: }) +// CHECK-SAME: _tpu_replicate = "foo" +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) + +// Preceding user is using resource updated by the cluster within a nested op. +// Resource is updated by a cluster op, and opA (not in cluster) is using the +// resource in a nested op. We expect opA to be after the cluster. +// CHECK-LABEL: func @cluster_nested_op_using_resource +func @cluster_nested_op_using_resource() { + %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> !tf_res + "tf.AssignAddVariableOp"(%1, %0) { _tpu_replicate = "foo" } : (!tf_res, tensor) -> () + "tf.opA"() ({ + "tf.AssignAddVariableOp"(%1, %0) : (!tf_res, tensor) -> () + "tf.terminator"() : () -> () + }) : () -> () + "tf.opB"() { _tpu_replicate = "foo" } : () -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "foo", device = "CPU", num_replicas = 1, topology = "topology"} : () -> () + return +} + +// CHECK: [[CONST:%.*]] = "tf.Const" +// CHECK: [[VAR:%.*]] = "tf.VarHandleOp" +// CHECK: "tf_device.cluster"() ( { +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) +// CHECK: "tf.opB"() +// CHECK: tf_device.return +// CHECK: }) +// CHECK-SAME: _tpu_replicate = "foo" +// CHECK: "tf.opA"() ( { +// CHECK: "tf.AssignAddVariableOp"([[VAR]], [[CONST]]) + + +// ----- + + +!tf_res = type tensor<*x!tf.resource>> + +// Test multiple replicated clusters interleaved and uses resource variables. 
+// CHECK-LABEL: func @multiple_replicated_interleaved +func @multiple_replicated_interleaved(%arg0: !tf_res) { + "tf.TPUReplicateMetadata"() {_tpu_replicate = "a", num_replicas = 2, topology = "topology"} : () -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "b", num_replicas = 2, topology = "topology"} : () -> () + "tf.TPUReplicateMetadata"() {_tpu_replicate = "c", num_replicas = 2, topology = "topology"} : () -> () + %0 = "tf.TPUReplicatedInput"(%arg0, %arg0) : (!tf_res, !tf_res) -> !tf_res + %1 = "tf.TPUReplicatedInput"(%arg0, %arg0) : (!tf_res, !tf_res) -> !tf_res + %2 = "tf.TPUReplicatedInput"(%arg0, %arg0) : (!tf_res, !tf_res) -> !tf_res + %3 = "tf.ReadVariableOp"(%0) {_tpu_replicate = "a"} : (!tf_res) -> tensor + %4 = "tf.ReadVariableOp"(%1) {_tpu_replicate = "b"} : (!tf_res) -> tensor + %5 = "tf.ReadVariableOp"(%2) {_tpu_replicate = "c"} : (!tf_res) -> tensor + %6 = "tf.Identity"(%3) {_tpu_replicate = "a"} : (tensor) -> tensor + %7 = "tf.Identity"(%4) {_tpu_replicate = "b"} : (tensor) -> tensor + %8 = "tf.Identity"(%5) {_tpu_replicate = "c"} : (tensor) -> tensor + %9:2 = "tf.TPUReplicatedOutput"(%6) : (tensor) -> (tensor, tensor) + %10:2 = "tf.TPUReplicatedOutput"(%7) : (tensor) -> (tensor, tensor) + %11:2 = "tf.TPUReplicatedOutput"(%8) : (tensor) -> (tensor, tensor) + return +} + +// CHECK: tf_device.replicate +// CHECK: tf_device.replicate +// CHECK: tf_device.replicate + + +// ----- + + +// Test cluster that is replicated but has a non TPUReplicatedOutput consumer. +// CHECK-LABEL: func @replicated_non_replicated_output +func @replicated_non_replicated_output() { + %0 = "tf.opA"() {_tpu_replicate = "replicate", device = "device", name = "name"} : () -> tensor + %1 = "tf.opB"(%0) : (tensor) -> tensor + "tf.TPUReplicateMetadata"() {_tpu_replicate = "replicate", device = "device", num_replicas = 2, topology = "topology"} : () -> () + return +} + +// CHECK: [[REPLICATE:%.+]]:2 = tf_device.replicate +// CHECK: "tf.opB"([[REPLICATE]]#0) + // ----- @@ -407,18 +554,6 @@ func @bad_num_replicas() { // ----- -// Test that functions without TPUReplicateMetadata op are skipped without -// error -// CHECK-LABEL: func @missing_metadata_op -func @missing_metadata_op() { - // expected-warning@+1 {{TPUReplicateMetadata for associated '_tpu_replicate' attribute 'replicate' is missing}} - %0 = "tf.opA"() {_tpu_replicate = "replicate"} : () -> tensor - return -} - -// ----- - - // Test cluster with TPUReplicatedInput where the number of operands does not // match associated `num_replicas` attribute. func @mismatched_replicated_input(%arg0: tensor) { @@ -447,20 +582,6 @@ func @mismatched_replicated_output() { // ----- -// Test cluster that should be replicated where its outputs do not lead to a -// TPUReplicatedOutput. -func @missing_replicated_output() { - // expected-error@+1 {{requires output of tf_device.cluster to lead to a 'tf.TPUReplicatedOutput' op}} - %0 = "tf.opA"() {_tpu_replicate = "replicate", device = "device", name = "name"} : () -> tensor - %1 = "tf.opB"(%0) : (tensor) -> tensor - "tf.TPUReplicateMetadata"() {_tpu_replicate = "replicate", device = "device", num_replicas = 2, topology = "topology"} : () -> () - return -} - - -// ----- - - // Test unused TPUReplicatedInput that has more than one operand. 
func @leftover_replicated_input(%arg0: tensor) { %0 = "tf.TPUReplicatedInput"(%arg0, %arg0) : (tensor, tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir new file mode 100644 index 00000000000..88af4535d81 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_colocate_composite_resource_ops.mlir @@ -0,0 +1,118 @@ +// RUN: tf-opt %s -tf-tpu-colocate-composite-resource-ops | FileCheck %s + +// Tests ReadVariable op using composite device resource is wrapped inside +// tf_device.Cluster. + +// CHECK-LABEL: func @testReadVariableOpColocated +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testReadVariableOpColocated(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]}, + n = 2 : i32} { + // CHECK: %[[RESOURCE_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[READ_OUT:.*]] = "tf.ReadVariableOp"(%[[RI_0]]) + // CHECK-NEXT: tf_device.return %[[READ_OUT]] + // CHECK-NEXT: TPU_REPLICATED_CORE_0 + %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %1 = "tf.A"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %1) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + "tf_device.launch"() ( { + // CHECK: "tf.B"(%[[RESOURCE_OUT]]) + "tf.B"(%0) : (tensor<4xf32>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + } + return +} + +// CHECK-LABEL: func @testReadVariableOpAfterIdentityColocated +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testReadVariableOpAfterIdentityColocated(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]}, + n = 2 : i32} { + // CHECK: %[[IDENTITY_OUT:.*]] = "tf.Identity"(%[[RI_0]]) + // CHECK: %[[RESOURCE_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[READ_OUT:.*]] = "tf.ReadVariableOp"(%[[IDENTITY_OUT]]) + // CHECK-NEXT: tf_device.return %[[READ_OUT]] + // CHECK-NEXT: TPU_REPLICATED_CORE_0 + %0 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<*x!tf.resource>> + %1 = "tf.ReadVariableOp"(%0) : (tensor<*x!tf.resource>>) -> tensor<4xf32> + %2 = "tf.A"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + "tf_device.launch"() ( { + // CHECK: "tf.B"(%[[RESOURCE_OUT]]) + "tf.B"(%1) : (tensor<4xf32>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + 
tf_device.return + } + return +} + +// Tests AssignVariable op using composite device resource is wrapped inside +// tf_device.Cluster. + +// CHECK-LABEL: func @testAssignVariableOpColocated +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testAssignVariableOpColocated(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]}, + n = 2 : i32} { + // CHECK: %[[VAL_OUT:.*]] = "tf.A"() : () -> tensor<4xf32> + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.AssignVariableOp"(%[[RI_0]], %[[VAL_OUT]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: TPU_REPLICATED_CORE_0 + %1 = "tf.A"() : () -> (tensor<4xf32>) + "tf.AssignVariableOp"(%arg1, %1) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + %2 = "tf.B"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_CORE_0"} : () -> () + tf_device.return + } + return +} + +// Tests tf_device.replicate op not running on TPU devices ignored. + +// CHECK-LABEL: func @testNonTPUDeviceReplicationIgnored +// CHECK-SAME: (%[[ARG0:.*]]: tensor<*x!tf.resource>>) +func @testNonTPUDeviceReplicationIgnored(%arg0: tensor<*x!tf.resource>>) { + // CHECK: tf_device.replicate + // CHECK-SAME: (%[[ARG0]] as %[[RI_0:[a-z0-9]*]]: tensor<*x!tf.resource>>) + tf_device.replicate(%arg0 as %arg1: tensor<*x!tf.resource>>) { + _mirrored_variable_indices = [0], _replicated_input_indices = [-1], + devices = {TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:1"]}, + n = 2 : i32} { + // CHECK: %[[VAL_OUT:.*]] = "tf.A"() : () -> tensor<4xf32> + // CHECK-NEXT: "tf.AssignVariableOp"(%[[RI_0]], %[[VAL_OUT]]) + %1 = "tf.A"() : () -> (tensor<4xf32>) + "tf.AssignVariableOp"(%arg1, %1) : (tensor<*x!tf.resource>>, tensor<4xf32>) -> () + %2 = "tf.B"() : () -> (tensor<2x!tf.string>) + "tf_device.launch"() ( { + "tf.TPUExecuteAndUpdateVariables"(%arg1, %2) {device_var_reads_indices = [0], device_var_updates_indices = [-1]} : (tensor<*x!tf.resource>>, tensor<2x!tf.string>) -> () + tf_device.return + }) {device = "TPU_REPLICATED_HOST"} : () -> () + tf_device.return + } + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 32a8000ea82..8ae6fa958a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -173,7 +173,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor func @tail_single_outside_compiled_op() { // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" - // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.NoOp" // CHECK-NEXT: tf_device.return %[[A_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -190,7 +190,7 @@ module attributes {tf.versions = {producer = 888 : i32}, 
tf.devices = ["/job:wor "tf_device.cluster"() ( { %a = "tf.A"() : () -> tensor "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> () - "tf.C"() : () -> () + "tf.NoOp"() : () -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return @@ -200,7 +200,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor func @tail_single_outside_compiled_op_user() -> tensor { // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" - // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.NoOp" // CHECK-NEXT: tf_device.return %[[A_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -217,7 +217,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %cluster = "tf_device.cluster"() ( { %a = "tf.A"() : () -> tensor %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - "tf.C"() : () -> () + "tf.NoOp"() : () -> () tf_device.return %b : tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor // CHECK: return %[[LAUNCH_OUT]] @@ -262,7 +262,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %b = "tf.B"() : () -> tensor // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" - // CHECK-NEXT: %[[E_OUT:.*]] = "tf.E" + // CHECK-NEXT: %[[E_OUT:.*]] = "tf.Const" // CHECK-NEXT: tf_device.return %[[C_OUT]], %[[E_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -279,7 +279,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %cluster:5 = "tf_device.cluster"() ( { %c = "tf.C"() : () -> tensor %d = "tf.D"(%c, %a) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor - %e = "tf.E"() : () -> tensor + %e = "tf.Const"() {value = dense<0> : tensor} : () -> tensor tf_device.return %a, %b, %c, %d, %e : tensor, tensor, tensor, tensor, tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor, tensor, tensor) // CHECK: return %[[A_OUT]], %[[B_OUT]], %[[CLUSTER_OUT]]#0, %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#1 @@ -320,14 +320,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor func @head_tail_no_extraction_middle_outside_compiled_ops(%arg0: tensor) { // CHECK-NOT: "tf_device.launch" // CHECK: "tf_device.cluster" - // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.Identity" // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.Identity" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %a = "tf.A"(%arg0) : (tensor) -> tensor + %a = "tf.Identity"(%arg0) : (tensor) -> tensor %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - "tf.C"(%b) : (tensor) -> () + %c = "tf.Identity"(%b) : (tensor) -> tensor tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return @@ -379,7 +379,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[RI]], %[[B_OUT]]) - // CHECK-NEXT: "tf.E"(%[[C_OUT]], %[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: "tf.IdentityN"(%[[C_OUT]], %[[HEAD_LAUNCH_OUT]]) // CHECK-NEXT: 
tf_device.return %[[C_OUT]] // CHECK-NEXT: { // CHECK-DAG: num_cores_per_replica = 1 @@ -399,11 +399,139 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor %b = "tf.B"() : () -> tensor %c = "tf.C"(%ri, %b) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor %d = "tf.D"(%a, %c, %ri) {_xla_outside_compilation = "cluster1"} : (tensor, tensor, tensor) -> tensor - %e = "tf.E"(%c, %a) : (tensor, tensor) -> tensor + %e:2 = "tf.IdentityN"(%c, %a) : (tensor, tensor) -> (tensor, tensor) tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () tf_device.return } return } + + // CHECK-LABEL: func @side_effect_middle + func @side_effect_middle() { + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"() : () -> () + "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @side_effect_head_no_operand + func @side_effect_head_no_operand() { + // CHECK: %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.D"(%[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + + "tf_device.cluster"() ( { + %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () + %c = "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> tensor + "tf.D"(%c) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @side_effect_tail_no_operand + func @side_effect_tail_no_operand() { + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.B"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> () + %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // Test embedding ops can be head extracted and side effect analysis + // predecessors are ignored. 
+ + // CHECK-LABEL: func @embedding_head_extraction + func @embedding_head_extraction(%arg0: tensor) { + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.EnqueueTPUEmbeddingRaggedTensorBatch" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.UnknownOp" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.UnknownOp"() : () -> () + "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0) {_xla_outside_compilation = "cluster1", table_ids = [1, 2]} : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // Test side effecting op after embedding op can be head extracted. + + // CHECK-LABEL: func @op_after_embedding_head_extraction + func @op_after_embedding_head_extraction() { + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.RecvTPUEmbeddingActivations" + // CHECK-NEXT: "tf.SendTPUEmbeddingGradients" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.RecvTPUEmbeddingActivations"() {config = "test_config_recv_embedding"} : () -> tensor<512x256xf32> + "tf.SendTPUEmbeddingGradients"(%0) {N = 1 : i64, NN = 0 : i64, config = "test_config_send_embedding", operand_segment_sizes = dense<[1, 0]> : vector<2xi32>} : (tensor<512x256xf32>) -> () + "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // Test side effecting op before embedding op can be tail extracted. 
+ + // CHECK-LABEL: func @op_before_embedding_tail_extraction + func @op_before_embedding_tail_extraction() { + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.UnknownOp" + // CHECK-NEXT: "tf.RecvTPUEmbeddingActivations" + // CHECK-NEXT: "tf.SendTPUEmbeddingGradients" + // CHECK-NEXT: tf_device.return + + // CHECK: "tf_device.launch"() + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + "tf.UnknownOp"() : () -> () + "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () + %0 = "tf.RecvTPUEmbeddingActivations"() {config = "test_config_recv_embedding"} : () -> tensor<512x256xf32> + "tf.SendTPUEmbeddingGradients"(%0) {N = 1 : i64, NN = 0 : i64, config = "test_config_send_embedding", operand_segment_sizes = dense<[1, 0]> : vector<2xi32>} : (tensor<512x256xf32>) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 2271bca7382..e2cfd6c82b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -145,12 +145,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.B"(%[[RECV_OUTPUT]]) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -164,6 +164,32 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return %1 : tensor } + // Tests value is added as operand to XlaHostCompute op only if defining op is + // in TPU cluster. + + // CHECK-LABEL: func @single_outside_compiled_input_from_outside_device_cluster + func @single_outside_compiled_input_from_outside_device_cluster(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK-NEXT: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: "tf.B"(%[[A_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.C"() + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %3 = "tf.A"() : () -> (tensor) + %2 = "tf_device.cluster"() ( { + "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + %4 = "tf.C"() : () -> tensor + tf_device.return %4 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + // Tests extraction of a single outside compiled cluster with single host->device output. 
// CHECK-LABEL: func @single_outside_compiled_output_single_outside_compilation @@ -174,15 +200,15 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"() // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"() - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.C"(%[[HOST_OUTPUT]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -209,11 +235,11 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" // CHECK: tf_device.return %[[HOST_OUTPUT]] %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -240,11 +266,11 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf.C"(%[[HOST_OUTPUT]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -259,6 +285,42 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return %1 : tensor } + // Tests that host to device communication is added only if value is used for ops + // that are not outside compiled.
+ + // CHECK-LABEL: func @single_outside_compiled_output_used_for_another_host_op + func @single_outside_compiled_output_used_for_another_host_op(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[B_OUTPUT:[0-9]*]] = "tf.B"() + // CHECK-NEXT: "tf.IfRegion"(%[[A_OUTPUT]]) + // CHECK-NEXT: "tf.D"(%[[B_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.C"() + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %3 = "tf.A"() : () -> (tensor) + %2 = "tf_device.cluster"() ( { + %4 = "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) + "tf.IfRegion"(%3) ({ + "tf.D"(%4) : (tensor) -> () + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) { _xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + %5 = "tf.C"() : () -> (tensor) + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster with multiple input/output. // CHECK-LABEL: func @multiple_outside_compiled_input_output_single_outside_compilation @@ -271,12 +333,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) // CHECK: %[[B_OUTPUT:[0-9]*]]:2 = "tf.C"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]]#0, %[[B_OUTPUT]]#1, %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[HOST_OUTPUT:[0-9]*]]:2 = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf.D"(%[[HOST_OUTPUT]]#0) // CHECK: "tf.E"(%[[HOST_OUTPUT]]#1) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { @@ -306,20 +368,20 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT2]]) // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[RECV_OUTPUT2]]) // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster2_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster2_0_retvals" // CHECK: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT1:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT1]]) // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT1]]) // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[HOST_OUTPUT1:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: recv_key = 
"host_compute_channel_cluster1_0_retvals" // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[HOST_OUTPUT1]]) // CHECK: %[[HOST_OUTPUT2:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[C_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster2_retvals" + // CHECK-SAME: recv_key = "host_compute_channel_cluster2_0_retvals" // CHECK: "tf.E"(%[[HOST_OUTPUT2]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { @@ -346,12 +408,12 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.B"(%arg0, %[[RECV_OUTPUT]]) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -375,20 +437,20 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT_2:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT_2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_2]]) - // CHECK-SAME: key = "host_compute_channel_cluster2_args" + // CHECK-SAME: key = "host_compute_channel_cluster2_0_args" // CHECK: "tf.D"(%[[RECV_OUTPUT_2]]) // CHECK: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT_1:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT_1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_1]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.B"(%[[RECV_OUTPUT_1]]) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" // CHECK: "tf._XlaHostComputeMlir"(%[[C_OUTPUT]]) - // CHECK-SAME: send_key = "host_compute_channel_cluster2_args" + // CHECK-SAME: send_key = "host_compute_channel_cluster2_0_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -413,14 +475,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf_device.launch" // CHECK: %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey" // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.C"(%[[RECV_OUTPUT]]#0) // CHECK: "tf.D"(%[[RECV_OUTPUT]]#1, %[[RECV_OUTPUT]]#0) // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: "tf._XlaHostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: send_key = 
"host_compute_channel_cluster1_0_args" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -469,25 +531,25 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_0" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK-NEXT: "tf.Yield"() : () -> () // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" - // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0_0"} // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" // CHECK-SAME: tpu_core = 0 // CHECK-NEXT: "tf.Yield"() : () -> () %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { @@ -525,20 +587,20 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor, tensor) // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#2) // CHECK: "tf.D"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1, %[[F_OUT]]) // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]], %[[G_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" // CHECK-SAME: tpu_core = 0 %0 = "tf.A"(%arg0) : (tensor) -> tensor %7 = "tf.F"() : () -> tensor @@ -579,12 +641,12 @@ module attributes {tf.versions = {producer = 888 
: i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() // CHECK-NEXT: %[[RECV_OUTPUT_PREDICATE:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_0" // CHECK-SAME: (tensor<2x!tf.string>) -> tensor // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT_PREDICATE]]) // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor) // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#1) // CHECK-NEXT: "tf.H"(%[[RECV_OUTPUT]]#0, %[[F_OUT]]) @@ -592,20 +654,20 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: "tf.Yield"() : () -> () // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) - // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_0" // CHECK-SAME: (tensor) -> () // CHECK-NEXT: "tf.IfRegion"(%[[G_OUTPUT]]) // CHECK: %[[D_OUT:[0-9]*]] = "tf.D" // CHECK-NEXT: %[[F_OUT:[0-9]*]] = "tf.F" // CHECK: "tf._XlaHostComputeMlir"(%[[D_OUT]], %[[F_OUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" // CHECK-SAME: tpu_core = 0 // CHECK: "tf.Yield"() : () -> () // CHECK: "tf.Yield"() : () -> () @@ -657,25 +719,25 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_0" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) // CHECK-NEXT: %[[ARG_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[ARG_RECV_OUTPUT]]#0, %[[ARG_RECV_OUTPUT]]#1) // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK-NEXT: "tf.Yield"() : () -> () // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" - // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0_0"} // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) 
// CHECK: %[[HOST_COMPUTE_OUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) - // CHECK-SAME: recv_key = "host_compute_channel_cluster1_retvals" - // CHECK-SAME: send_key = "host_compute_channel_cluster1_args" + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" // CHECK-SAME: tpu_core = 0 // CHECK-NEXT: "tf.Yield"(%[[HOST_COMPUTE_OUT]]) %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { @@ -714,7 +776,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_0" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) // CHECK: "tf.D" // CHECK-NEXT: "tf.Yield"() : () -> () @@ -722,7 +784,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" - // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0"} + // CHECK: "tf.XlaSendToHost"(%6) {key = "if_predicate_channel_cluster1_0_0"} // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) // CHECK-NEXT: "tf.Yield"() : () -> () %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { @@ -759,30 +821,30 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() // CHECK-NEXT: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "if_predicate_channel_cluster1_0" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_0" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) // CHECK-NEXT: %[[PREDICATE2_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "if_predicate_channel_cluster1_1" + // CHECK-SAME: key = "if_predicate_channel_cluster1_0_1" // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE2_RECV_OUTPUT]]) // CHECK-NEXT: "tf.Yield"() : () -> () // CHECK: %[[ARG_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_args" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" // CHECK: "tf.D"(%[[ARG_RECV_OUTPUT]]) // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) // CHECK-SAME: device_ordinal = 0 - // CHECK-SAME: key = "host_compute_channel_cluster1_retvals" + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" // CHECK-NEXT: "tf.Yield"() : () -> () // CHECK: "tf_device.cluster" // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" - // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) {key = "if_predicate_channel_cluster1_0"} + // CHECK: "tf.XlaSendToHost"(%[[G_OUTPUT]]) {key = "if_predicate_channel_cluster1_0_0"} // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H"(%[[B_OUTPUT]]) - // CHECK: "tf.XlaSendToHost"(%[[H_OUTPUT]]) {key = "if_predicate_channel_cluster1_1"} + // CHECK: "tf.XlaSendToHost"(%[[H_OUTPUT]]) {key = 
"if_predicate_channel_cluster1_0_1"} // CHECK-NEXT: tf.IfRegion"(%[[H_OUTPUT]]) // CHECK-NEXT: "tf.Yield"() : () -> () // CHECK: %[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[H_OUTPUT]]) @@ -819,4 +881,442 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return %1 : tensor } + + // Tests extraction of a single outside compiled cluster inside a tf.WhileRegion op body. + + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_body + func @outside_compiled_ops_inside_tf_while_body(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) + // CHECK: %[[BODY_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT: %[[HOST_COMPUTE_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir" + // CHECK-NEXT "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%arg1) : (tensor) -> tensor + %9 = "tf.D"(%arg1, %arg2) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + "tf.Yield"(%8, %9) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.WhileRegion op cond. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_cond + func @outside_compiled_ops_inside_tf_while_cond(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT1:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[COND_RECV_OUTPUT1]]#0, %[[COND_RECV_OUTPUT1]]#1) + // CHECK-NEXT: "tf._XlaSendFromHost"(%[[I_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[COND_RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT2]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK "tf.XlaHostCompute" + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT: "tf.D" + // CHECK-NEXT "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.I"(%arg1, %arg2) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %8 = "tf.H"(%7) : (tensor) -> tensor + "tf.Yield"(%8) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.C"(%arg1) : (tensor) -> tensor + %8 = "tf.D"(%arg1, %arg2) : (tensor, tensor) -> tensor + "tf.Yield"(%7, %8) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.WhileRegion op cond and body. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_cond_body + func @outside_compiled_ops_inside_tf_while_cond_body(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster2_0_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) + // CHECK: %[[BODY_RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT1:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[I_OUTPUT:[0-9]*]] = "tf.I"(%[[COND_RECV_OUTPUT1]]#0, %[[COND_RECV_OUTPUT1]]#1) + // CHECK-NEXT: "tf._XlaSendFromHost"(%[[I_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[COND_RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT2]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK "tf.XlaHostCompute" + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT "tf.Yield"(%[[C_OUTPUT]], %[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.I"(%arg1, %arg2) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %8 = "tf.H"(%7) : (tensor) -> tensor + "tf.Yield"(%8) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.C"(%arg1) : (tensor) -> tensor + %8 = "tf.D"(%arg1, %arg2) {_xla_outside_compilation = "cluster2"} : (tensor, tensor) -> tensor + "tf.Yield"(%7, %8) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of a single outside compiled cluster inside a tf.IfRegion op + // nested in a tf.WhileRegion. 
+ + // CHECK-LABEL: func @outside_compiled_ops_inside_tf_while_if + func @outside_compiled_ops_inside_tf_while_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: tf.WhileRegion" + // CHECK-NEXT: %[[COND_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "while_condition_channel_cluster1_0_0" + // CHECK: "tf.Yield"(%[[COND_RECV_OUTPUT]]) + // CHECK: %[[PREDICATE_RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: tf.IfRegion"(%[[PREDICATE_RECV_OUTPUT]]) + // CHECK: "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D" + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PLACEHOLDER_KEY]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK-NEXT: tf.WhileRegion"(%[[B_OUTPUT]], %[[A_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUTPUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUTPUT]]) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[G_OUTPUT]]) + // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) + // CHECK-NEXT: %[[HOST_COMPUTE_OUTPUT:[0-9]*]] = "tf._XlaHostComputeMlir" + // CHECK-NEXT "tf.Yield"(%[[HOST_COMPUTE_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%arg1) : (tensor) -> tensor + %10 = "tf.IfRegion"(%6) ({ + %9 = "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> tensor + "tf.Yield"(%9) : (tensor) -> () + }, { + "tf.Yield"(%arg2) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> tensor + "tf.Yield"(%8, %10) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.IfRegion op where the entirety + // of tf.IfRegion op is outside compiled with a nested tf.WhileRegion op. 
+ + // CHECK-LABEL: func @outside_compiled_tf_if_nested_while + func @outside_compiled_tf_if_nested_while(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUT:[0-9]*]] = "tf.A" + // CHECK: %[[F_OUT:[0-9]*]] = "tf.F" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_0_args" + // CHECK-SAME: (tensor<2x!tf.string>) -> (tensor, tensor, tensor) + // CHECK-NEXT: tf.IfRegion"(%[[RECV_OUTPUT]]#2) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1, %[[F_OUT]]) + // CHECK-NEXT: %[[J_OUTPUT:[0-9]*]] = "tf.J" + // CHECK-NEXT: %[[K_OUTPUT:[0-9]*]] = "tf.K" + // CHECK-NEXT: tf.WhileRegion"(%[[J_OUTPUT]], %[[D_OUTPUT]]) + // CHECK: %[[H_OUTPUT:[0-9]*]] = "tf.H"(%[[K_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_cluster1_0_retvals" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]], %[[G_OUTPUT]]) + // CHECK-SAME: recv_key = "host_compute_channel_cluster1_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_cluster1_0_args" + // CHECK-SAME: tpu_core = 0 + %0 = "tf.A"(%arg0) : (tensor) -> tensor + %7 = "tf.F"() : () -> tensor + + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.IfRegion"(%6) ({ + %8 = "tf.D"(%4, %3, %7) {} : (tensor, tensor, tensor) -> (tensor) + %9 = "tf.J"() : () -> (tensor) + %10 = "tf.K"() : () -> (tensor) + "tf.WhileRegion"(%9, %8) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %11 = "tf.I"(%arg1, %arg2) : (tensor, tensor) -> tensor + %12 = "tf.H"(%10) : (tensor) -> tensor + "tf.Yield"(%12) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %11 = "tf.C"(%arg1) : (tensor) -> tensor + %12 = "tf.D"(%arg1, %arg2) : (tensor, tensor) -> tensor + "tf.Yield"(%11, %12) : (tensor, tensor) -> () + }) { is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + "tf.Yield"() : () -> () + }, { + "tf.Yield"() : () -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor) -> () + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled tf.WhileRegion where the entire + // tf.WhileRegion op is outside compiled with a nested tf.IfRegion. 
+ + // CHECK-LABEL: func @outside_compiled_ops_tf_while_nested_if + func @outside_compiled_ops_tf_while_nested_if(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: %[[HOST_RECV_OUTPUT:[0-9]*]]:3 = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK: "tf.WhileRegion"(%[[HOST_RECV_OUTPUT]]#1, %[[HOST_RECV_OUTPUT]]#2) + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK: "tf.IfRegion"(%[[HOST_RECV_OUTPUT]]#0) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[C_OUTPUT]]) + // CHECK_NEXT: "tf.Yield" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" + // CHECK: "tf._XlaHostComputeMlir"(%[[G_OUTPUT]], %[[B_OUTPUT]], %[[A_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%arg1) : (tensor) -> tensor + %10 = "tf.IfRegion"(%6) ({ + %9 = "tf.D"(%8) : (tensor) -> tensor + "tf.Yield"(%9) : (tensor) -> () + }, { + "tf.Yield"(%arg2) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> tensor + "tf.Yield"(%8, %10) : (tensor, tensor) -> () + }) {_xla_outside_compilation = "cluster1", is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } + + // Tests extraction of an outside compiled cluster that contains ops wrapped + // inside multiple regions of nested tf.IfRegion and tf.WhileRegion. 
+ + // CHECK-LABEL: func @outside_compiled_ops_with_multiple_region_single_cluster + func @outside_compiled_ops_with_multiple_region_single_cluster(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK-NEXT: %[[PLACEHOLDER_KEY:[0-9]*]] = "tf._TPUCompileMlirPlaceholderProgramKey"() + // CHECK-NEXT: "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: "tf._XlaSendFromHost"(%[[B_OUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: "tf.WhileRegion"() + // CHECK-NEXT: %[[WHILE_COND:.*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: "tf.Yield"(%[[WHILE_COND]]) + // CHECK: "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[B_OUT]]) + // CHECK-NEXT: "tf._XlaSendFromHost"(%[[C_OUT]], %[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[IF_COND:.*]] = "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: "tf.IfRegion"(%[[IF_COND]]) + // CHECK-NEXT: "tf._XlaRecvAtHost"(%[[PLACEHOLDER_KEY]]) + // CHECK-NEXT: %[[D_OUT:.*]] = "tf.D"(%[[C_OUT]]) + // CHECK: "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT_DEVICE:.*]] = "tf._XlaHostComputeMlir"() + // CHECK-NEXT: %[[G_OUT:.*]] = "tf.G" + // CHECK-NEXT: "tf.WhileRegion"(%[[B_OUT_DEVICE]], %[[A_OUT]]) + // CHECK: %[[H_OUT:.*]] = "tf.H" + // CHECK-NEXT: "tf.XlaSendToHost"(%[[H_OUT]]) + // CHECK-NEXT: "tf.Yield"(%[[H_OUT]]) + // CHECK: %[[C_OUT_DEVICE:.*]] = "tf._XlaHostComputeMlir"() + // CHECK-NEXT: "tf.XlaSendToHost"(%[[G_OUT]]) + // CHECK-NEXT: "tf.IfRegion"(%[[G_OUT]]) + // CHECK-NEXT: "tf._XlaHostComputeMlir"() + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() {_xla_outside_compilation="cluster0"} : () -> (tensor) + %6 = "tf.G"() : () -> (tensor) + "tf.WhileRegion"(%4, %3) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %7 = "tf.H"(%arg1) : (tensor) -> tensor + "tf.Yield"(%7) : (tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = "tf.C"(%4) {_xla_outside_compilation="cluster0"} : (tensor) -> tensor + %10 = "tf.IfRegion"(%6) ({ + %9 = "tf.D"(%8) {_xla_outside_compilation="cluster0"} : (tensor) -> tensor + "tf.Yield"(%9) : (tensor) -> () + }, { + "tf.Yield"(%arg2) : (tensor) -> () + }) { is_stateless = false} : (tensor) -> tensor + "tf.Yield"(%8, %10) : (tensor, tensor) -> () + }) {is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + + %5 = "tf.E"() : () -> tensor + tf_device.return %5 : tensor + }) {num_cores_per_replica = 1, topology = "", device_assignment = []} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor + } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir index 1394bd22dc8..183c7c34d41 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_outside_compilation_cluster.mlir @@ -75,7 +75,7 @@ func @two_clusters_no_dependencies() { // CHECK: "tf.opB" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER4:[a-zA-Z_0-9]+]]" // CHECK: "tf.opC" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER4]]" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER5:[a-zA-Z_0-9]+]]" // CHECK: 
"tf.opD" "tf_device.cluster"() ( { "tf.opA"() : () -> () @@ -95,7 +95,6 @@ func @two_clusters_with_one_op_each() { // CHECK-NEXT: "tf.opC" // CHECK-NEXT: "tf.opD" // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER6]]" - // CHECK-SAME: _xla_outside_compilation = "{{[a-zA-Z_0-9]+}}" // CHECK-NEXT: "tf.opE" "tf_device.cluster"() ( { %a = "tf.opA"() : () -> tensor @@ -118,9 +117,8 @@ func @two_clusters_with_two_ops_each() { // CHECK-NEXT: "tf.opD" // CHECK-NEXT: "tf.opE" // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER8]]" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER9:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opF" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER9]]" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER8]]" // CHECK-NEXT: "tf.opG" "tf_device.cluster"() ( { %a = "tf.opA"() : () -> tensor @@ -135,6 +133,27 @@ func @two_clusters_with_two_ops_each() { return } +// CHECK-LABEL: func @resource_side_effect_cycle +func @resource_side_effect_cycle(%arg0: tensor>>, %arg1: tensor>>) { + // CHECK: "tf.ReadVariableOp" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK-NEXT: "tf.AssignVariableOp" + // CHECK-NOT: {_xla_outside_compilation = "[[CLUSTER1]]" + "tf_device.cluster"() ( { + %read0 = "tf.ReadVariableOp"(%arg0) {_xla_outside_compilation = "0"} : (tensor>>) -> tensor + %idet0 = "tf.Identity"(%read0) {_xla_outside_compilation = "0"} : (tensor) -> tensor + "tf.AssignVariableOp"(%arg1, %idet0) : (tensor>>, tensor) -> () + %read1 = "tf.ReadVariableOp"(%arg1) {_xla_outside_compilation = "0"} : (tensor>>) -> tensor + %idet1 = "tf.Identity"(%read1) {_xla_outside_compilation = "0"} : (tensor) -> tensor + %add0 = "tf.AddV2"(%idet0, %idet1) {_xla_outside_compilation = "0"} : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%arg0, %add0) {_xla_outside_compilation = "0"} : (tensor>>, tensor) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + // CHECK-LABEL: func @two_clusters_with_same_parent func @two_clusters_with_same_parent() { // CHECK: "tf.opA" @@ -142,12 +161,11 @@ func @two_clusters_with_same_parent() { // CHECK-NEXT: "tf.opB" // CHECK-NEXT: "tf.opC" // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER10]]" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER11:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opD" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER10]]" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER12:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opE" // CHECK-NEXT: "tf.opF" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER11]]" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER12]]" // CHECK-NEXT: "tf.opG" "tf_device.cluster"() ( { %a = "tf.opA"() {_xla_outside_compilation = "0"} : () -> tensor @@ -168,11 +186,11 @@ func @two_clusters_with_same_outside_compiled_parent() { // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER12:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opB" // CHECK-NEXT: "tf.opC" - // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER12]]" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER13:[a-zA-Z_0-9]+]]" // CHECK-NEXT: "tf.opD" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER12]]" - // CHECK-NEXT: "tf.opE" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER12]]" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER13]]" + // CHECK-NEXT: "tf.Identity" // CHECK-NEXT: "tf.opF" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER13]]" // CHECK-NEXT: "tf.opG" @@ -182,7 +200,7 @@ func 
@two_clusters_with_same_outside_compiled_parent() { %b = "tf.opB"(%a) : (tensor) -> tensor %c = "tf.opC"(%b) {_xla_outside_compilation = "0"} : (tensor) -> tensor %d = "tf.opD"() {_xla_outside_compilation = "0"} : () -> tensor - %e = "tf.opE"(%d) : (tensor) -> tensor + %e = "tf.Identity"(%d) : (tensor) -> tensor %f = "tf.opF"(%e) {_xla_outside_compilation = "0"} : (tensor) -> tensor %g = "tf.opG"(%c, %f) {_xla_outside_compilation = "0"} : (tensor, tensor) -> tensor tf_device.return @@ -213,14 +231,14 @@ func @outside_compile_with_block() { // CHECK-NEXT: "tf.opB" // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER15]]" // CHECK: "tf.opC" - // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER15]]" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER15]]" "tf_device.cluster"() ( { %a = "tf.opA"() {_xla_outside_compilation = "0"} : () -> tensor - %b = "tf.opB"() {_xla_outside_compilation = "0"} : () -> tensor + %b = "tf.opB"(%a) {_xla_outside_compilation = "0"} : (tensor) -> tensor "tf_device.cluster" () ( { tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () - %c = "tf.opC"() {_xla_outside_compilation = "0"} : () -> tensor + %c = "tf.opC"(%b) {_xla_outside_compilation = "0"} : (tensor) -> tensor tf_device.return }) {cluster_attr = "cluster_attr"} : () -> () return @@ -235,7 +253,6 @@ func @two_clusters_with_one_op_each_with_indirect_dependency() { // CHECK-NEXT: "tf.opD" // CHECK-NEXT: "tf.opE" // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER16]]" - // CHECK-SAME: _xla_outside_compilation = "{{[a-zA-Z_0-9]+}}" // CHECK-NEXT: "tf.opF" "tf_device.cluster"() ( { %a = "tf.opA"() : () -> tensor @@ -248,3 +265,277 @@ func @two_clusters_with_one_op_each_with_indirect_dependency() { }) {cluster_attr = "cluster_attr"} : () -> () return } + +// CHECK-LABEL: func @check_ops_with_data_dependency_added_as_host_cluster +func @check_ops_with_data_dependency_added_as_host_cluster() { + // CHECK: "tf.opA" + // CHECK-NEXT: "tf.opB" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER16:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Identity" + // CHECK-NEXT: "tf.Identity" + // CHECK-NEXT: "tf.opE" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER16]]" + // CHECK-NEXT: "tf.opF" + "tf_device.cluster"() ( { + %a = "tf.opA"() : () -> tensor + %b = "tf.opB"(%a) {_xla_outside_compilation = "0"} : (tensor) -> tensor + %c = "tf.Identity"(%b) : (tensor) -> tensor + %d = "tf.Identity"(%c) : (tensor) -> tensor + %e = "tf.opE"(%d, %b, %c) {_xla_outside_compilation = "0"} : (tensor, tensor, tensor) -> tensor + "tf.opF"(%e) : (tensor) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @check_op_inside_nested_region_clustered +func @check_op_inside_nested_region_clustered(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK: "tf.IfRegion" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK-NEXT: "tf.WriteSummary" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.IfRegion"(%0) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.B"() : () -> (tensor) + %3 = "tf.C"() : () -> (tensor) + %4 = "tf.Const"() {_xla_outside_compilation = "auto0", value = dense<"logits"> : tensor} : () -> tensor + 
%5 = "tf.Const"() {_xla_outside_compilation = "auto1", value = dense<"\0A\09\0A\07scalars"> : tensor} : () -> tensor + "tf.WriteSummary"(%arg0, %2, %3, %4, %5) {_xla_outside_compilation = "auto2", device = "/device:CPU:0"} : (tensor<*x!tf.resource>, tensor, tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @check_ops_inside_different_block_clustered +func @check_ops_inside_different_block_clustered(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.B" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.C" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18:[a-zA-Z_0-9]+]]" + // CHECK: "tf.IfRegion" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK-NEXT: "tf.WriteSummary" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK: "tf.Const" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18]]" + // CHECK-NEXT: "tf.D" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER18]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.B"() {_xla_outside_compilation = "auto1"} : () -> (tensor) + %3 = "tf.C"() {_xla_outside_compilation = "auto2"} : () -> (tensor) + "tf.IfRegion"(%0) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %4 = "tf.Const"() {_xla_outside_compilation = "auto3", value = dense<"logits"> : tensor} : () -> tensor + %5 = "tf.Const"() {_xla_outside_compilation = "auto4", value = dense<"\0A\09\0A\07scalars"> : tensor} : () -> tensor + "tf.WriteSummary"(%arg0, %2, %3, %4, %5) {_xla_outside_compilation = "auto2", device = "/device:CPU:0"} : (tensor<*x!tf.resource>, tensor, tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %4 = "tf.Const"() {_xla_outside_compilation = "auto5", value = dense<"a"> : tensor} : () -> tensor + "tf.D"(%3, %4, %1) {_xla_outside_compilation = "auto6"} : (tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @check_clustering_ops_inside_nested_control_flow +func @check_clustering_ops_inside_nested_control_flow(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.B" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17:[a-zA-Z_0-9]+]]" + // CHECK-NEXT: "tf.C" + // CHECK: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK: "tf.IfRegion" + // CHECK: "tf.IfRegion" + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK-NEXT: "tf.Const" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + // CHECK-NEXT: "tf.WriteSummary" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER17]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.B"() {_xla_outside_compilation = "auto1"} : () -> (tensor) + 
%3 = "tf.C"() {_xla_outside_compilation = "auto2"} : () -> (tensor) + "tf.IfRegion"(%0) ( { + %6 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.IfRegion"(%6) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + %4 = "tf.Const"() {_xla_outside_compilation = "auto3", value = dense<"logits"> : tensor} : () -> tensor + %5 = "tf.Const"() {_xla_outside_compilation = "auto4", value = dense<"\0A\09\0A\07scalars"> : tensor} : () -> tensor + "tf.WriteSummary"(%arg0, %2, %3, %4, %5) {_xla_outside_compilation = "auto2", device = "/device:CPU:0"} : (tensor<*x!tf.resource>, tensor, tensor, tensor, tensor) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + "tf.Yield"(%6) : (tensor) -> () + }, { + %7 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%7) : (tensor) -> () + }) { is_stateless = true } : (tensor) -> tensor + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @single_variant_input +func @single_variant_input() { + // CHECK: "tf.opA" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK: "tf.opB" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK: "tf.opC" + "tf_device.cluster"() ( { + %1= "tf.opA"() : () -> tensor>> + "tf.opB"(%1) {_xla_outside_compilation = "0"} : (tensor>>) -> () + "tf.opC"() : () -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @chained_variant_input +func @chained_variant_input() { + // CHECK: "tf.opA" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK: "tf.opB" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK: "tf.opC" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + "tf_device.cluster"() ( { + %1 = "tf.opA"() : () -> tensor>> + %2 = "tf.opB"(%1) : (tensor>>) -> (tensor>>) + "tf.opC"(%2) {_xla_outside_compilation = "0"} : (tensor>>) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @single_variant_output +func @single_variant_output() { + // CHECK: "tf.opA" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK: "tf.opB" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK: "tf.opC" + "tf_device.cluster"() ( { + %1= "tf.opA"() {_xla_outside_compilation = "0"} : () -> tensor>> + "tf.opB"(%1) : (tensor>>) -> () + "tf.opC"() : () -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @chained_variant_output +func @chained_variant_output() { + // CHECK: "tf.opA" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK: "tf.opB" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK: "tf.opC" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + "tf_device.cluster"() ( { + %1 = "tf.opA"() {_xla_outside_compilation = "0"} : () -> tensor>> + %2 = "tf.opB"(%1) : (tensor>>) -> (tensor>>) + "tf.opC"(%2) : (tensor>>) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @variant_input_output +func @variant_input_output() { + // CHECK: "tf.opA" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK: "tf.opB" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + // CHECK: "tf.opC" + // 
CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1]]" + "tf_device.cluster"() ( { + %1 = "tf.opA"() : () -> tensor>> + %2 = "tf.opB"(%1) {_xla_outside_compilation = "0"} : (tensor>>) -> (tensor>>) + "tf.opC"(%2) : (tensor>>) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @variant_input_nested +func @variant_input_nested(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK-NEXT: "tf.Const" + // CHECK-NEXT: "tf.C" + // CHECK-SAME: _xla_outside_compilation = "[[CLUSTER1:[a-zA-Z_0-9]+]]" + // CHECK: "tf.IfRegion" + // CHECK: "tf.opD" + // CHECK-NOT: _xla_outside_compilation = "[[CLUSTER1]]" + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + %2 = "tf.C"() {_xla_outside_compilation = "auto0"} : () -> (tensor>>) + "tf.IfRegion"(%0) ( { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.opD"(%2) : (tensor>>) -> () + "tf.Yield"(%1) : (tensor) -> () + }, { + %1 = "tf.Const"() {value = dense : tensor} : () -> tensor + "tf.Yield"(%1) : (tensor) -> () + }) { is_stateless = true, _xla_outside_compilation = "auto1" } : (tensor) -> tensor + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} + +// CHECK-LABEL: func @variant_output_nested +func @variant_output_nested(%arg0 : tensor<*x!tf.resource>) { + // CHECK: tf_device.cluster + // CHECK: "tf.IfRegion" + // CHECK: "tf.C" + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.D" + // CHECK-NOT: _xla_outside_compilation + // CHECK: "tf.Yield" + // CHECK: _xla_outside_compilation + "tf_device.cluster"() ( { + %0 = "tf.Const"() {value = dense : tensor} : () -> tensor + %1 = "tf.IfRegion"(%0) ( { + %2 = "tf.C"() : () -> (tensor>>) + "tf.Yield"(%2) : (tensor>>) -> () + }, { + %2 = "tf.D"() : () -> (tensor>>) + "tf.Yield"(%2) : (tensor>>) -> () + }) { is_stateless = true, _xla_outside_compilation = "auto1" } : (tensor) -> tensor>> + "tf.E"(%1) {_xla_outside_compilation = "auto0"} : (tensor>>) -> () + tf_device.return + }) {cluster_attr = "cluster_attr"} : () -> () + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_parallel_execute_sink_resource_write.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_parallel_execute_sink_resource_write.mlir new file mode 100644 index 00000000000..ad4433c1d20 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_parallel_execute_sink_resource_write.mlir @@ -0,0 +1,137 @@ +// RUN: tf-opt %s -tf-tpu-parallel-execute-sink-resource-write | FILECHECK_OPTS="" FileCheck %s + +// CHECK-LABEL: func @multiple_uses +// CHECK-SAME: ({{.+}}: tensor, [[ARG1:%.+]]: tensor) +func @multiple_uses(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0 : tensor + }, { + tf_device.return %arg0 : tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG1]], [[PARALLEL_EXECUTE]]#0) + "tf.AssignVariableOp"(%arg1, %0#0) : (tensor, tensor) -> () + // CHECK-NEXT: return [[PARALLEL_EXECUTE]]#0 + return %0#0 : tensor +} + +// CHECK-LABEL: func @not_assign_var +// CHECK-SAME: ({{.+}}: tensor, [[ARG1:%.+]]: tensor) +func @not_assign_var(%arg0: tensor, %arg1: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0 : tensor + }, { + tf_device.return %arg0 : tensor + // CHECK: }) : () -> 
(tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignAddVariableOp"([[ARG1]], [[PARALLEL_EXECUTE]]#0) + "tf.AssignAddVariableOp"(%arg1, %0#0) : (tensor, tensor) -> () + return +} + +// CHECK-LABEL: func @resource_handle_output +// CHECK-SAME: ([[ARG0:%.+]]: tensor, {{.+}}: tensor) +func @resource_handle_output(%arg0: tensor, %arg1: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg1 : tensor + }, { + tf_device.return %arg1 : tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: "tf.AssignVariableOp"([[PARALLEL_EXECUTE]]#0, [[ARG0]]) + "tf.AssignVariableOp"(%0#0, %arg0) : (tensor, tensor) -> () + return +} + +// CHECK-LABEL: func @resource_handle_and_value_output +func @resource_handle_and_value_output(%arg0: tensor, %arg1: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0, %arg1 : tensor, tensor + }, { + tf_device.return + }) : () -> (tensor, tensor) + // CHECK: "tf.AssignVariableOp"([[PARALLEL_EXECUTE]]#1, [[PARALLEL_EXECUTE]]#0) + "tf.AssignVariableOp"(%0#1, %0#0) : (tensor, tensor) -> () + return +} + +// CHECK-LABEL: func @resource_handle_after_parallel_execute +func @resource_handle_after_parallel_execute(%arg0: tensor) { + // CHECK: [[PARALLEL_EXECUTE:%.+]]:2 = "tf_device.parallel_execute" + %0:2 = "tf_device.parallel_execute"() ( { + tf_device.return %arg0 : tensor + }, { + tf_device.return %arg0 : tensor + // CHECK: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor) + // CHECK-NEXT: [[VAR:%.+]] = "tf.VarHandleOp" + %1 = "tf.VarHandleOp"() {container = "", shape = #tf.shape<>, shared_name = "x"} : () -> tensor>> + // CHECK-NEXT: "tf.AssignVariableOp"([[VAR]], [[PARALLEL_EXECUTE]]#0) + "tf.AssignVariableOp"(%1, %0#0) : (tensor>>, tensor) -> () + return +} + +// CHECK-LABEL: func @replace_single_output +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor, [[ARG3:%.+]]: tensor) +func @replace_single_output(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) { + // CHECK: {{%.+}}:2 = "tf_device.parallel_execute" + %0:3 = "tf_device.parallel_execute"() ( { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG3]], [[ARG1]]) + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG2]] : tensor, tensor + tf_device.return %arg0, %arg1, %arg2 : tensor, tensor, tensor + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: tf_device.return + tf_device.return + // CHECK-NEXT: }) : () -> (tensor, tensor) + }) : () -> (tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg3, %0#1) : (tensor, tensor) -> () + // CHECK-NEXT: return + return +} + +// CHECK-LABEL: func @replace_multiple_outputs +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor, [[ARG3:%.+]]: tensor, [[ARG4:%.+]]: tensor, [[ARG5:%.+]]: tensor, [[ARG6:%.+]]: tensor) +func @replace_multiple_outputs(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor) { + // CHECK: {{%.+}}:3 = "tf_device.parallel_execute" + %0:5 = "tf_device.parallel_execute"() ( { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG5]], [[ARG1]]) + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG6]], [[ARG3]]) + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG2]], [[ARG4]] : tensor, tensor, tensor + tf_device.return %arg0, %arg1, %arg2, %arg3, %arg4 : tensor, tensor, tensor, tensor, tensor + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: 
tf_device.return + tf_device.return + // CHECK-NEXT: }) : () -> (tensor, tensor, tensor) + }) : () -> (tensor, tensor, tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg5, %0#1) : (tensor, tensor) -> () + "tf.AssignVariableOp"(%arg6, %0#3) : (tensor, tensor) -> () + // CHECK-NEXT: return + return +} + +// CHECK-LABEL: func @replace_multiple_outputs_regions +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor, [[ARG2:%.+]]: tensor, [[ARG3:%.+]]: tensor, [[ARG4:%.+]]: tensor, [[ARG5:%.+]]: tensor, [[ARG6:%.+]]: tensor, [[ARG7:%.+]]: tensor) +func @replace_multiple_outputs_regions(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor) { + // CHECK: {{%.+}}:4 = "tf_device.parallel_execute" + %0:6 = "tf_device.parallel_execute"() ( { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG6]], [[ARG1]]) + // CHECK-NEXT: tf_device.return [[ARG0]], [[ARG2]] : tensor, tensor + tf_device.return %arg0, %arg1, %arg2 : tensor, tensor, tensor + // CHECK-NEXT: }, { + }, { + // CHECK-NEXT: "tf.AssignVariableOp"([[ARG7]], [[ARG4]]) + // CHECK-NEXT: tf_device.return [[ARG3]], [[ARG5]] : tensor, tensor + tf_device.return %arg3, %arg4, %arg5 : tensor, tensor, tensor + // CHECK-NEXT: }) : () -> (tensor, tensor, tensor, tensor) + }) : () -> (tensor, tensor, tensor, tensor, tensor, tensor) + "tf.AssignVariableOp"(%arg6, %0#1) : (tensor, tensor) -> () + "tf.AssignVariableOp"(%arg7, %0#4) : (tensor, tensor) -> () + // CHECK-NEXT: return + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir index 2e3e38c7004..a3d5a43a214 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir @@ -227,3 +227,28 @@ func @pcall_func_body(%arg0: tensor<*xi1>) -> tensor { %2 = "tf.D"(%1) : (tensor<*xi1>) -> (tensor) return %2 : tensor } + +// ----- + +// Tests that output sharding inside a functional op is parsed correctly. + +// CHECK-LABEL: func @check_sharding_inside_functional_op +func @check_sharding_inside_functional_op(%arg0: tensor<*xi32>) { + "tf_device.cluster_func"(%arg0) {func = @cluster_func, step_marker_location = ""} : (tensor<*xi32>) -> tensor<*xi32> + // CHECK: input_sharding_configuration + // CHECK-SAME: ["\01\02\03"] + // CHECK: output_sharding_configuration + // CHECK-SAME: ["\01\02\03"] + return +} + +func @cluster_func(%arg0: tensor<*xi32>) -> tensor<*xi32> { + %0 = "tf.PartitionedCall"(%arg0) {f= @func_body, config="", config_proto="", executor_type=""} : (tensor<*xi32>) -> tensor<*xi32> + return %0 : tensor<*xi32> +} + +func @func_body(%arg0: tensor<*xi32>)-> tensor<*xi32> { + %0 = "tf.XlaSharding"(%arg0) { _XlaSharding = "\01\02\03" } : (tensor<*xi32>) -> tensor<*xi32> + %1 = "tf.Identity"(%0) : (tensor<*xi32>) -> (tensor<*xi32>) + return %1 : tensor<*xi32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc index de73dff8b0b..fe0c5bea44e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.cc @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h" - #include #include #include @@ -41,7 +39,48 @@ namespace mlir { namespace TF { namespace { -// Replace TF BatchMatMul by TF Einsum + +// Replace TF BatchMatMul by TF Einsum op +template +class ConvertTFBatchMatMulToEinsumOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BatchMatMulOpType op, + PatternRewriter& rewriter) const override { + Value input_lhs = op.x(); + Value input_rhs = op.y(); + + // LHS and RHS must be a ranked tensor type + auto lhs_type = input_lhs.getType().dyn_cast(); + auto rhs_type = input_rhs.getType().dyn_cast(); + + if (!lhs_type || !rhs_type) return failure(); + + auto lhs_shape = lhs_type.getShape(); + auto rhs_shape = rhs_type.getShape(); + + // Ensure that input ranks are at least 2. + const int dims_a = lhs_shape.size(); + const int dims_b = rhs_shape.size(); + if (dims_a < 2 || dims_b < 2) { + return failure(); + } + + // einsum equation for batchmatmul + std::string equation("...mk,...kn->...mn"); + if (op.adj_x()) std::swap(equation[3], equation[4]); + if (op.adj_y()) std::swap(equation[6 + 3], equation[6 + 4]); + + rewriter.replaceOpWithNewOp( + op, op.getType(), + /*inputs=*/ValueRange({input_lhs, input_rhs}), + /*equation=*/equation); + + return success(); + } +}; + struct BatchMatMulToEinsumPass : public PassWrapper { void runOnFunction() override; @@ -57,65 +96,10 @@ void BatchMatMulToEinsumPass::runOnFunction() { applyPatternsAndFoldGreedily(func, patterns); } -} // namespace - -template -LogicalResult -ConvertTFBatchMatMulToEinsumOp::matchAndRewrite( - BatchMatMulOpType op, PatternRewriter& rewriter) const { - Value input_lhs = op.x(); - Value input_rhs = op.y(); - - if (!input_lhs.getType().isa()) { - // LHS must be a ranked tensor type - return failure(); - } - if (!input_rhs.getType().isa()) { - // RHS must be a ranked tensor type - return failure(); - } - - auto lhs_type = input_lhs.getType().dyn_cast(); - auto rhs_type = input_rhs.getType().dyn_cast(); - - if (!lhs_type || !rhs_type) { - return failure(); - } - - auto lhs_shape = lhs_type.getShape(); - auto rhs_shape = rhs_type.getShape(); - - Location loc = op.getLoc(); - - // Ensure that input ranks are at least 2. - const int dims_a = lhs_shape.size(); - const int dims_b = rhs_shape.size(); - if (dims_a < 2 || dims_b < 2) { - // Both inputs must have rank >= 2 - return failure(); - } - - // einsum equation for batchmatmul - std::string equation("...mk,...kn->...mn"); - - if (op.adj_x()) { - std::swap(equation[3], equation[4]); - } - if (op.adj_y()) { - std::swap(equation[6 + 3], equation[6 + 4]); - } - - llvm::SmallVector inputs = {input_lhs, input_rhs}; - rewriter.replaceOpWithNewOp(op, op.getType(), - /*inputs=*/ValueRange(inputs), - /*equation=*/equation); - - return success(); -} - -static PassRegistration pass( +PassRegistration pass( "tf-batch-matmul-to-tf-einsum", "Replace TF BatchMatMul op by TF Einsum op."); +} // namespace std::unique_ptr> CreateBatchMatMulToEinsumPass() { return std::make_unique(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h b/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h deleted file mode 100644 index d39f3575b4a..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/batchmatmul_to_einsum.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ - -#include "llvm/ADT/ArrayRef.h" -#include "mlir/IR/Location.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/core/util/matmul_bcast.h" - -namespace mlir { -namespace TF { - -// Replace TF BatchMatMul by TF Einsum op -template -class ConvertTFBatchMatMulToEinsumOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite( - BatchMatMulOpType op, - PatternRewriter& rewriter) const override; // NOLINT -}; - -} // namespace TF -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_BATCHMATMUL_TO_EINSUM_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index 0c21078b0ad..eccbe5feaec 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -22,6 +22,7 @@ limitations under the License. #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" namespace mlir { @@ -57,6 +58,7 @@ tensorflow::Status RunTPUBridge( ModuleOp module, bool enable_logging, llvm::function_ref pipeline_builder) { PassManager bridge(module.getContext()); + ::tensorflow::applyTensorflowAndCLOptions(bridge); if (enable_logging) EnableLogging(&bridge); // Populate a passmanager with the list of passes that implement the bridge. @@ -98,18 +100,20 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { // Run another shape inference pass because resource decomposition might have // created new partial types. 
pm.addPass(TF::CreateTFShapeInferencePass()); - pm.addPass(TFDevice::CreateResourceOpLiftingPass()); pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); pm.addPass(mlir::createInlinerPass()); + pm.addPass(CreateTPUClusterCleanupAttributesPass()); + pm.addPass(TFDevice::CreateResourceOpLiftingPass()); pm.addPass(TFDevice::CreateMarkOpsForOutsideCompilationPass()); pm.addPass(CreateTPUExtractHeadTailOutsideCompilationPass()); + pm.addPass(CreateTPUOutsideCompilationClusterPass()); pm.addPass(CreateTPUExtractOutsideCompilationPass()); - pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); pm.addNestedPass(tf_executor::CreateTFExecutorConstantSinkingPass()); pm.addPass(TF::CreateResourceDeviceInferencePass()); pm.addPass(TFDevice::CreateClusterOutliningPass()); pm.addPass(CreateTPUDynamicPaddingMapperPass()); + pm.addPass(CreateTPUResourceReadForWritePass()); pm.addPass(CreateTPUShardingIdentificationPass()); pm.addPass(TFDevice::CreateAnnotateParameterReplicationPass()); pm.addPass(CreateTPURewritePass()); @@ -117,7 +121,9 @@ void CreateTPUBridgePipeline(OpPassManager &pm) { pm.addNestedPass(TFDevice::CreateReplicateInvariantOpHoistingPass()); pm.addNestedPass(CreateTPUDynamicLayoutPass()); pm.addNestedPass(CreateTPUMergeVariablesWithExecutePass()); + pm.addNestedPass(CreateTPUColocateCompositeResourceOps()); pm.addPass(CreateTPUVariableReformattingPass()); + pm.addPass(TF::CreateTFRegionControlFlowToFunctional()); } void CreateTPUBridgePipelineV1(OpPassManager &pm) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc index 2b8ab85be38..e85058a1964 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_formation.cc @@ -39,6 +39,10 @@ namespace { struct ClusterFormationPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override; }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc index 57a5cd888a1..cde07503e75 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/collection_ops_util.cc @@ -181,14 +181,14 @@ llvm::Optional GetElementTypeFromAccess( llvm::function_ref(Operation*)> infer_from_op) { for (auto& use : collection.getUses()) { if (auto while_op = llvm::dyn_cast(use.getOwner())) { - auto body = while_op.body_func(); + auto body = while_op.body_function(); assert(body); auto type_from_body = GetElementTypeFromAccess( body.getArgument(use.getOperandNumber()), module, infer_from_op); if (type_from_body.hasValue()) return type_from_body; } else if (auto if_op = llvm::dyn_cast(use.getOwner())) { - auto then_branch = if_op.then_func(); - auto else_branch = if_op.else_func(); + auto then_branch = if_op.then_function(); + auto else_branch = if_op.else_function(); assert(then_branch && else_branch); auto type_from_then = GetElementTypeFromAccess( then_branch.getArgument(use.getOperandNumber() - 1), module, diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 3005c78c54f..31cfc5ebf9c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc 
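For reference, the ConvertTFBatchMatMulToEinsumOp pattern moved into batchmatmul_to_einsum.cc earlier in this patch derives its einsum equation by swapping characters of "...mk,...kn->...mn" in place when adj_x / adj_y are set. A minimal standalone C++ sketch of that derivation (illustrative only, not part of the patch):

// Standalone sketch: how the einsum equation used by the BatchMatMul-to-Einsum
// rewrite changes when adj_x / adj_y are set.  Indices 3/4 cover "mk" on the
// LHS operand, indices 9/10 (i.e. 6+3/6+4) cover "kn" on the RHS operand,
// mirroring the std::swap calls in the pattern above.
#include <iostream>
#include <string>
#include <utility>

std::string BatchMatMulEquation(bool adj_x, bool adj_y) {
  std::string equation("...mk,...kn->...mn");
  if (adj_x) std::swap(equation[3], equation[4]);          // lhs transposed: "...km"
  if (adj_y) std::swap(equation[6 + 3], equation[6 + 4]);  // rhs transposed: "...nk"
  return equation;
}

int main() {
  std::cout << BatchMatMulEquation(false, false) << "\n";  // ...mk,...kn->...mn
  std::cout << BatchMatMulEquation(true, false) << "\n";   // ...km,...kn->...mn
  std::cout << BatchMatMulEquation(false, true) << "\n";   // ...mk,...nk->...mn
  std::cout << BatchMatMulEquation(true, true) << "\n";    // ...km,...nk->...mn
  return 0;
}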
@@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/eval_util.h" #include "tensorflow/core/platform/mutex.h" @@ -72,7 +73,8 @@ LogicalResult ConstantFoldFallbackHook( SmallVectorImpl& results) { // NOLINT // Instructions with side effects should not be constant folded to preserve // the original semantics. - if (inst->getNumRegions() != 0 || !MemoryEffectOpInterface::hasNoEffect(inst)) + if (inst->hasTrait() || + inst->getNumRegions() != 0 || !MemoryEffectOpInterface::hasNoEffect(inst)) return failure(); // If any of the result types are variants, don't try to constant fold them. @@ -87,7 +89,7 @@ LogicalResult ConstantFoldFallbackHook( } // Do not execute function calls. - if (llvm::isa(inst)) { + if (llvm::isa(inst)) { return failure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/contraction_fusion.cc b/tensorflow/compiler/mlir/tensorflow/transforms/contraction_fusion.cc new file mode 100644 index 00000000000..b5d09f7a794 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/contraction_fusion.cc @@ -0,0 +1,162 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/UseDefLists.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TF { +namespace { + +// -------------------------------------------------------------------------- // +// Fuse ContractionFusableInterface operations into contraction operation. +// -------------------------------------------------------------------------- // + +template +class FuseIntoContractionOp : public RewritePattern { + public: + FuseIntoContractionOp() + : RewritePattern(PatternBenefit(1), MatchAnyOpTypeTag()) {} + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const override { + auto fusable = dyn_cast(op); + if (!fusable) return failure(); + + auto failed = [&](Twine message) -> LogicalResult { + return rewriter.notifyMatchFailure(op, message); + }; + + // Check if the operation can be fused. + Optional fusion = fusable.GetContractionFusion(); + if (!fusion.hasValue()) { + return failed("returned empty contraction fusion specification"); + } + + // Check if preceeding operation is a BaseOp or FusedOp that we can use for + // fusion. 
+ Operation *fuse_into = nullptr; + Value operand = op->getOperand(0); + + if (BaseOp base_op = operand.getDefiningOp()) { + fuse_into = base_op.getOperation(); + } else if (FusedOp fused_op = operand.getDefiningOp()) { + fuse_into = fused_op.getOperation(); + } else { + return failed("input to the fusable op must be a " + + BaseOp::getOperationName() + " or a " + + FusedOp::getOperationName()); + } + + // Operand result must have one use, because we do not want to compute + // tensor contraction twice. + if (!fuse_into->getResult(0).hasOneUse()) { + return failed("fused into op result must have one use"); + } + + MLIRContext *ctx = op->getContext(); + + // Build a fused MatMul operation from a base MatMul and a fusion. + SmallVector locations = {fuse_into->getLoc(), op->getLoc()}; + Location loc = rewriter.getFusedLoc(locations); + + // Fusion can't change the type of a fused operation. + Type result_ty = fuse_into->getResult(0).getType(); + + // Copy all operands from a base op and add additional fusion arguments. + SmallVector operands(fuse_into->getOperands()); + for (int idx : fusion->additional_arguments) { + operands.push_back(op->getOperand(idx)); + } + + // Copy attributes from a base op that we fuse into (e.g. copy all + // MatMul or Conv attributes to the fused operation). + SmallVector attrs(fuse_into->getAttrs().begin(), + fuse_into->getAttrs().end()); + + // Add fusion specific additional attributes. + for (auto attr : fusion->additional_attributes) { + attrs.push_back(attr); + } + + // Add a fused output kernel name to the list of fusions. + Identifier fusion_id = Identifier::get("fusion", ctx); + StringAttr fusion_name = StringAttr::get(fusion->output_kernel, ctx); + + auto is_fusion = [&](const NamedAttribute &attr) -> bool { + return attr.first == fusion_id; + }; + + if (isa(fuse_into)) { + NamedAttribute fusion_attr(fusion_id, ArrayAttr::get({fusion_name}, ctx)); + attrs.push_back(fusion_attr); + + } else { + ArrayAttr arr = + llvm::find_if(attrs, is_fusion)->second.template cast(); + llvm::erase_if(attrs, is_fusion); + + auto rng = arr.getAsRange(); + SmallVector updated(rng.begin(), rng.end()); + updated.push_back(fusion_name); + + attrs.push_back(NamedAttribute(fusion_id, ArrayAttr::get(updated, ctx))); + } + + // Update all uses of a fusable op with a new fused operation. 
+ Value fused = rewriter.create(loc, result_ty, operands, attrs); + rewriter.replaceOp(op, {fused}); + + return failure(); + } +}; + +// -------------------------------------------------------------------------- // + +using FuseIntoMatMulOp = FuseIntoContractionOp; + +struct ContractionFusionPass + : public PassWrapper { + void runOnFunction() override; +}; + +void ContractionFusionPass::runOnFunction() { + FuncOp func = getFunction(); + + OwningRewritePatternList patterns; + patterns.insert(); + applyPatternsAndFoldGreedily(func, patterns); +} + +} // namespace + +std::unique_ptr> CreateContractionFusionPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-contraction-fusion", + "Fuses operations implementing ContractionFusionInterface into the " + "contraction operations"); + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc index 4737f44ae1e..28a5c583919 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.cc @@ -73,7 +73,7 @@ static Type GetResourceSubtype(Value resource) { void PopulateDecomposeResourceOpsPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - populateWithGenerated(context, patterns); + populateWithGenerated(context, *patterns); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index 40339cebd31..4ed0307e2ef 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -85,7 +85,7 @@ def DecomposeResourceApplyMomentumOpNonNesterov : $var_resource, $accum_resource, $lr, $grad, $momentum, BoolAttr:$_, ConstBoolAttrFalse:$use_nesterov ), - [(TF_AddOp:$accum_new + [(TF_AddV2Op:$accum_new (TF_MulOp (CreateTFReadVariableOp $src_op, $grad, $accum_resource), $momentum @@ -107,7 +107,7 @@ def DecomposeResourceApplyMomentumOpNesterov : $var_resource, $accum_resource, $lr, $grad, $momentum, BoolAttr:$_, ConstBoolAttrTrue:$use_nesterov ), - [(TF_AddOp:$accum_new + [(TF_AddV2Op:$accum_new (TF_MulOp (CreateTFReadVariableOp $src_op, $grad, $accum_resource), $momentum @@ -117,7 +117,7 @@ def DecomposeResourceApplyMomentumOpNesterov : (TF_AssignVariableOp $accum_resource, $accum_new), (TF_AssignSubVariableOp $var_resource, - (TF_AddOp + (TF_AddV2Op (TF_MulOp $grad, $lr), (TF_MulOp $accum_new, (TF_MulOp $momentum, $lr)) ) @@ -175,7 +175,7 @@ def DecomposeResourceApplyKerasMomentumOpNesterov : ] >; -// Pattern to Decompose ResourceApplyAdagrad. +// Pattern to Decompose ResourceApplyAdagradV2. // This decomposition is only correct inside XLA as it ignores use_locking // attribute. // accum <- accum + grad * grad @@ -201,6 +201,21 @@ def DecomposeResourceApplyAdagradV2 : ] >; +// ResourceApplyAdagrad op can be canonicalized to ResourceApplyAdagradV2 with +// zero epsilon and then decomposed using DecomposeResourceApplyAdagradV2 +// pattern. 
+def DecomposeResourceApplyAdagrad : + Pattern< + (TF_ResourceApplyAdagradOp $var_resource, $accum_resource, $lr, $grad, + $use_locking, $update_slots), + [ + (TF_ConstOp:$zero_epsilon (GetScalarOfType<0> $grad)), + (TF_ResourceApplyAdagradV2Op $var_resource, $accum_resource, $lr, + $zero_epsilon, $grad, $use_locking, $update_slots + ) + ]>; + + // Pattern to Decompose ResourceApplyAdam without Nesterov momentum. // This decomposition is only correct inside XLA as it ignores use_locking // attribute. @@ -342,7 +357,7 @@ def DecomposeResourceApplyCenteredRMSProp : ), [(TF_ConstOp:$one (GetScalarOfType<1> $grad)), (CreateTFReadVariableOp $src_op, $grad, $ms_resource), - (TF_AddOp:$ms_new + (TF_AddV2Op:$ms_new (TF_MulOp (TF_MulOp $grad, $grad), (TF_SubOp $one, $rho) @@ -354,7 +369,7 @@ def DecomposeResourceApplyCenteredRMSProp : ), (TF_AssignVariableOp $ms_resource, $ms_new), // mg = grad * (one - rho) + mg * rho; - (TF_AddOp:$mg_new + (TF_AddV2Op:$mg_new (TF_MulOp $grad, (TF_SubOp $one, $rho) @@ -366,7 +381,7 @@ def DecomposeResourceApplyCenteredRMSProp : ), (TF_AssignVariableOp $mg_resource, $mg_new), // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms - mg * mg + epsilon) - (TF_AddOp:$mom_new + (TF_AddV2Op:$mom_new (TF_MulOp $momentum, (CreateTFReadVariableOp $src_op, $grad, $mom_resource)), (TF_DivOp @@ -374,7 +389,7 @@ def DecomposeResourceApplyCenteredRMSProp : (TF_SqrtOp (TF_SubOp $ms_new, - (TF_AddOp + (TF_AddV2Op (TF_MulOp $mg_new, $mg_new @@ -390,3 +405,45 @@ def DecomposeResourceApplyCenteredRMSProp : (TF_AssignSubVariableOp $var_resource, $mom_new) ] >; + +// This decomposition is only correct inside XLA as it ignores use_locking +// attribute. +// ms <- rho * ms_{t-1} + (1-rho) * grad * grad +// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) +// var <- var - mom +def DecomposeResourceApplyRMSProp : + Pattern< + (TF_ResourceApplyRMSPropOp:$src_op + $var_resource, $ms_resource, $mom_resource, $lr, $rho, $momentum, $epsilon, + $grad, ConstBoolAttrFalse:$use_locking + ), + [(TF_ConstOp:$one (GetScalarOfType<1> $grad)), + (CreateTFReadVariableOp $src_op, $grad, $ms_resource), + // ms <- rho * ms_{t-1} + (1-rho) * grad * grad + (TF_AddV2Op:$ms_new + (TF_MulOp + (CreateTFReadVariableOp $src_op, $grad, $ms_resource), + $rho + ), + (TF_MulOp + (TF_SquareOp $grad), + (TF_SubOp $one, $rho) + ) + ), + (TF_AssignVariableOp $ms_resource, $ms_new), + // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) + (TF_AddV2Op:$mom_new + (TF_MulOp $momentum, + (CreateTFReadVariableOp $src_op, $grad, $mom_resource)), + (TF_DivOp + (TF_MulOp $lr, $grad), + (TF_SqrtOp + (TF_AddV2Op $ms_new, $epsilon) + ) + ) + ), + (TF_AssignVariableOp $mom_resource, $mom_new), + // var <- var - mom + (TF_AssignSubVariableOp $var_resource, $mom_new) + ] + >; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc index 69dab58c3f5..c3d43c27ac5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/einsum.cc @@ -16,12 +16,16 @@ limitations under the License. 
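For reference, the DecomposeResourceApplyRMSProp and DecomposeResourceApplyAdagrad patterns above implement the update rules spelled out in their comments. A minimal scalar sketch of those rules (illustrative only, not part of the patch; the AdagradV2 variable update shown here assumes the usual var <- var - lr * grad / (sqrt(accum) + epsilon) form, which with the injected zero epsilon reduces to the classic Adagrad step):

// Scalar sanity check of the decomposed optimizer updates.  Values are
// arbitrary and only serve to exercise the formulas.
#include <cmath>
#include <cstdio>

int main() {
  // RMSProp, matching DecomposeResourceApplyRMSProp:
  //   ms  <- rho * ms_{t-1} + (1 - rho) * grad * grad
  //   mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
  //   var <- var - mom
  double var = 1.0, ms = 0.5, mom = 0.1;
  const double lr = 0.01, rho = 0.9, momentum = 0.9, epsilon = 1e-7, grad = 2.0;
  ms = rho * ms + (1.0 - rho) * grad * grad;
  mom = momentum * mom + lr * grad / std::sqrt(ms + epsilon);
  var = var - mom;
  std::printf("rmsprop: ms=%f mom=%f var=%f\n", ms, mom, var);

  // Adagrad expressed as AdagradV2 with a zero epsilon, matching
  // DecomposeResourceApplyAdagrad (assumed AdagradV2 update form):
  //   accum <- accum + grad * grad
  //   var   <- var - lr * grad / (sqrt(accum) + 0)
  double accum = 0.5, var2 = 1.0;
  accum = accum + grad * grad;
  var2 = var2 - lr * grad / (std::sqrt(accum) + 0.0);
  std::printf("adagrad: accum=%f var=%f\n", accum, var2);
  return 0;
}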
#include "tensorflow/compiler/mlir/tensorflow/transforms/einsum.h" #include +#include #include #include #include "absl/memory/memory.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" @@ -43,130 +47,6 @@ namespace TF { namespace { -// All supported Einsum equations. -enum EinsumEquation { - BatchMatMul, - FourDMatrixDotProd, - ThreeDReshapeTail, - FourDBatchMatMul, - BroadcastMatMul, - ReduceSum, - TransposeMatMul, - BatchMatMulReducedDim, - TransposeReducedDim, - FourDReduceLast, - FourDTransposeAll, - UnsupportedEquation -}; - -// Tokens for parsing the given equation string. -enum EquationToken { - A, - B, - C, - D, - E, - COMMA, - ARROW, -}; -constexpr int kNumSupportedEquationVariables = 5; // A - E for now. - -bool tokenizeEquation(const llvm::StringRef& equation, - std::vector* tokens) { - std::map label_axis_mapping; - size_t index = 0; - int variable_count = 0; - llvm::Regex r("[[:alpha:]]"); - while (index < equation.size()) { - if (r.match(equation.substr(index, 1))) { - const char ltr = equation[index]; - auto itr = label_axis_mapping.find(ltr); - if (itr == label_axis_mapping.end() && - variable_count < kNumSupportedEquationVariables) { - label_axis_mapping[ltr] = EquationToken(variable_count); - tokens->push_back(EquationToken(variable_count)); - variable_count++; - } else if (itr != label_axis_mapping.end()) { - tokens->push_back(itr->second); - } else { - // Ran out of equation variables. - return false; - } - } else if (equation.substr(index, 1).contains(",")) { - tokens->push_back(COMMA); - } else if ((index < (equation.size() - 1)) && - (equation.substr(index, 2).contains("->"))) { - tokens->push_back(ARROW); - index++; - } else { - // Unallowed character encountered. 
- return false; - } - index++; - } - return true; -} - -EinsumEquation parseEquation(const std::vector& eqn) { - auto is_equal = [](const std::vector& eqn1, - const std::initializer_list& eqn2) { - return std::equal(eqn1.begin(), eqn1.end(), eqn2.begin(), eqn2.end()); - }; - // IJK,IKM->IJM - if (is_equal(eqn, {A, B, C, COMMA, A, C, D, ARROW, A, B, D})) { - return EinsumEquation::BatchMatMul; - } - // BFND,NDH->BFH - if (is_equal(eqn, {A, B, C, D, COMMA, C, D, E, ARROW, A, B, E})) { - return EinsumEquation::FourDMatrixDotProd; - } - // BFNH,BTNH->BNFT - if (is_equal(eqn, {A, B, C, D, COMMA, A, E, C, D, ARROW, A, C, B, E})) { - return EinsumEquation::FourDBatchMatMul; - } - // BFD,DNH->BFNH - if (is_equal(eqn, {A, B, C, COMMA, C, D, E, ARROW, A, B, D, E})) { - return EinsumEquation::ThreeDReshapeTail; - } - // BFH,HO->BFO - if (is_equal(eqn, {A, B, C, COMMA, C, D, ARROW, A, B, D})) { - return EinsumEquation::BroadcastMatMul; - } - // LBH,BL->BH - if (is_equal(eqn, {A, B, C, COMMA, B, A, ARROW, B, C})) { - return EinsumEquation::ReduceSum; - } - // LBH,BKL->BKH - if (is_equal(eqn, {A, B, C, COMMA, B, D, A, ARROW, B, D, C})) { - return EinsumEquation::TransposeMatMul; - } - // BIN,BINJ->BIJ - if (is_equal(eqn, {A, B, C, COMMA, A, B, C, D, ARROW, A, B, D})) { - return EinsumEquation::BatchMatMulReducedDim; - } - // BIJ,BINJ->BIN - if (is_equal(eqn, {A, B, C, COMMA, A, B, D, C, ARROW, A, B, D})) { - return EinsumEquation::TransposeReducedDim; - } - // ABCD,ADBE->ACBE - if (is_equal(eqn, {A, B, C, D, COMMA, A, D, B, E, ARROW, A, C, B, E})) { - return EinsumEquation::FourDReduceLast; - } - // ABCD,AECD->ACEB - if (is_equal(eqn, {A, B, C, D, COMMA, A, E, C, D, ARROW, A, C, E, B})) { - return EinsumEquation::FourDTransposeAll; - } - return EinsumEquation::UnsupportedEquation; -} - -EinsumEquation tokenizeAndParse(const llvm::StringRef& equation) { - std::vector tokens; - if (tokenizeEquation(equation, &tokens)) { - return parseEquation(tokens); - } - return EinsumEquation::UnsupportedEquation; -} - TF::TransposeOp createTransposeOp(Value value, Location loc, llvm::ArrayRef permutation, PatternRewriter* rewriter) { @@ -186,28 +66,6 @@ TF::TransposeOp createTransposeOp(Value value, Location loc, perm_op); } -TF::SumOp createSumOp(Value value, Location loc, - llvm::ArrayRef redux_axes, - PatternRewriter* rewriter) { - auto value_type = value.getType().cast(); - auto shape = value_type.getShape(); - auto redux_type = RankedTensorType::get( - {static_cast(redux_axes.size())}, rewriter->getIntegerType(32)); - auto redux_attr = DenseElementsAttr::get(redux_type, redux_axes); - auto redux_op = rewriter->create(loc, redux_type, redux_attr); - std::vector sum_shape(shape.size() - redux_axes.size()); - int count = 0; - for (int i = 0, end = shape.size(); i < end; ++i) { - if (std::find(redux_axes.begin(), redux_axes.end(), i) == - redux_axes.end()) { - sum_shape[count] = shape[i]; - count++; - } - } - auto sum_type = RankedTensorType::get(sum_shape, value_type.getElementType()); - return rewriter->create(loc, sum_type, value, redux_op); -} - TF::ReshapeOp createReshapeOp(Value value, ArrayRef shape, Type element_type, Location loc, PatternRewriter* rewriter) { @@ -222,241 +80,277 @@ TF::ReshapeOp createReshapeOp(Value value, ArrayRef shape, /*shape=*/shape_tensor); } +struct EinsumDimensionNumbers { + // Each field contains the list of dimensions appearing only in the specifed + // arguments of the einsum op with natural ordering. 
For example `rhs_out` + // contains the dimensions appearing in the RHS and the OUTPUT of the einsum + // but not in the LHS. + std::vector lhs; + std::vector rhs; + std::vector> lhs_rhs; + std::vector> lhs_out; + std::vector> rhs_out; + std::vector> lhs_rhs_out; +}; + +llvm::Optional> EquationToMap( + llvm::StringRef equation) { + llvm::SmallDenseMap map; + for (int64_t i = 0; i < equation.size(); ++i) { + if (!std::isalpha(equation[i])) { + // Unsupported character in the equation. + return llvm::None; + } + if (map.count(equation[i])) { + // Duplicate character in the equation. + return llvm::None; + } + map.try_emplace(equation[i], i); + } + return map; +} + +llvm::Optional GetEinsumDimensionNumbers( + llvm::StringRef equation) { + llvm::StringRef lhs_rhs; + llvm::StringRef out; + std::tie(lhs_rhs, out) = equation.split("->"); + if (lhs_rhs.empty() || out.empty()) return llvm::None; + + llvm::StringRef lhs; + llvm::StringRef rhs; + std::tie(lhs, rhs) = lhs_rhs.split(','); + if (lhs.empty() || rhs.empty()) return llvm::None; + + auto lhs_map_or = EquationToMap(lhs); + if (!lhs_map_or.hasValue()) return llvm::None; + auto lhs_map = lhs_map_or.getValue(); + + auto rhs_map_or = EquationToMap(rhs); + if (!rhs_map_or.hasValue()) return llvm::None; + auto rhs_map = rhs_map_or.getValue(); + + auto out_map_or = EquationToMap(out); + if (!out_map_or.hasValue()) return llvm::None; + auto out_map = out_map_or.getValue(); + + EinsumDimensionNumbers dnums; + for (int64_t i = 0, e = lhs.size(); i < e; ++i) { + auto rhs_index = rhs_map.find(lhs[i]); + auto out_index = out_map.find(lhs[i]); + if (rhs_index == rhs_map.end() && out_index == out_map.end()) { + dnums.lhs.emplace_back(i); + } else if (rhs_index == rhs_map.end()) { + dnums.lhs_out.emplace_back(i, out_index->second); + } else if (out_index == out_map.end()) { + dnums.lhs_rhs.emplace_back(i, rhs_index->second); + } else { + dnums.lhs_rhs_out.emplace_back(i, rhs_index->second, out_index->second); + } + } + for (int64_t i = 0, e = rhs.size(); i < e; ++i) { + auto lhs_index = lhs_map.find(rhs[i]); + auto out_index = out_map.find(rhs[i]); + if (lhs_index == lhs_map.end()) { + if (out_index == out_map.end()) { + dnums.rhs.emplace_back(i); + } else { + dnums.rhs_out.emplace_back(i, out_index->second); + } + } + } + for (int64_t i = 0, e = out.size(); i < e; ++i) { + auto lhs_index = lhs_map.find(out[i]); + auto rhs_index = rhs_map.find(out[i]); + if (lhs_index == lhs_map.end() && rhs_index == rhs_map.end()) { + // out only isn't supported + return llvm::None; + } + } + return dnums; +} + +std::vector inverseTransposeVector( + llvm::ArrayRef input, llvm::ArrayRef permutation) { + std::vector output(input.size()); + for (int64_t i = 0; i < input.size(); ++i) { + output[permutation[i]] = input[i]; + } + return output; +} + +// Computes the transpositions required to convert dnums to one supported by +// tf.BatchMatmulV2 and returns the new set of dimension numbers with them. 
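+// For illustration (assuming the equation "bij,bkj->bik"): 'b' lands in
+// lhs_rhs_out, 'i' in lhs_out, 'j' (the contracted label) in lhs_rhs and 'k'
+// in rhs_out, so only the RHS actually needs reordering, from [b, k, j] to
+// [b, j, k], before the operands can be fed to tf.BatchMatMulV2.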
+LogicalResult transposeForBatchMatmul( + const Location& loc, EinsumDimensionNumbers& dnums, Value* lhs, Value* rhs, + std::vector* out_inverse_transpose, PatternRewriter* rewriter) { + std::vector lhs_transpose; + std::vector rhs_transpose; + std::vector out_transpose; + lhs_transpose.reserve(dnums.lhs_rhs_out.size() + dnums.lhs_out.size() + + dnums.lhs_rhs.size()); + rhs_transpose.reserve(dnums.lhs_rhs_out.size() + dnums.rhs_out.size() + + dnums.lhs_rhs.size()); + out_transpose.reserve(dnums.lhs_rhs_out.size() + dnums.lhs_out.size() + + dnums.rhs_out.size()); + for (int64_t i = 0, e = dnums.lhs_rhs_out.size(); i < e; ++i) { + lhs_transpose.push_back(std::get<0>(dnums.lhs_rhs_out[i])); + rhs_transpose.push_back(std::get<1>(dnums.lhs_rhs_out[i])); + out_transpose.push_back(std::get<2>(dnums.lhs_rhs_out[i])); + dnums.lhs_rhs_out[i] = std::make_tuple(i, i, i); + } + + for (int64_t i = 0, e = dnums.lhs_out.size(); i < e; ++i) { + lhs_transpose.push_back(std::get<0>(dnums.lhs_out[i])); + out_transpose.push_back(std::get<1>(dnums.lhs_out[i])); + dnums.lhs_out[i] = + std::make_tuple(lhs_transpose.size() - 1, out_transpose.size() - 1); + } + for (int64_t i = 0, e = dnums.lhs_rhs.size(); i < e; ++i) { + lhs_transpose.push_back(std::get<0>(dnums.lhs_rhs[i])); + rhs_transpose.push_back(std::get<1>(dnums.lhs_rhs[i])); + dnums.lhs_rhs[i] = + std::make_tuple(lhs_transpose.size() - 1, rhs_transpose.size() - 1); + } + for (int64_t i = 0, e = dnums.rhs_out.size(); i < e; ++i) { + rhs_transpose.push_back(std::get<0>(dnums.rhs_out[i])); + out_transpose.push_back(std::get<1>(dnums.rhs_out[i])); + dnums.rhs_out[i] = + std::make_tuple(rhs_transpose.size() - 1, out_transpose.size() - 1); + } + + out_inverse_transpose->resize(out_transpose.size()); + for (int64_t i = 0, e = out_transpose.size(); i < e; ++i) { + out_inverse_transpose->at(out_transpose[i]) = i; + } + + *lhs = createTransposeOp(*lhs, loc, lhs_transpose, rewriter); + *rhs = createTransposeOp(*rhs, loc, rhs_transpose, rewriter); + return success(); +} + +// Reshapes LHS and RHS to have B0,...,Bn,L,C and B0,...,Bn,C,R shape +// respectively while assuming that the initial shape for them is +// B0,...,Bn,L0,...,Ln,C0,...,Cn and B0,...,Bn,C0,...,Cn,R0,...,Rn respectively. 
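+// For example (illustrative shapes): for "bxyc,bcz->bxyz" with operands of
+// shape [B, X, Y, C] and [B, C, Z], the LHS is reshaped to [B, X*Y, C], the
+// RHS stays [B, C, Z], and the resulting [B, X*Y, Z] matmul output is later
+// reshaped back to [B, X, Y, Z].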
+LogicalResult reshapeForBatchMatmul(const Location& loc, + EinsumDimensionNumbers& dnums, Value* lhs, + Value* rhs, std::vector* out_shape, + PatternRewriter* rewriter) { + RankedTensorType lhs_type = lhs->getType().cast(); + RankedTensorType rhs_type = rhs->getType().cast(); + + std::vector lhs_shape; + std::vector rhs_shape; + lhs_shape.reserve(dnums.lhs_rhs_out.size() + dnums.lhs_out.size() + 1); + rhs_shape.reserve(dnums.lhs_rhs_out.size() + 2); + for (auto i : dnums.lhs_rhs_out) { + int64_t b = lhs_type.getShape()[std::get<0>(i)]; + lhs_shape.push_back(b); + rhs_shape.push_back(b); + out_shape->push_back(b); + } + + if (dnums.lhs_out.empty()) { + lhs_shape.push_back(1); + out_shape->push_back(1); + dnums.lhs_out.emplace_back(lhs_shape.size() - 1, out_shape->size() - 1); + } else if (dnums.lhs_rhs_out.empty()) { + for (auto i : dnums.lhs_out) { + int64_t b = lhs_type.getShape()[std::get<0>(i)]; + lhs_shape.push_back(b); + out_shape->push_back(b); + } + } else { + int64_t lhs_out_size = 1; + for (auto i : dnums.lhs_out) { + lhs_out_size *= lhs_type.getShape()[std::get<0>(i)]; + } + lhs_shape.push_back(lhs_out_size); + out_shape->push_back(lhs_out_size); + } + + int64_t lhs_rhs_size = 1; + for (auto i : dnums.lhs_rhs) { + lhs_rhs_size *= lhs_type.getShape()[std::get<0>(i)]; + } + lhs_shape.push_back(lhs_rhs_size); + rhs_shape.push_back(lhs_rhs_size); + + int64_t rhs_size = 1; + for (auto i : dnums.rhs_out) { + rhs_size *= rhs_type.getShape()[std::get<0>(i)]; + } + rhs_shape.push_back(rhs_size); + out_shape->push_back(rhs_size); + + *lhs = createReshapeOp(*lhs, lhs_shape, lhs_type.getElementType(), loc, + rewriter); + *rhs = createReshapeOp(*rhs, rhs_shape, rhs_type.getElementType(), loc, + rewriter); + + dnums.lhs_rhs.assign( + {std::make_tuple(dnums.lhs_rhs_out.size() + dnums.lhs_out.size(), + dnums.lhs_rhs_out.size())}); + dnums.rhs_out.assign( + {std::make_tuple(dnums.lhs_rhs_out.size() + dnums.lhs_out.size(), + dnums.lhs_rhs_out.size() + dnums.lhs_out.size())}); + return success(); +} + +LogicalResult rewriteToBatchMatmul(TF::EinsumOp op, + EinsumDimensionNumbers dnums, + PatternRewriter& rewriter) { + if (!dnums.lhs.empty() || !dnums.rhs.empty()) return failure(); + + auto inputs = op.inputs(); + if (inputs.size() != 2) return failure(); + Value lhs = inputs.front(); + Value rhs = inputs.back(); + + RankedTensorType original_type = + op.getResult().getType().dyn_cast_or_null(); + if (!original_type) return failure(); + + std::vector out_transpose; + if (failed(transposeForBatchMatmul(op.getLoc(), dnums, &lhs, &rhs, + &out_transpose, &rewriter))) + return failure(); + + std::vector matmul_shape; + if (failed(reshapeForBatchMatmul(op.getLoc(), dnums, &lhs, &rhs, + &matmul_shape, &rewriter))) + return failure(); + + auto matmul_type = + RankedTensorType::get(matmul_shape, original_type.getElementType()); + Value out = rewriter.create( + op.getLoc(), matmul_type, lhs, rhs, rewriter.getBoolAttr(false), + rewriter.getBoolAttr(false)); + + out = createReshapeOp( + out, inverseTransposeVector(original_type.getShape(), out_transpose), + original_type.getElementType(), op.getLoc(), &rewriter); + out = createTransposeOp(out, op.getLoc(), out_transpose, &rewriter); + + rewriter.replaceOp(op, out); + return success(); +} + } // namespace LogicalResult ConvertTFEinsumOp::matchAndRewrite( TF::EinsumOp op, PatternRewriter& rewriter) const { - Type output_type = op.getResult().getType(); - Value lhs = op.getOperand(0); - Value rhs = op.getOperand(1); - Location loc = op.getLoc(); - if 
(!lhs.getType().isa()) { - // LHS must be a ranked tensor type - return failure(); - } - if (!rhs.getType().isa()) { - // RHS must be a ranked tensor type - return failure(); - } + const auto dnums_or = GetEinsumDimensionNumbers(op.equation()); + if (!dnums_or.hasValue()) return failure(); + const auto& dnums = dnums_or.getValue(); - auto lhs_type = lhs.getType().cast(); - auto rhs_type = rhs.getType().cast(); - auto lhs_shape = lhs_type.getShape(); - auto rhs_shape = rhs_type.getShape(); + RankedTensorType lhs = + op.getOperand(0).getType().dyn_cast_or_null(); + RankedTensorType rhs = + op.getOperand(1).getType().dyn_cast_or_null(); + if (!lhs || !rhs) return failure(); - // Currently only support static shapes. - if (!(lhs_type.hasStaticShape() && rhs_type.hasStaticShape())) { - return failure(); - } - - // Currently support use cases of LHS dims \in {3,4} RHS dims \in {2, 3, 4} - const int dims_lhs = lhs_shape.size(); - const int dims_rhs = rhs_shape.size(); - if (dims_lhs < 3 || dims_lhs > 4 || dims_rhs < 2 || dims_rhs > 4) { - return failure(); - } - - EinsumEquation einsum_eqn = tokenizeAndParse(op.equation()); - if (einsum_eqn == EinsumEquation::BatchMatMul) { - // Case "IJK,IKM->IJM" - auto bmm_op = rewriter.create( - loc, ArrayRef{output_type}, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); - rewriter.replaceOp(op, bmm_op.getResult()); - return success(); - } - if (einsum_eqn == EinsumEquation::BroadcastMatMul) { - // Case "BFH,HO->BFO" - auto bmm_op = rewriter.create( - loc, ArrayRef{output_type}, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); - rewriter.replaceOp(op, bmm_op.getResult()); - return success(); - } - if (einsum_eqn == EinsumEquation::ReduceSum) { - // Case "LBH,BL->BH" - // Transpose LHS - lhs = createTransposeOp(lhs, loc, {1, 2, 0}, &rewriter); - // Reshape RHS - auto rhs_element_type = rhs_type.getElementType(); - const int rhs_dim0 = rhs_shape[0]; - const int rhs_dim1 = rhs_shape[1]; - auto reshaped_rhs = createReshapeOp(rhs, {rhs_dim0, 1, rhs_dim1}, - rhs_element_type, loc, &rewriter); - auto mul_op = rewriter.create(loc, lhs, reshaped_rhs); - - auto sum_op = createSumOp(mul_op, loc, {2}, &rewriter); - rewriter.replaceOp(op, {sum_op.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::TransposeMatMul) { - // Case "LBH,BKL->BKH" - // Transpose LHS - lhs = createTransposeOp(lhs, loc, {1, 2, 0}, &rewriter); - // Transpose RHS - rhs = createTransposeOp(rhs, loc, {0, 2, 1}, &rewriter); - std::vector bmm_shape = {lhs_shape[1], lhs_shape[2], rhs_shape[1]}; - auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); - auto bmm_op = rewriter.create( - loc, ArrayRef{bmm_type}, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); - - auto trans_bmm = createTransposeOp(bmm_op, loc, {0, 2, 1}, &rewriter); - rewriter.replaceOp(op, {trans_bmm.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::ThreeDReshapeTail) { - // Case "BFD,DNH->BFNH" - auto lhs_type = lhs.getType().cast(); - auto lhs_shape = lhs_type.getShape(); - const int lhs_dim0 = lhs_shape[0]; - const int lhs_dim1 = lhs_shape[1]; - // Reshape RHS - auto rhs_type = rhs.getType().cast(); - auto rhs_shape = rhs_type.getShape(); - auto rhs_element_type = rhs_type.getElementType(); - const int rhs_dim0 = rhs_shape[0]; - const int rhs_dim1 = rhs_shape[1]; - const int rhs_dim2 = rhs_shape[2]; - auto reshaped_rhs = createReshapeOp(rhs, {rhs_dim0, rhs_dim1 * rhs_dim2}, - rhs_element_type, loc, 
&rewriter); - - std::vector bmm_shape = {lhs_dim0, lhs_dim1, rhs_dim1 * rhs_dim2}; - auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); - auto bmm_op = rewriter.create( - loc, ArrayRef{bmm_type}, lhs, reshaped_rhs, - rewriter.getBoolAttr(false), rewriter.getBoolAttr(false)); - auto bmm_element_type = bmm_type.getElementType(); - auto final_reshape = - createReshapeOp(bmm_op, {lhs_dim0, lhs_dim1, rhs_dim1, rhs_dim2}, - bmm_element_type, loc, &rewriter); - rewriter.replaceOp(op, {final_reshape.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::FourDMatrixDotProd) { - // Case "BFND,NDH->BFH" - // Reshape LHS - auto lhs_element_type = lhs_type.getElementType(); - const int lhs_dim0 = lhs_shape[0]; - const int lhs_dim1 = lhs_shape[1]; - const int lhs_dim2 = lhs_shape[2]; - const int lhs_dim3 = lhs_shape[3]; - auto reshaped_lhs = - createReshapeOp(lhs, {lhs_dim0, lhs_dim1, lhs_dim2 * lhs_dim3}, - lhs_element_type, loc, &rewriter); - // Reshape RHS - auto rhs_element_type = rhs_type.getElementType(); - const int rhs_dim0 = rhs_shape[0]; - const int rhs_dim1 = rhs_shape[1]; - const int rhs_dim2 = rhs_shape[2]; - auto reshaped_rhs = createReshapeOp(rhs, {rhs_dim0 * rhs_dim1, rhs_dim2}, - rhs_element_type, loc, &rewriter); - auto bmm_op = rewriter.create( - loc, ArrayRef{output_type}, reshaped_lhs, reshaped_rhs, - rewriter.getBoolAttr(false), rewriter.getBoolAttr(false)); - rewriter.replaceOp(op, {bmm_op.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::FourDBatchMatMul) { - // Case "BFNH,BTNH->BNFT" - // Transpose LHS - lhs = createTransposeOp(lhs, loc, {0, 2, 1, 3}, &rewriter); - // Transpose RHS - rhs = createTransposeOp(rhs, loc, {0, 2, 3, 1}, &rewriter); - auto bmm_op = rewriter.create( - loc, ArrayRef{output_type}, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); - rewriter.replaceOp(op, {bmm_op.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::BatchMatMulReducedDim) { - // Case "BIN,BINJ->BIJ" - // Reshape LHS - auto lhs_element_type = lhs_type.getElementType(); - const int lhs_dim0 = lhs_shape[0]; - const int lhs_dim1 = lhs_shape[1]; - const int lhs_dim2 = lhs_shape[2]; - const int rhs_dim3 = rhs_shape[3]; - - auto reshaped_lhs = createReshapeOp(lhs, {lhs_dim0, lhs_dim1, 1, lhs_dim2}, - lhs_element_type, loc, &rewriter); - std::vector bmm_shape = {lhs_dim0, lhs_dim1, 1, rhs_dim3}; - auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); - auto bmm_op = rewriter.create( - loc, ArrayRef{bmm_type}, reshaped_lhs, rhs, - rewriter.getBoolAttr(false), rewriter.getBoolAttr(false)); - - auto bmm_element_type = bmm_type.getElementType(); - auto final_reshape = createReshapeOp(bmm_op, {lhs_dim0, lhs_dim1, rhs_dim3}, - bmm_element_type, loc, &rewriter); - rewriter.replaceOp(op, {final_reshape.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::TransposeReducedDim) { - // Case "BIJ,BINJ->BIN" - // Reshape LHS - auto lhs_element_type = lhs_type.getElementType(); - const int lhs_dim0 = lhs_shape[0]; - const int lhs_dim1 = lhs_shape[1]; - const int lhs_dim2 = lhs_shape[2]; - const int rhs_dim2 = rhs_shape[2]; - - auto reshaped_lhs = createReshapeOp(lhs, {lhs_dim0, lhs_dim1, 1, lhs_dim2}, - lhs_element_type, loc, &rewriter); - // Transpose RHS - rhs = createTransposeOp(rhs, loc, {0, 1, 3, 2}, &rewriter); - std::vector bmm_shape = {lhs_dim0, lhs_dim1, 1, rhs_dim2}; - auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); - 
auto bmm_op = rewriter.create( - loc, ArrayRef{bmm_type}, reshaped_lhs, rhs, - rewriter.getBoolAttr(false), rewriter.getBoolAttr(false)); - - auto bmm_element_type = bmm_type.getElementType(); - auto final_reshape = createReshapeOp(bmm_op, {lhs_dim0, lhs_dim1, rhs_dim2}, - bmm_element_type, loc, &rewriter); - rewriter.replaceOp(op, {final_reshape.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::FourDReduceLast) { - // Case "acbe,aecd->abcd" - const int lhs_dim2 = lhs_shape[2]; - const int rhs_dim0 = rhs_shape[0]; - const int rhs_dim2 = rhs_shape[2]; - const int rhs_dim3 = rhs_shape[3]; - // Transpose RHS - rhs = createTransposeOp(rhs, loc, {0, 2, 1, 3}, &rewriter); - std::vector bmm_shape = {rhs_dim0, rhs_dim2, lhs_dim2, rhs_dim3}; - auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); - auto bmm_op = rewriter.create( - loc, ArrayRef{bmm_type}, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); - - auto trans_bmm = createTransposeOp(bmm_op, loc, {0, 2, 1, 3}, &rewriter); - rewriter.replaceOp(op, {trans_bmm.getResult()}); - return success(); - } - if (einsum_eqn == EinsumEquation::FourDTransposeAll) { - // Case "aecd,abcd->acbe" - const int lhs_dim0 = lhs_shape[0]; - const int lhs_dim1 = lhs_shape[1]; - const int lhs_dim2 = lhs_shape[2]; - const int rhs_dim1 = rhs_shape[1]; - // Transpose LHS - lhs = createTransposeOp(lhs, loc, {0, 2, 1, 3}, &rewriter); - // Transpose RHS - rhs = createTransposeOp(rhs, loc, {0, 2, 3, 1}, &rewriter); - std::vector bmm_shape = {lhs_dim0, lhs_dim2, lhs_dim1, rhs_dim1}; - auto bmm_type = RankedTensorType::get(bmm_shape, rhs_type.getElementType()); - auto bmm_op = rewriter.create( - loc, ArrayRef{bmm_type}, lhs, rhs, rewriter.getBoolAttr(false), - rewriter.getBoolAttr(false)); - - auto trans_bmm = createTransposeOp(bmm_op, loc, {0, 1, 3, 2}, &rewriter); - rewriter.replaceOp(op, {trans_bmm.getResult()}); - return success(); - } - - return failure(); + return rewriteToBatchMatmul(op, dnums, rewriter); } // Transform Einsum to other TF Ops for the supported variants. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index d8678e620f4..a5d76619416 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -157,14 +157,14 @@ static LogicalResult LowerIfOp(IfOp op) { // Set up the 'then' block. Block* then_block = builder.createBlock(merge_block); - Operation* call_op = CallFn(loc, get_operand, op.then_func(), &builder); + Operation* call_op = CallFn(loc, get_operand, op.then_function(), &builder); auto get_then_result = [&](int i) { return call_op->getResult(i); }; JumpToBlock(loc, get_then_result, merge_block, &builder); // Set up the 'else' block. Block* else_block = builder.createBlock(merge_block); - call_op = CallFn(loc, get_operand, op.else_func(), &builder); + call_op = CallFn(loc, get_operand, op.else_function(), &builder); auto get_else_result = [&](int i) { return call_op->getResult(i); }; JumpToBlock(loc, get_else_result, merge_block, &builder); @@ -190,8 +190,8 @@ static LogicalResult LowerWhileOp(WhileOp op) { OpBuilder builder(op_inst); - auto cond_fn = op.cond_func(); - auto body_fn = op.body_func(); + auto cond_fn = op.cond_function(); + auto body_fn = op.body_function(); // Split the block containing the While op into two blocks. 
One containing // operations before the While op and other containing the rest. Create two diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index 11d74e87f96..87733bbbf3f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -98,10 +98,10 @@ LogicalResult ConvertIfOp(IfOp if_op) { if_op.getLoc(), if_op.getResultTypes(), cond, if_op.is_stateless()); CopyDeviceAndUnderscoredAttributes(if_op, if_region); - CreateCall(if_op, if_op.then_func(), + CreateCall(if_op, if_op.then_function(), /*caller_region=*/if_region.then_branch(), if_op.input(), /*use_region_args=*/false); - CreateCall(if_op, if_op.else_func(), + CreateCall(if_op, if_op.else_function(), /*caller_region=*/if_region.else_branch(), if_op.input(), /*use_region_args=*/false); if_op.replaceAllUsesWith(if_region.getResults()); @@ -116,14 +116,14 @@ LogicalResult ConvertWhileOp(WhileOp while_op) { CopyDeviceAndUnderscoredAttributes(while_op, while_region); YieldOp cond_yield = - CreateCall(while_op, while_op.cond_func(), + CreateCall(while_op, while_op.cond_function(), /*caller_region=*/while_region.cond(), while_op.input(), /*use_region_args=*/true); Value i1_cond = ConvertConditionToBoolean(cond_yield, cond_yield.getOperand(0)); cond_yield.setOperand(0, i1_cond); - CreateCall(while_op, while_op.body_func(), + CreateCall(while_op, while_op.body_function(), /*caller_region=*/while_region.body(), while_op.input(), /*use_region_args=*/true); while_op.replaceAllUsesWith(while_region.getResults()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc index 7563f606434..a18d893fac7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_optimization_pass.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" namespace mlir { @@ -39,6 +40,7 @@ Status MlirGraphOptimizationPass::Run(const ConfigProto& config_proto, VLOG(1) << "Run MLIR Graph Optimization Passes"; PassManager pm(module.getContext()); + ::tensorflow::applyTensorflowAndCLOptions(pm); // Run island coarsening before shape inference to allow more exact shape // inference using constant folding within islands. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc index e76a8da0b29..8123f50757e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/layout_optimization.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -33,6 +35,34 @@ namespace mlir { namespace TF { namespace { +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_helpers.inc" + +// Helper method that returns an op from 'transpose_ops' that match criteria +// for an 'operand' and 'permutation' +TransposeOp ReuseExistingTranspose(const OpOperand* operand, + const SmallVector& permutation, + Operation* op, ConstOp permutation_op, + SmallVector* transpose_ops) { + for (auto it = transpose_ops->begin(); it != transpose_ops->end(); ++it) { + auto tranpose_op = *it; + for (auto tranpose_operand : tranpose_op.getOperands()) { + auto ranked_tranpose_type = + tranpose_operand.getType().dyn_cast_or_null(); + if (!ranked_tranpose_type) continue; + if (ranked_tranpose_type.getRank() == permutation.size() && + operand->get().getType() == + ShuffleRankedTensorType(ranked_tranpose_type, permutation)) { + TransposeOp transpose = tranpose_op; + transpose.getOperation()->moveBefore(op); + transpose.setOperand(0, operand->get()); + transpose.setOperand(1, permutation_op); + transpose_ops->erase(it); + return transpose; + } + } + } + return nullptr; +} // LayoutAssignmentPass assigns optimal data layout (data format) for all // layout sensitive operations. @@ -79,18 +109,7 @@ class MoveTransposesPass clEnumValN(Direction::kEnd, "end", "end of the block"))}; }; -using Permutation = SmallVector; - -Permutation GetDataFormatPermutation(StringRef from_data_format, - StringRef to_data_format) { - if (from_data_format == "NHWC" && to_data_format == "NCHW") { - return {0, 3, 1, 2}; - } else if (from_data_format == "NCHW" && to_data_format == "NHWC") { - return {0, 2, 3, 1}; - } else { - llvm_unreachable("Unknown data format combination"); - } -} +using Permutation = SmallVector; void LayoutAssignmentPass::runOnFunction() { FuncOp func = getFunction(); @@ -131,7 +150,7 @@ void LayoutAssignmentPass::runOnFunction() { OpBuilder builder = OpBuilder::atBlockEnd(op->getBlock()); auto perm_attr = [&](Permutation permutation) -> DenseIntElementsAttr { - auto perm_ty = RankedTensorType::get({4}, builder.getIntegerType(32)); + auto perm_ty = RankedTensorType::get({4}, builder.getIntegerType(64)); return DenseIntElementsAttr::get(perm_ty, permutation); }; @@ -202,6 +221,27 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { // Nothing to do here. if (!permutation_op || transpose_ops.empty()) return; + SmallVector permutation; + auto perm_attr = permutation_op.value().cast(); + for (const auto& value : perm_attr.getIntValues()) + permutation.push_back(value.getSExtValue()); + + // We want to make sure the shape of the operand equals the transposed shape. + // mismatch can happen if 'op' supports broadcasting and the operands have + // different ranks. 
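+ // For example (illustrative): a broadcasting tf.AddV2 between a transposed
+ // 4-D activation and a rank-1 bias is left alone, because hoisting the
+ // transpose would also require permuting the lower-rank operand; the move is
+ // only attempted when every operand rank matches the transpose result rank.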
+ if (op->hasTrait()) { + auto transpose_op = *transpose_ops.begin(); + auto result_type = + transpose_op.getResult().getType().dyn_cast_or_null(); + auto is_valid_move = + llvm::all_of(op->getOperands(), [result_type](Value operand) -> bool { + auto operand_type = operand.getType().dyn_cast_or_null(); + return result_type && operand_type && result_type.hasRank() && + operand_type.hasRank() && + result_type.getRank() == operand_type.getRank(); + }); + if (!is_valid_move) return; + } // At this point we checked that we can safely move Transpose node before // `op`, and bypass all result transposes. @@ -228,16 +268,12 @@ void MoveTransposeBefore(Operation* op, SmallVector* work_list) { work_list->push_back(operand_op); // Try to reuse result transposes. - TransposeOp transpose; - if (!transpose_ops.empty()) { - transpose = transpose_ops.pop_back_val(); - transpose.getOperation()->moveBefore(op); - transpose.setOperand(0, operand.get()); - transpose.setOperand(1, permutation_op); - } else { + TransposeOp transpose = ReuseExistingTranspose( + &operand, permutation, op, permutation_op, &transpose_ops); + // If no transpose available for using, create new one. + if (!transpose) transpose = builder.create(loc, operand.get(), permutation_op); - } operand.set(transpose); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index ad241ef9488..8ab348c1e5b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -88,7 +88,7 @@ class ConvertConvOp : public OpConversionPattern { const int input_channels = conv_op.lhs().getType().cast().getDimSize( input_feature_dimension); - int feature_group_count = conv_op.feature_group_count().getSExtValue(); + int feature_group_count = conv_op.feature_group_count(); const bool is_depthwise_conv = input_channels == feature_group_count; std::string padding; @@ -250,7 +250,7 @@ class ConvertSliceOp : public OpConversionPattern { strides.getSplatValue().cast().getInt() != 1) return failure(); - rewriter.setInsertionPointAfter(slice_op); + rewriter.setInsertionPointAfter(slice_op.getOperation()); auto start_indices = slice_op.start_indices(); auto limit_indices = slice_op.limit_indices(); std::vector size_values; @@ -614,7 +614,65 @@ class ConvertReduceOpToTfMin : public OpConversionPattern { }; }; +class ConvertIotaOpToTfRange : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mhlo::IotaOp iota_op, ArrayRef args, + ConversionPatternRewriter &rewriter) const final { + RankedTensorType type = + iota_op.getType().dyn_cast_or_null(); + if (!type) return failure(); + + const uint64_t dimension = iota_op.iota_dimension(); + Type element_type = type.getElementType(); + Attribute start, limit, delta; + if (element_type.isa()) { + start = rewriter.getFloatAttr(element_type, 0.0); + limit = rewriter.getFloatAttr(element_type, type.getShape()[dimension]); + delta = rewriter.getFloatAttr(element_type, 1.0); + } else if (element_type.isa()) { + start = rewriter.getIntegerAttr(element_type, 0); + limit = rewriter.getIntegerAttr(element_type, type.getShape()[dimension]); + delta = rewriter.getIntegerAttr(element_type, 1); + } else { + return failure(); + } + + auto range_type = + RankedTensorType::get({type.getShape()[dimension]}, element_type); + Value start_op = rewriter.create(iota_op.getLoc(), start); + Value limit_op = 
rewriter.create(iota_op.getLoc(), limit); + Value delta_op = rewriter.create(iota_op.getLoc(), delta); + Value result = rewriter.create(iota_op.getLoc(), range_type, + start_op, limit_op, delta_op); + + if (type.getRank() > 1) { + std::vector reshape_shape(type.getRank(), 1); + reshape_shape[iota_op.iota_dimension()] = type.getShape()[dimension]; + auto reshape_type = RankedTensorType::get(reshape_shape, element_type); + Value reshape_shape_op = rewriter.create( + iota_op.getLoc(), rewriter.getI64TensorAttr(reshape_shape)); + result = rewriter.create(iota_op.getLoc(), reshape_type, + result, reshape_shape_op); + + Value broadcast_shape_op = rewriter.create( + iota_op.getLoc(), rewriter.getI64TensorAttr(type.getShape())); + result = rewriter.create(iota_op.getLoc(), type, + result, broadcast_shape_op); + } + + rewriter.replaceOp(iota_op, result); + return success(); + } +}; + class LegalizeHloToTf : public PassWrapper { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: LegalizeHloToTf() = default; LegalizeHloToTf(const LegalizeHloToTf &) {} @@ -763,9 +821,10 @@ static PassRegistration pass( void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList *patterns, MLIRContext *context) { - populateWithGenerated(context, patterns); + populateWithGenerated(context, *patterns); patterns->insert(context); + ConvertReduceOpToTfMin, ConvertReduceOpToTfSum, + ConvertIotaOpToTfRange>(context); } std::unique_ptr> CreateLegalizeHloToTfPass() { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc index 6686b340be9..6c1e6a827c7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lift_variables.cc @@ -176,7 +176,48 @@ LogicalResult LiftVariables(ModuleOp module, Session* session) { if (resource_names.empty()) return success(); - return LiftVariablesFromSession(module, session, resource_names); + if (failed(LiftVariablesFromSession(module, session, resource_names))) + return failure(); + + // Now that we have all global tensors created, we set the corresponding + // bound_inputs' types correctly. + SymbolTable symbol_table(module); + for (auto func : module.getOps()) { + for (auto arg : func.getArguments()) { + unsigned arg_number = arg.getArgNumber(); + auto global_tensor = LookupBoundInputOfType( + func, arg_number, symbol_table); + if (!global_tensor) continue; + + auto arg_type = arg.getType().cast(); + assert(arg_type.getRank() == 0); + llvm::ArrayRef underlying_type = + arg_type.getElementType().cast().getSubtypes(); + + // If the arg type already matches the global_tensor type, we don't need + // to do anything. + if (!underlying_type.empty() && + underlying_type[0] == global_tensor.type()) { + assert(underlying_type.size() == 1); + continue; + } + + // Otherwise, set this argument's type to the global_tensor's type. + auto new_arg_type = mlir::RankedTensorType::get( + /*shape=*/{}, + mlir::TF::ResourceType::get( + /*subtypes=*/{global_tensor.type().cast()}, + module.getContext())); + + arg.setType(new_arg_type); + } + + // Update the function type. 
+ func.setType(mlir::FunctionType::get(func.getArgumentTypes(), + func.getType().getResults(), + module.getContext())); + } + return success(); } } // namespace tf_saved_model diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index 6946dc65104..a462f967bef 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -56,6 +56,14 @@ static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, return DenseIntElementsAttr::get(ty, vals); } +static APFloat ConvertToAPFloat(double val, Type type) { + if (type.getIntOrFloatBitWidth() == 32) { + return APFloat(static_cast(val)); + } + + return APFloat(val); +} + // Returns int, float, or complex DenseElementsAttr with scalar shape with the // given element type and the integer value. static DenseElementsAttr GetScalarOfType(Type ty, int64_t raw_value) { @@ -121,6 +129,17 @@ Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { return RankedTensorType::get(shape, ranked_ty.getElementType()); } +// Converts individual Values to a tensor of rank 1. Each input Value has rank 1 +// and size 1. +Value ValuesToRank1(PatternRewriter &rewriter, Location loc, Type dtype, + ArrayRef vals) { + int64_t length = vals.size(); + auto type = RankedTensorType::get({length}, dtype); + auto axis = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(64), 0)); + return rewriter.create(loc, type, ValueRange(vals), axis); +} + // Lowers AddN op to a sequence of AddV2 ops to accumulate operands. // // Note that to improve the parallelism, AddN op uses tree-based reduction. @@ -160,34 +179,37 @@ Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { // %sum2 = "tf.AddV2"(%sum0, %sum1) // %result = "tf.AddV2"(%sum2, %4) // -class LowerAddNOp : public OpRewritePattern { +class LowerAddNOp : public RewritePattern { public: explicit LowerAddNOp(MLIRContext *context) - : OpRewritePattern(context) {} + : RewritePattern(TF::AddNOp::getOperationName(), + {TF::AddV2Op::getOperationName()}, 1, context) {} - LogicalResult matchAndRewrite(TF::AddNOp op, + LogicalResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const override { + auto addn_op = cast(op); + // TODO(hinsu): Support variant with TensorList type. tf.AddV2 doesn't // support variant type so variant types require special handling. - if (getElementTypeOrSelf(op.getType()).isa()) return failure(); - - llvm::SmallVector operands(op.inputs().begin(), - op.inputs().end()); + if (getElementTypeOrSelf(addn_op.getType()).isa()) + return failure(); + llvm::SmallVector operands(addn_op.inputs().begin(), + addn_op.inputs().end()); int64_t n = operands.size(); // Keep doing tree-based reduction when there are more than one operand. while (n > 1) { for (int64_t i = 0; i < n; i += 2) { // Add two adjacent operands if applicable. - operands[i / 2] = (i + 1 < n) - ? rewriter.create( - op.getLoc(), operands[i], operands[i + 1]) - : operands[i]; + operands[i / 2] = + (i + 1 < n) ? 
rewriter.create( + addn_op.getLoc(), operands[i], operands[i + 1]) + : operands[i]; } n = (n + 1) / 2; } - rewriter.replaceOp(op, operands[0]); + rewriter.replaceOp(addn_op, operands[0]); return success(); } }; @@ -273,7 +295,7 @@ class LowerDynamicStitchOp : public OpRewritePattern { reshaped_data.getType().cast().getShape()[0]; auto items = rewriter.create( loc, SmallVector(num_items, item_ty), reshaped_data, - /*axis=*/APInt(64, 0)); + /*axis=*/0); for (auto index_item : llvm::zip(index_attr, items.getResults())) { int64_t output_index = std::get<0>(index_item).getSExtValue(); Value item = std::get<1>(index_item); @@ -287,6 +309,114 @@ class LowerDynamicStitchOp : public OpRewritePattern { } }; +// This pass performs a manual conversion with FakeQuant, converting between +// floating point and quantized space. It is designed to reproduce TF's +// implementation, mirroring the previous XLA implementation. +// +// 1. Computing proper quantized bounds. This involves nudging the input bounds. +// 2. Converting the input bounds to quantized space, rounding values. +// 3. Convert back into floating point space. +class ConvertFakeQuantWithMinMaxVarsOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::FakeQuantWithMinMaxVarsOp op, + PatternRewriter &rewriter) const override { + auto input = op.inputs(); + auto input_ty = input.getType().cast(); + auto element_ty = input_ty.getElementType(); + auto scalar_ty = RankedTensorType::get({}, element_ty); + + auto num_bits = op.num_bits(); + auto narrow_range = op.narrow_range(); + const double bits_min = narrow_range ? 1 : 0; + const double bits_max = (1 << num_bits) - 1; + + auto float_min = op.min(); + auto float_max = op.max(); + + auto float_diff = + rewriter.create(op.getLoc(), float_max, float_min); + + // Compute the range when quantized. + auto quant_min = rewriter.create( + op.getLoc(), DenseElementsAttr::get( + scalar_ty, ConvertToAPFloat(bits_min, element_ty))); + + auto quant_max = rewriter.create( + op.getLoc(), DenseElementsAttr::get( + scalar_ty, ConvertToAPFloat(bits_max, element_ty))); + + auto quant_diff = rewriter.create( + op.getLoc(), + DenseElementsAttr::get( + scalar_ty, ConvertToAPFloat(bits_max - bits_min, element_ty))); + + auto quant_to_float = + rewriter.create(op.getLoc(), float_diff, quant_diff); + + auto float_to_quant = + rewriter.create(op.getLoc(), quant_diff, float_diff); + + // During quantization, the quantized min/max values may not line up + // perfectly with the specified min/max. Nudge them into the right range. + auto min_scaled = + rewriter.create(op.getLoc(), float_min, quant_to_float); + auto min_scaled_sub = + rewriter.create(op.getLoc(), quant_min, min_scaled); + + auto mid_rounded = + rewriter.create(op.getLoc(), scalar_ty, min_scaled_sub); + + auto nudged_zero_point_val = rewriter.create( + op.getLoc(), scalar_ty, mid_rounded, quant_min, quant_max); + + auto quant_min_sub = rewriter.create(op.getLoc(), quant_min, + nudged_zero_point_val); + auto quant_max_sub = rewriter.create(op.getLoc(), quant_max, + nudged_zero_point_val); + + auto nudged_float_min = + rewriter.create(op.getLoc(), quant_min_sub, quant_to_float); + + auto nudged_float_max = + rewriter.create(op.getLoc(), quant_max_sub, quant_to_float); + + // Now quantize the input value with the approximated min/max values. 
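+ // Roughly, the steps below compute
+ //   q   = floor((clamp(x, nudged_min, nudged_max) - nudged_min) * float_to_quant + 0.5)
+ //   out = q * quant_to_float + nudged_min
+ // i.e. a round trip through the nudged quantized grid.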
+ + // Move the input value into quantized space + Value quantized_input = rewriter.create( + op.getLoc(), input_ty, input, nudged_float_min, nudged_float_max); + + quantized_input = rewriter.create( + op.getLoc(), input_ty, quantized_input, nudged_float_min); + + quantized_input = rewriter.create( + op.getLoc(), input_ty, quantized_input, float_to_quant); + + // Round the quantized input always to the positive direction. + auto half_val = rewriter.create( + op.getLoc(), + DenseElementsAttr::get(scalar_ty, ConvertToAPFloat(0.5, element_ty))); + + quantized_input = rewriter.create(op.getLoc(), input_ty, + quantized_input, half_val); + + quantized_input = + rewriter.create(op.getLoc(), quantized_input); + + // Convert back into floating point spae. + Value output = rewriter.create(op.getLoc(), input_ty, + quantized_input, quant_to_float); + + output = rewriter.create(op.getLoc(), input_ty, output, + nudged_float_min); + + rewriter.replaceOp(op, {output}); + return success(); + } +}; + // Lowers InvertPermutation op to TensorScatterUpdate op. // // Example: @@ -347,6 +477,210 @@ class LowerInvertPermutationOp } }; +// Approximates lgamma using Lanczos' approximation from +// "A Precision Approximation of the Gamma Function". SIAM Journal on Numerical +// Analysis series B. Vol. 1: +// lgamma(z + 1) = (log(2) + log(pi)) / 2 + (z + 1/2) * log(t(z)) - t(z) + A(z) +// t(z) = z + kLanczosGamma + 1/2 +// A(z) = kBaseLanczosCoeff +// + sigma(k = 1, n, kLanczosCoefficients[i] / (z + k)) +// +// Coefficients for the Lanczos approximation of the gamma function. The +// coefficients are uniquely determined by the choice of g and n +// (kLanczosGamma and kLanczosCoefficients.size() + 1). The coefficients below +// correspond to [7, 9]. [5, 7], [7, 9], [9, 10], and [607/128.0, 15] were +// evaluated and [7, 9] seemed to be the least sensitive to the quality of the +// log function. In particular, [5, 7] is the only choice where -1.5e-5 <= +// lgamma(2) <= 1.5e-5 for a particularly inaccurate log function. +static constexpr double kLanczosGamma = 7; // aka g +static constexpr double kBaseLanczosCoeff = 0.99999999999980993227684700473478; +static constexpr std::array kLanczosCoefficients = { + 676.520368121885098567009190444019, -1259.13921672240287047156078755283, + 771.3234287776530788486528258894, -176.61502916214059906584551354, + 12.507343278686904814458936853, -0.13857109526572011689554707, + 9.984369578019570859563e-6, 1.50563273514931155834e-7}; + +class LowerLgammaOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::LgammaOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + Value input = op.x(); + TensorType original_tensor_type = op.x().getType().cast(); + + // The approximation is not precise enough for float16. Do the computation + // in float32 for that case. + TensorType tensor_type = original_tensor_type; + FloatType float_type = tensor_type.getElementType().cast(); + bool needs_cast = float_type.getWidth() < 32; + if (needs_cast) { + MLIRContext *context = rewriter.getContext(); + float_type = FloatType::getF32(context); + if (original_tensor_type.hasRank()) { + tensor_type = + RankedTensorType::get(original_tensor_type.getShape(), float_type); + } else { + tensor_type = UnrankedTensorType::get(float_type); + } + input = rewriter.create(loc, tensor_type, input); + } + + // Helper lambda function for creating a ConstOp for a tensor filled with + // the given constant float value. 
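+ // (Illustrative: create_const_op(0.5) below yields a tf.Const splat of
+ // `tensor_type` holding 0.5, which becomes the one_half constant.)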
+ auto create_const_op = [&rewriter, loc, tensor_type, + float_type](double value) { + return rewriter.create( + loc, DenseElementsAttr::get(tensor_type, + FloatAttr::get(float_type, value))); + }; + + Value one_half = create_const_op(0.5); + Value one = create_const_op(1.0); + Value infinity = create_const_op(std::numeric_limits::infinity()); + Value pi = create_const_op(M_PI); + Value log_pi = create_const_op(std::log(M_PI)); + Value log_sqrt_two_pi = create_const_op((std::log(2) + std::log(M_PI)) / 2); + Value lanczos_gamma_plus_one_half = create_const_op(kLanczosGamma + 0.5); + Value log_lanczos_gamma_plus_one_half = + create_const_op(std::log(kLanczosGamma + 0.5)); + Value base_lanczos_coeff = create_const_op(kBaseLanczosCoeff); + + Value minus_input = rewriter.create(loc, input); + Value input_minus_one = rewriter.create(loc, input, one); + + // If the input is less than 0.5 use Euler's reflection formula: + // gamma(x) = pi / (sin(pi * x) * gamma(1 - x)) + Value need_to_reflect = rewriter.create(loc, input, one_half); + Type tensor_bool_type = need_to_reflect.getType(); + Value z = rewriter.create(loc, need_to_reflect, minus_input, + input_minus_one); + + Value x = base_lanczos_coeff; + for (int i = 0, end = kLanczosCoefficients.size(); i < end; ++i) { + Value lanczos_coefficient = create_const_op(kLanczosCoefficients[i]); + Value index = create_const_op(static_cast(i)); + Value z_plus_index = rewriter.create(loc, z, index); + Value z_plus_index_plus_one = + rewriter.create(loc, z_plus_index, one); + Value incr = rewriter.create(loc, lanczos_coefficient, + z_plus_index_plus_one); + x = rewriter.create(loc, x, incr); + } + + // To improve accuracy on platforms with less-precise log implementations, + // compute log(lanczos_gamma_plus_one_half) at compile time and use log1p on + // the device. + // log(t) = log(kLanczosGamma + 0.5 + z) + // = log(kLanczosGamma + 0.5) + log1p(z / (kLanczosGamma + 0.5)) + Value t = rewriter.create(loc, lanczos_gamma_plus_one_half, z); + Value z_div_lanczos_gamma_plus_one_half = + rewriter.create(loc, z, lanczos_gamma_plus_one_half); + Value log1p_z_div_lanczos_gamma_plus_one_half = + rewriter.create(loc, z_div_lanczos_gamma_plus_one_half); + Value log_t = + rewriter.create(loc, log_lanczos_gamma_plus_one_half, + log1p_z_div_lanczos_gamma_plus_one_half); + + // Compute the final result (modulo reflection). t(z) may be large, and we + // need to be careful not to overflow to infinity in the first term of + // + // (z + 1/2) * log(t(z)) - t(z). + // + // Therefore we compute this as + // + // (z + 1/2 - t(z) / log(t(z))) * log(t(z)). + // + // log_y = log_sqrt_two_pi + (z + one_half - t / log_t) * log_t + Log(x); + Value t_div_log_t = rewriter.create(loc, t, log_t); + Value one_half_minus_t_div_log_t = + rewriter.create(loc, one_half, t_div_log_t); + Value z_plus_one_half_minus_t_div_log_t = + rewriter.create(loc, z, one_half_minus_t_div_log_t); + Value z_plus_one_half_minus_t_div_log_t_mul_log_t = + rewriter.create(loc, z_plus_one_half_minus_t_div_log_t, + log_t); + Value log_x = rewriter.create(loc, x); + Value log_y_rhs = rewriter.create( + loc, z_plus_one_half_minus_t_div_log_t_mul_log_t, log_x); + Value log_y = rewriter.create(loc, log_sqrt_two_pi, log_y_rhs); + + // Compute the reflected value, used when x < 0.5: + // + // lgamma(x) = log(pi) - lgamma(1-x) - log(abs(sin(pi * x))). + // + // (The abs is because lgamma is the log of the absolute value of the gamma + // function.) + // + // We have to be careful when computing the final term above. 
gamma(x) goes + // to +/-inf at every integer x < 0, and this is controlled by the + // sin(pi * x) term. The slope is large, so precision is particularly + // important. + // + // Because abs(sin(pi * x)) has period 1, we can equivalently use + // abs(sin(pi * frac(x))), where frac(x) is the fractional part of x. This + // is more numerically accurate: It doesn't overflow to inf like pi * x can, + // and if x is an integer, it evaluates to 0 exactly, which is significant + // because we then take the log of this value, and log(0) is inf. + // + // We don't have a frac(x) primitive in XLA and computing it is tricky, but + // because abs(sin(pi * x)) = abs(sin(pi * abs(x))), it's good enough for + // our purposes to use abs(frac(x)) = abs(x) - floor(abs(x)). + // + // Furthermore, pi * abs(frac(x)) loses precision when abs(frac(x)) is close + // to 1. To remedy this, we can use the fact that sin(pi * x) in the domain + // [0, 1] is symmetric across the line Y=0.5. + Value abs_input = rewriter.create(loc, input); + Value abs_input_floor = rewriter.create(loc, abs_input); + Value abs_frac_input = + rewriter.create(loc, abs_input, abs_input_floor); + + // Convert values of abs_frac_input > 0.5 to (1 - frac_input) to improve + // precision of pi * abs_frac_input for values of abs_frac_input close to 1. + Value one_minus_abs_frac_input = + rewriter.create(loc, one, abs_frac_input); + Value abs_frac_input_gt_one_half = + rewriter.create(loc, abs_frac_input, one_half); + Value reduced_frac_input = rewriter.create( + loc, abs_frac_input_gt_one_half, one_minus_abs_frac_input, + abs_frac_input); + Value pi_mul_reduced_frac_input = + rewriter.create(loc, pi, reduced_frac_input); + Value sin_pi_mul_reduced_frac_input = + rewriter.create(loc, pi_mul_reduced_frac_input); + Value reflection_denom = + rewriter.create(loc, sin_pi_mul_reduced_frac_input); + + // Avoid computing -inf - inf, which is nan. If reflection_denom is +/-inf, + // then it "wins" and the result is +/-inf. + Value is_finite = rewriter.create(loc, tensor_bool_type, + reflection_denom); + Value neg_reflection_denom = + rewriter.create(loc, reflection_denom); + Value log_pi_minus_reflection_denom = + rewriter.create(loc, log_pi, reflection_denom); + Value reflection_if_finite = + rewriter.create(loc, log_pi_minus_reflection_denom, log_y); + Value reflection = rewriter.create( + loc, is_finite, reflection_if_finite, neg_reflection_denom); + + Value result = rewriter.create(loc, need_to_reflect, + reflection, log_y); + + // lgamma(+/-inf) = +inf. + Value is_inf = rewriter.create(loc, tensor_bool_type, input); + result = rewriter.create(loc, is_inf, infinity, result); + + if (needs_cast) { + result = rewriter.create(loc, original_tensor_type, result); + } + + rewriter.replaceOp(op, result); + return success(); + } +}; + // Lowers Pack op to ConcatV2 op after changing shape of the inputs with // ExpandDims op. // @@ -369,7 +703,7 @@ class LowerPackOp : public OpRewritePattern { loc, DenseElementsAttr::get( RankedTensorType::get({}, rewriter.getIntegerType(64)), op.axis())); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); Type prev_input_ty, inferred_ty; SmallVector expanded_inputs; @@ -393,6 +727,187 @@ class LowerPackOp : public OpRewritePattern { } }; +// Lowers SpaceToBatchND by reducing to reshape(transpose(reshape(pad(input)))). 
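+// (Concrete illustration: an input of shape [1, 4, 4, 1] with
+// block_shape = [2, 2] and zero paddings produces an output of shape
+// [4, 2, 2, 1], following the steps spelled out below.)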
+// +// Before rewrite: +// output = SpaceToBatchND(input, block_shape, paddings) +// Let: +// [batch] + spatial_shape + remaining_shape = input.shape +// M = spatial_shape.rank +// After rewrite: +// padded = zero-pad input with paddings +// The spatial_shape component of input.shape pads with paddings[*, 0] +// before each dimension, and paddings[*, 1] after each dimension. +// reshaped = reshape padded to: +// [batch] +// + [padded.shape[1]/block_shape[0], block_shape[0], ..., +// padded.shape[M]/block_shape[M-1], block_shape[M-1]] +// + remaining_shape +// permuted = transpose reshaped to: +// block_shape +// + [batch] +// + [padded.shape[1]/block_shape[0], ..., padded.shape[M]/block_shape[M-1]] +// + remaining_shape +// result = reshape permuted to: +// [batch * product(block_shape)] +// + [padded.shape[1]/block_shape[0], ..., padded.shape[M]/block_shape[M-1]] +// + remaining_shape +class LowerSpaceToBatchNDOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::SpaceToBatchNDOp op, + PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + auto input_type = op.input().getType().cast(); + if (!input_type.hasStaticShape()) { + return failure(); + } + ArrayRef input_shape = input_type.getShape(); + auto block_shape_type = op.block_shape().getType().cast(); + if (!block_shape_type.hasStaticShape()) { + return failure(); + } + auto paddings_type = op.paddings().getType().cast(); + + int64_t input_rank = input_type.getRank(); + int64_t block_rank = block_shape_type.getNumElements(); + int64_t remaining_rank = input_rank - 1 - block_rank; + if (remaining_rank < 0) { + // TODO(b/157475606): Move this check to ::Verify + return failure(); + } + + auto block_shape_i64_type = RankedTensorType::get( + block_shape_type.getShape(), rewriter.getIntegerType(64)); + auto block_shape_i64 = rewriter.create( + loc, block_shape_i64_type, op.block_shape()); + + auto paddings_i64_type = RankedTensorType::get(paddings_type.getShape(), + rewriter.getIntegerType(64)); + auto paddings_i64 = + rewriter.create(loc, paddings_i64_type, op.paddings()); + + auto pad00 = rewriter.create( + loc, DenseElementsAttr::get( + RankedTensorType::get({1, 2}, rewriter.getIntegerType(64)), + {0, 0})); + SmallVector full_paddings_list{pad00, paddings_i64}; + full_paddings_list.append(remaining_rank, pad00); + auto full_paddings_type = + RankedTensorType::get({input_rank, 2}, rewriter.getIntegerType(64)); + auto zero_i64 = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(64), 0)); + // Extends paddings to all dimensions of input by adding 0s to non-block + // dimensions. 
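+ // E.g. (illustrative) for a rank-4 input and a rank-2 block_shape, the
+ // concatenation is [[0, 0], paddings[0], paddings[1], [0, 0]], giving
+ // full_paddings of shape [4, 2].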
+ auto full_paddings = rewriter.create( + loc, full_paddings_type, full_paddings_list, zero_i64); + + SmallVector padded_shape(input_rank, ShapedType::kDynamicSize); + auto padded_type = + RankedTensorType::get(padded_shape, rewriter.getF32Type()); + // padded = pad(input, full_paddings) + auto padded = + rewriter.create(loc, padded_type, op.input(), full_paddings); + + auto paddings_sum_type = + RankedTensorType::get({input_rank}, rewriter.getIntegerType(64)); + auto one_i64 = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(64), 1)); + // paddings_sum = paddings[*,0] + paddings[*,1] + auto paddings_sum = rewriter.create(loc, paddings_sum_type, + full_paddings, one_i64); + + // input_shape_tensor = input.shape + auto input_shape_tensor = rewriter.create( + loc, + DenseElementsAttr::get( + RankedTensorType::get({input_rank}, rewriter.getIntegerType(64)), + input_shape)); + + // padded_shape_tensor is the shape of padded. + auto padded_shape_tensor = + rewriter.create(loc, paddings_sum, input_shape_tensor); + + auto zero_i32 = rewriter.create( + loc, GetScalarOfType(rewriter.getIntegerType(32), 0)); + SmallVector padded_shape_splits_types( + input_rank, RankedTensorType::get({1}, rewriter.getIntegerType(64))); + SmallVector padded_shape_splits( + rewriter + .create(loc, padded_shape_splits_types, zero_i32, + padded_shape_tensor) + .output()); + + SmallVector block_shape_splits_types( + block_rank, RankedTensorType::get({1}, rewriter.getIntegerType(64))); + SmallVector block_shape_splits( + rewriter + .create(loc, block_shape_splits_types, zero_i32, + block_shape_i64) + .output()); + + SmallVector outer_shape_vals; + for (int64_t i = 0; i < block_rank; ++i) { + // TODO(b/157475606): Insert tf.Assert that the following division has + // remainder 0. 
+ outer_shape_vals.push_back(rewriter.create( + loc, padded_shape_splits[1 + i], block_shape_splits[i])); + } + + SmallVector reshaped_shape_vals{padded_shape_splits[0]}; + for (int64_t i = 0; i < block_rank; ++i) { + reshaped_shape_vals.push_back(outer_shape_vals[i]); + reshaped_shape_vals.push_back(block_shape_splits[i]); + } + for (int64_t i = 1 + block_rank; i < input_rank; ++i) { + reshaped_shape_vals.push_back(padded_shape_splits[i]); + } + auto reshaped_shape = ValuesToRank1( + rewriter, loc, rewriter.getIntegerType(64), reshaped_shape_vals); + + SmallVector permutation_vals; + for (int64_t i = 0; i < block_rank; ++i) { + permutation_vals.push_back(2 + 2 * i); + } + permutation_vals.push_back(0); + for (int64_t i = 0; i < block_rank; ++i) { + permutation_vals.push_back(1 + 2 * i); + } + for (int64_t i = 1 + block_rank; i < input_rank; ++i) { + permutation_vals.push_back(block_rank + i); + } + auto permutation = rewriter.create( + loc, GetI64ElementsAttr(permutation_vals, &rewriter)); + + auto output_batch = padded_shape_splits[0]; + for (int64_t i = 0; i < block_rank; ++i) { + output_batch = + rewriter.create(loc, output_batch, block_shape_splits[i]); + } + SmallVector output_shape_vals{output_batch}; + for (int64_t i = 0; i < block_rank; ++i) { + output_shape_vals.push_back(outer_shape_vals[i]); + } + for (int64_t i = 1 + block_rank; i < input_rank; ++i) { + output_shape_vals.push_back(padded_shape_splits[i]); + } + auto output_shape = ValuesToRank1( + rewriter, loc, rewriter.getIntegerType(64), output_shape_vals); + auto reshaped = rewriter.create(loc, padded, reshaped_shape); + auto permuted = + rewriter.create(loc, reshaped, permutation); + + // Sometimes the result type is more specific than what the reshape builder + // can infer. + auto result_type = op.getResult().getType(); + rewriter.replaceOpWithNewOp(op, result_type, permuted, + output_shape); + + return success(); + } +}; + // Lowers `TF::SparseMatMulOp` to `TF::MatMulOp`, ignoring the sparseness hints, // since we currently don't have an implementation that can use this // information. Adds appropriate casts where necessary to align element types @@ -447,8 +962,7 @@ class Lower_UnaryOpsComposition LogicalResult matchAndRewrite(TF::_UnaryOpsCompositionOp op, PatternRewriter &rewriter) const override { Value result = op.x(); - for (StringRef op_name : - op.op_names().getAsRange()) { + for (StringRef op_name : op.op_names().getAsValueRange()) { std::string full_name = "tf." + op_name.str(); // All ops in the sequences have the same result type as the original // result type. 
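// A compact, self-contained sketch of the shape and permutation bookkeeping
// that LowerSpaceToBatchNDOp above expresses with TF ops, assuming static
// shapes and hypothetical plain-integer containers (illustration only, not
// part of the pass):

#include <cstdint>
#include <vector>

struct SpaceToBatchShapes {
  std::vector<int64_t> reshaped;     // shape fed to the first reshape
  std::vector<int64_t> permutation;  // permutation fed to the transpose
  std::vector<int64_t> output;       // shape fed to the final reshape
};

static SpaceToBatchShapes ComputeSpaceToBatchShapes(
    const std::vector<int64_t>& padded_shape,   // input shape after zero-pad
    const std::vector<int64_t>& block_shape) {  // length M (block_rank)
  const int64_t input_rank = padded_shape.size();
  const int64_t block_rank = block_shape.size();
  SpaceToBatchShapes s;
  // reshaped = [batch, p1/b0, b0, ..., pM/b(M-1), b(M-1)] + remaining_shape.
  s.reshaped.push_back(padded_shape[0]);
  for (int64_t i = 0; i < block_rank; ++i) {
    s.reshaped.push_back(padded_shape[1 + i] / block_shape[i]);
    s.reshaped.push_back(block_shape[i]);
  }
  for (int64_t i = 1 + block_rank; i < input_rank; ++i)
    s.reshaped.push_back(padded_shape[i]);
  // permutation = block dims, then batch, then outer spatial dims, then rest.
  for (int64_t i = 0; i < block_rank; ++i) s.permutation.push_back(2 + 2 * i);
  s.permutation.push_back(0);
  for (int64_t i = 0; i < block_rank; ++i) s.permutation.push_back(1 + 2 * i);
  for (int64_t i = 1 + block_rank; i < input_rank; ++i)
    s.permutation.push_back(block_rank + i);
  // output = [batch * prod(block_shape)] + outer spatial dims + remaining.
  int64_t batch = padded_shape[0];
  for (int64_t b : block_shape) batch *= b;
  s.output.push_back(batch);
  for (int64_t i = 0; i < block_rank; ++i)
    s.output.push_back(padded_shape[1 + i] / block_shape[i]);
  for (int64_t i = 1 + block_rank; i < input_rank; ++i)
    s.output.push_back(padded_shape[i]);
  return s;
}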
@@ -466,10 +980,11 @@ class Lower_UnaryOpsComposition void PopulateLoweringTFPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - patterns->insert( - context); - populateWithGenerated(context, patterns); + patterns->insert(context); + populateWithGenerated(context, *patterns); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td index f7a867f3130..bddc863ee60 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.td @@ -24,6 +24,10 @@ class GetScalarOfType : NativeCodeCall< class GetScalarOfFloatType : NativeCodeCall< "GetScalarOfFloatType(getElementTypeOrSelf($0)," # value # ")">; +def GetScalarInfOfType : NativeCodeCall< + "GetScalarOfFloatType(getElementTypeOrSelf($0), " + "std::numeric_limits::infinity())">; + def GetScalarNanOfType : NativeCodeCall< "GetScalarOfFloatType(getElementTypeOrSelf($0), " "std::numeric_limits::quiet_NaN())">; @@ -154,13 +158,22 @@ foreach fromToBinPair = [[TF_DivNoNanOp, TF_DivOp], def LowerFillOp : Pat<(TF_FillOp $dims, $value), (TF_BroadcastToOp $value, $dims)>; +//===----------------------------------------------------------------------===// +// Inf op patterns. +//===----------------------------------------------------------------------===// + +def LowerIsInfOp : Pat<(TF_IsInfOp $x), + (TF_EqualOp (TF_AbsOp:$abs $x), + (TF_ConstOp:$inf (GetScalarInfOfType $x)), + /*incompatible_shape_error*/ConstBoolAttrTrue)>; + //===----------------------------------------------------------------------===// // NaN op patterns. //===----------------------------------------------------------------------===// def LowerIsNanOp : Pat<(TF_IsNanOp $x), - (TF_EqualOp $x, (TF_ConstOp:$nan (GetScalarNanOfType $x)), - /*incompatible_shape_error*/ConstBoolAttrTrue)>; + (TF_NotEqualOp $x, $x, + /*incompatible_shape_error*/ConstBoolAttrTrue)>; //===----------------------------------------------------------------------===// // L2Loss op patterns. @@ -198,6 +211,25 @@ def : Pat<(TF_PadOp TensorOf<[AnySignlessInteger, AnyFloat]>:$input, $paddings), def LowerReciprocal : Pat<(TF_ReciprocalOp $x), (TF_DivOp (TF_ConstOp (GetScalarOfType<1> $x)), $x)>; +//===----------------------------------------------------------------------===// +// Round op patterns. +//===----------------------------------------------------------------------===// + + +// Rounds on integers should just be bypassed. +def : Pat<(TF_RoundOp:$res TF_IntTensor:$input), (TF_IdentityOp $input)>; + +// Implements TF Round on floats using basic operations. +def : Pat<(TF_RoundOp:$res TF_FloatTensor:$input), + (TF_SelectOp + (TF_LessOp + (TF_SubOp $input, (TF_FloorOp:$floor $input)), + (TF_ConstOp (GetScalarOfFloatType<"0.5"> $input))), + $floor, + (TF_AddOp + (TF_ConstOp (GetScalarOfType<1> $input)), $floor))>; + + //===----------------------------------------------------------------------===// // Rsqrt op patterns. //===----------------------------------------------------------------------===// @@ -217,12 +249,22 @@ def : Pat<(TF_RsqrtGradOp $lhs, $rhs), // TODO(hinsu): Support complex input types. 
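// For reference, the scalar arithmetic that the IsInf, IsNan, and Round
// lowerings above expand to, sketched in plain C++ (illustration only; the
// names are hypothetical and this is not part of the pattern file):
//
//   #include <cmath>
//   #include <limits>
//
//   static bool LoweredIsInf(float x) {
//     return std::fabs(x) == std::numeric_limits<float>::infinity();
//   }
//   static bool LoweredIsNan(float x) { return x != x; }  // only NaN != NaN
//   static float LoweredRound(float x) {
//     float f = std::floor(x);
//     return (x - f < 0.5f) ? f : f + 1.0f;  // exact halves map to f + 1 here
//   }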
def LowerTanhGradOp : - Pat<(TF_TanhGradOp TF_FpTensor:$y, TF_FpTensor:$dy), + Pat<(TF_TanhGradOp TF_FloatTensor:$y, TF_FloatTensor:$dy), (TF_MulOp $dy, (TF_SubOp (TF_ConstOp (GetScalarOfType<1> $y)), (TF_SquareOp $y)))>; + //===----------------------------------------------------------------------===// +// LowerFakeQuantWithMinMaxArgs op patterns. +//===----------------------------------------------------------------------===// + +def LowerFakeQuantWithMinMaxArgs : + Pat<(TF_FakeQuantWithMinMaxArgsOp TF_FloatTensor: $input, + $min, $max, $bits, $narrow_range), + (TF_FakeQuantWithMinMaxVarsOp $input, + (TF_ConstOp $min), (TF_ConstOp $max), $bits, $narrow_range)>; + // ZerosLike op patterns. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc index 4438f19bb74..ac844b925ce 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/core/lib/monitoring/gauge.h" namespace mlir { namespace TFDevice { @@ -37,6 +38,11 @@ namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; constexpr char kAllowSoftPlacementAttr[] = "allow_soft_placement"; +auto* auto_outside_compilation_gauge = + tensorflow::monitoring::Gauge::New( + "/tensorflow/core/use_auto_outside_compilation", + "Tracks if auto outside compilation is enabled"); + // This pass marks unsupported ops in a device cluster with // `_xla_outside_compilation` attribute so the operations will run on the host // instead of the device. Unsupported ops are ops that can not be code @@ -47,6 +53,15 @@ struct MarkOpsForOutsideCompilation void runOnOperation() override; }; +// Adds any canonicalization patterns to list of supported `patterns`. +// TODO(b/161726307): Move or import the relevant patterns to LowerTF pass and +// remove this. +void AddCanonicalizationPatterns(MLIRContext* context, + OwningRewritePatternList* patterns) { + for (auto* op : context->getRegisteredOperations()) + op->getCanonicalizationPatterns(*patterns, context); +} + // TODO(b/159128666): Check the control flow legalization passes instead once // added. void AddSupportedControlFlowOps(MLIRContext* context, @@ -68,16 +83,71 @@ void AddRewrittenEmbeddingOps(MLIRContext* context, TF::SendTPUEmbeddingGradientsOp::getOperationName(), context)); } +// Stack, TensorList and TensorArray ops are rewritten during the second phase +// of the bridge (compilation of TPUCompile op). They would not match any +// legalization/canonicalization pattern and have to be manually added to the +// list of supported ops. +void AddRewrittenCompositeOps(MLIRContext* context, + llvm::DenseSet* supported_ops) { +#define GET_OPERATION_NAME(op) OperationName(op::getOperationName(), context) + llvm::SmallDenseSet allowlist_ops = { + // Stack ops. + GET_OPERATION_NAME(TF::StackV2Op), + GET_OPERATION_NAME(TF::StackPushV2Op), + GET_OPERATION_NAME(TF::StackPopV2Op), + // Tensor Array ops. 
+ GET_OPERATION_NAME(TF::TensorArrayV3Op), + GET_OPERATION_NAME(TF::TensorArrayReadV3Op), + GET_OPERATION_NAME(TF::TensorArrayWriteV3Op), + GET_OPERATION_NAME(TF::TensorArrayConcatV3Op), + GET_OPERATION_NAME(TF::TensorArraySplitV3Op), + GET_OPERATION_NAME(TF::TensorArraySizeV3Op), + GET_OPERATION_NAME(TF::TensorArrayGradV3Op), + GET_OPERATION_NAME(TF::TensorArrayGatherV3Op), + GET_OPERATION_NAME(TF::TensorArrayScatterV3Op), + GET_OPERATION_NAME(TF::TensorListFromTensorOp), + // Tensor List Ops. + GET_OPERATION_NAME(TF::EmptyTensorListOp), + GET_OPERATION_NAME(TF::TensorListReserveOp), + GET_OPERATION_NAME(TF::TensorListFromTensorOp), + GET_OPERATION_NAME(TF::TensorListPushBackOp), + GET_OPERATION_NAME(TF::TensorListPopBackOp), + GET_OPERATION_NAME(TF::TensorListGetItemOp), + GET_OPERATION_NAME(TF::TensorListSetItemOp), + GET_OPERATION_NAME(TF::TensorListLengthOp), + GET_OPERATION_NAME(TF::TensorListElementShapeOp), + GET_OPERATION_NAME(TF::TensorListGatherOp), + GET_OPERATION_NAME(TF::TensorListScatterIntoExistingListOp), + }; +#undef GET_OPERATION_NAME + + supported_ops->insert(allowlist_ops.begin(), allowlist_ops.end()); +} + +bool IsStringType(Type type) { + if (type.isa()) return true; + + auto sub_type = type.dyn_cast(); + if (!sub_type) return false; + + bool has_string = llvm::any_of(sub_type.GetSubtypes(), [](TensorType type) { + return type.getElementType().isa(); + }); + return has_string; +} + bool HasStringOperand(Operation& op) { for (auto operand : op.getOperands()) { - if (getElementTypeOrSelf(operand).isa()) return true; + auto operand_type = getElementTypeOrSelf(operand); + if (IsStringType(operand_type)) return true; } return false; } bool HasStringResult(Operation& op) { for (auto result : op.getResults()) { - if (getElementTypeOrSelf(result).isa()) return true; + auto result_type = getElementTypeOrSelf(result); + if (IsStringType(result_type)) return true; } return false; } @@ -135,18 +205,10 @@ LogicalResult MarkUncompilableOps( op->getContext())); outside_compiled_cluster_counter++; } - if (llvm::isa(op)) { - if (HasCapturedStringOperand(op)) { - op->setAttr( - kXlaOutsideCompilationAttr, - StringAttr::get( - llvm::formatv("auto{0}", outside_compiled_cluster_counter) - .str(), - op->getContext())); - outside_compiled_cluster_counter++; - } - } }); + if (outside_compiled_cluster_counter > 0) { + auto_outside_compilation_gauge->GetCell()->Set(true); + } return success(); } @@ -179,6 +241,7 @@ void MarkOpsForOutsideCompilation::runOnOperation() { OwningRewritePatternList patterns; mhlo::PopulateLegalizeTfPatterns(module.getContext(), &patterns); TF::PopulateLoweringTFPatterns(module.getContext(), &patterns); + AddCanonicalizationPatterns(module.getContext(), &patterns); // `supported_ops` contains the name of all of the ops that can potentially be // lowered into HLO on the device. This doesn't always mean that the op can @@ -186,10 +249,12 @@ void MarkOpsForOutsideCompilation::runOnOperation() { // be lowered in a subsequent pass. 
llvm::DenseSet supported_ops; for (auto& pattern : patterns) { - supported_ops.insert(*pattern->getRootKind()); + Optional root_kind = pattern->getRootKind(); + if (root_kind.hasValue()) supported_ops.insert(root_kind.getValue()); } AddSupportedControlFlowOps(module.getContext(), &supported_ops); AddRewrittenEmbeddingOps(module.getContext(), &supported_ops); + AddRewrittenCompositeOps(module.getContext(), &supported_ops); auto result = module.walk([&](tf_device::ClusterOp cluster) { // Only if `allow_soft_placement` attribute is true should we mark ops diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc index 24e77d31e7c..29ecc38de0b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize.cc @@ -37,7 +37,7 @@ struct TFOptimizePass : public PassWrapper { void runOnFunction() override { OwningRewritePatternList patterns; auto func = getFunction(); - populateWithGenerated(&getContext(), &patterns); + populateWithGenerated(&getContext(), patterns); applyPatternsAndFoldGreedily(func, patterns); } }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 6fee693554e..b81e390580d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -109,13 +109,14 @@ class ResourceAnalyzer { return; } if (auto if_op = dyn_cast(op)) { - for (auto callee : {if_op.then_func(), if_op.else_func()}) { + for (auto callee : {if_op.then_function(), if_op.else_function()}) { PropagatePotentiallyWrittenUpFromCallee(callee, if_op.input()); } return; } if (auto while_op = dyn_cast(op)) { - for (auto callee : {while_op.cond_func(), while_op.body_func()}) { + for (auto callee : + {while_op.cond_function(), while_op.body_function()}) { PropagatePotentiallyWrittenUpFromCallee(callee, while_op.input()); } return; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc index 1332c8b6e59..86eea50d744 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallel_execute_to_islands.cc @@ -16,66 +16,62 @@ limitations under the License. // This pass forms `tf_executor.island` per region of // `tf_device.parallel_execute`. 
// -// For example: +// For example, the following: +// +// %0 = tf_executor.island { +// tf_executor.yield +// } // %1:2 = tf_executor.island { // %2 = "tf.opA"(%arg0) : (tensor) -> tensor // tf_executor.yield %2 : tensor // } -// tf_executor.island() { -// "tf_device.parallel_execute"() ({ -// %3 = "tf.opB"() : () -> tensor -// tf_device.return %3 : tensor -// }, -// { +// %3:2 = tf_executor.island(%0) { +// %4 = "tf_device.parallel_execute"() ( { +// %5 = "tf.opB"() : () -> tensor +// tf_device.return %5 : tensor +// }, { // %5 = "tf.opC"(%1#0) : (tensor) -> tensor // tf_device.return // }) {} : () -> (tensor) +// tf_executor.yield %4 : tensor +// } +// tf_executor.fetch %3#0 : tensor +// +// gets lowered to: +// +// %0 = tf_executor.island { // tf_executor.yield // } -// tf_executor.fetch +// %1:2 = tf_executor.island { +// %2 = "tf.opA"(%arg0) : (tensor) -> tensor +// tf_executor.yield %2 : tensor +// } // -// Would become: -// %1:2 = tf_executor.island { -// %2 = "tf.opA"(%arg0) : (tensor) -> tensor -// tf_executor.yield %2 : tensor -// } +// // Island for the first region of above parallel_execute. +// %3:2 = tf_executor.island(%0) { +// %4 = "tf.opB"() : () -> tensor +// tf_executor.yield %4 : tensor +// } // -// // Input barrier sink island that forwards all inputs. -// %output_0, %control_1 = tf_executor.island { -// tf_executor.yield %1#0: tensor -// } +// // Island for the second region of above parallel_execute. +// %5 = tf_executor.island(%0) { +// %6 = "tf.opC"(%1#0) : (tensor) -> tensor +// tf_executor.yield +// } // -// // Island for the first region of above parallel_execute. -// %output_2, %control_3 = tf_executor.island(%control_1) { -// %3 = "tf.opB"() : () -> tensor -// tf_executor.yield %3 : tensor -// } -// -// // Island for the second region of above parallel_execute. -// %control_5 = tf_executor.island { -// %5 = "tf.opC"(%output_0) : (tensor) -> tensor -// tf_executor.yield -// } -// -// // Output barrier sink island that forwards all outputs. -// %output_5, %control_6 = tf_executor.island(%control_5) { -// tf_executor.yield %output_2 -// } +// tf_executor.fetch %3#0, %5 : tensor, !tf_executor.control // // When tf_device.parallel_execute op is enclosed after tf_device.replicate, // then this pass will run following `replicate-to-island` pass and // `tf-executor-break-up-islands` pass. #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -89,175 +85,117 @@ struct ParallelExecuteToIslandsPass }; // Convert parallel_execute op to a set of islands where each region of -// parallel_execute op becomes a separate island. This ensures that -// regions of parallel_execute op gets executed concurrently. -LogicalResult ExpandParallelExecuteToIslands( - tf_executor::IslandOp island_op, tf_executor::IslandOp input_sink_island, +// parallel_execute op becomes a separate island. This ensures that the regions +// of the parallel_execute op gets executed concurrently. 
+void ExpandParallelExecuteToIslands( + tf_executor::IslandOp island_op, tf_device::ParallelExecuteOp parallel_execute_op, OpBuilder* builder, - llvm::SmallVector* islands) { - const int num_executions = - parallel_execute_op.getOperation()->getNumRegions(); - llvm::SmallVector executions; - executions.reserve(num_executions); - builder->setInsertionPoint(island_op); + llvm::SmallVectorImpl& executes) { + const int num_regions = parallel_execute_op.getOperation()->getNumRegions(); + executes.reserve(num_regions); - auto control_type = tf_executor::ControlType::get(island_op.getContext()); - for (int i : llvm::seq(0, num_executions)) { - auto execute_region = - parallel_execute_op.GetRegionBlockWithIndex(i).getParent(); + for (int i : llvm::seq(0, num_regions)) { + Block& execute_block = parallel_execute_op.GetRegionBlockWithIndex(i); - // If region does not have any inputs, then add explicit control dependency - // from the input sink island. This guarantees that all inputs of - // parallel_execute op must be materialized before any of the islands are - // executed. - llvm::SetVector region_inputs; - getUsedValuesDefinedAbove(*execute_region, region_inputs); - llvm::SmallVector execution_control_inputs; - if (region_inputs.empty() && input_sink_island) - execution_control_inputs.emplace_back(input_sink_island.control()); - - // Collect result types and operands. - Operation* terminator = execute_region->front().getTerminator(); - llvm::SmallVector output_types(terminator->getOperandTypes()); - - // Replace terminator with YieldOp as island op always ends with yield op. + // Replace terminator with tf_executor.YieldOp. + Operation* terminator = execute_block.getTerminator(); builder->setInsertionPoint(terminator); - builder->create(terminator->getLoc(), - terminator->getOperands()); + auto yield = builder->create( + terminator->getLoc(), terminator->getOperands()); terminator->erase(); // Create new island for each region. builder->setInsertionPoint(island_op); - auto execution_island = builder->create( - island_op.getLoc(), output_types, control_type, - execution_control_inputs); + auto execute_island = builder->create( + island_op.getLoc(), yield.getOperandTypes(), + island_op.control().getType(), island_op.controlInputs()); - // Move over tf_device.parallel_execute body region into newly a - // created island. - execution_island.body().takeBody(*execute_region); - islands->push_back(execution_island); + // Move over tf_device.parallel_execute body region into newly the created + // island. + execute_island.body().takeBody(*execute_block.getParent()); + executes.push_back(execute_island); } - - return success(); } -// Creates an island that works as input sync point for islands. This guarantees -// that all (implicitly captured) inputs of parallel_execute are materialized -// before any of the islands are executed. -tf_executor::IslandOp CreateInputBarrierIsland( - OpBuilder* builder, tf_executor::IslandOp island_op) { - builder->setInsertionPoint(island_op); - - llvm::SetVector all_inputs; - getUsedValuesDefinedAbove(island_op.body(), all_inputs); - - // Filter out values that are arguments and doesn't need to be part of the - // entry barrier. 
- llvm::SmallVector island_inputs; - llvm::SmallVector input_types; - island_inputs.reserve(all_inputs.size()); - input_types.reserve(all_inputs.size()); - for (Value val : all_inputs) { - if (!val.isa()) { - island_inputs.push_back(val); - input_types.push_back(val.getType()); - } - } - if (island_inputs.empty() && island_op.controlInputs().empty()) return {}; - - // Create new island for that forwards all inputs. - auto control_type = tf_executor::ControlType::get(island_op.getContext()); - auto input_sink_island = builder->create( - island_op.getLoc(), input_types, control_type, island_op.controlInputs()); - input_sink_island.body().push_back(new Block); - - for (auto input_index_and_value : llvm::enumerate(island_inputs)) { - int index = input_index_and_value.index(); - Value input_value = input_index_and_value.value(); - replaceAllUsesInRegionWith(input_value, input_sink_island.getResult(index), - island_op.body()); - } - - // Create YieldOp for the new input sink island. - builder->setInsertionPointToEnd(&input_sink_island.GetBody()); - builder->create(island_op.getLoc(), - llvm::to_vector<8>(island_inputs)); - return input_sink_island; -} - -// Creates an islands that works as output sync point. This guarantees that -// execution of all islands must be completed before op following -// parallel_execute runs. -tf_executor::IslandOp CreateOutputBarrierIsland( - OpBuilder* builder, tf_executor::IslandOp island_op, - llvm::SmallVectorImpl* islands) { - // Add control dependency to island operand if island output has no uses. - llvm::SmallVector island_operands; - for (auto& island : *islands) - if (island.use_empty()) island_operands.push_back(island.control()); - - // Create single island forwarding all island results. - builder->setInsertionPoint(island_op); - auto island_output_sink = builder->create( - island_op.getLoc(), llvm::to_vector<8>(island_op.getResultTypes()), - island_operands); - island_output_sink.body().push_back(new Block); - return island_output_sink; -} - -LogicalResult CreateIslandsFromParallelExecute( +void CreateIslandsFromParallelExecute( tf_executor::IslandOp island_op, tf_device::ParallelExecuteOp parallel_execute_op) { OpBuilder builder(island_op); - auto input_sink_island = CreateInputBarrierIsland(&builder, island_op); - // Create N islands where N is the number of regions inside parallel_execute - // op. - llvm::SmallVector islands; - auto result = ExpandParallelExecuteToIslands( - island_op, input_sink_island, parallel_execute_op, &builder, &islands); - if (failed(result)) return result; + // Create islands for each region of the parallel_execute op. + llvm::SmallVector executes; + ExpandParallelExecuteToIslands(island_op, parallel_execute_op, &builder, + executes); - // Remap all results of parallel_execute op with outputs from newly - // created islands. + // Remap all results of parallel_execute op with outputs from newly created + // islands. 
llvm::SmallVector parallel_execute_outputs; parallel_execute_outputs.reserve( parallel_execute_op.getOperation()->getNumResults()); - for (auto island : islands) - for (auto output_value : island.outputs()) - parallel_execute_outputs.emplace_back(output_value); + for (auto& execute : executes) + parallel_execute_outputs.append(execute.outputs().begin(), + execute.outputs().end()); - parallel_execute_op.getOperation()->replaceAllUsesWith( - parallel_execute_outputs); + for (auto result : llvm::zip(island_op.outputs(), parallel_execute_outputs)) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); - auto island_output_sink = - CreateOutputBarrierIsland(&builder, island_op, &islands); + // Add sink island to pin all islands as a control dependency if there is a + // control dependency leading from the parallel_execute originally. + if (!island_op.control().use_empty()) { + llvm::SmallVector island_operands; + for (auto& execute : executes) island_operands.push_back(execute.control()); + + builder.setInsertionPoint(island_op); + auto island_sink = builder.create( + island_op.getLoc(), llvm::ArrayRef{}, + island_op.control().getType(), island_operands); + island_sink.body().push_back(new Block); + builder.setInsertionPointToEnd(&island_sink.GetBody()); + builder.create(island_op.getLoc(), + llvm::ArrayRef{}); + island_op.control().replaceAllUsesWith(island_sink.control()); + } + + // Islands with no uses should be pinned to a graph fetch so they still + // execute. + llvm::SmallVector unused_execute_controls; + for (auto& execute : executes) + if (execute.use_empty()) + unused_execute_controls.push_back(execute.control()); + + if (!unused_execute_controls.empty()) { + auto graph_op = island_op.getParentOfType(); + tf_executor::FetchOp fetch = graph_op.GetFetch(); + auto fetches = llvm::to_vector<8>(fetch.getOperands()); + fetches.append(unused_execute_controls.begin(), + unused_execute_controls.end()); + builder.setInsertionPoint(fetch); + builder.create(fetch.getLoc(), fetches); + fetch.erase(); + } - // Move island YieldOp over to new single island and remap island results. - island_op.GetYield().getOperation()->moveBefore( - &island_output_sink.GetBody(), island_output_sink.GetBody().begin()); - island_op.replaceAllUsesWith(island_output_sink); island_op.erase(); - - return success(); -} - -// Finds islands with a single `tf_device.parallel_execute` and create -// individual islands per region of parallel_execute. -void LowerSingleIslandParallelExecuteToIslands( - tf_executor::IslandOp island_op) { - if (!hasSingleElement(island_op.GetBody().without_terminator())) return; - - if (auto parallel_execute_op = llvm::dyn_cast( - &island_op.GetBody().front())) - CreateIslandsFromParallelExecute(island_op, parallel_execute_op); } void ParallelExecuteToIslandsPass::runOnFunction() { - getFunction().walk([&](tf_executor::IslandOp island_op) { - LowerSingleIslandParallelExecuteToIslands(island_op); + // Find islands with a single `tf_device.parallel_execute` and create + // individual islands per execute region of the parallel_execute. 
+ llvm::SmallVector parallel_execute_op_islands; + getFunction().walk([&](tf_executor::GraphOp graph_op) { + for (auto island_op : graph_op.getOps()) { + if (!island_op.WrapsSingleOp()) continue; + + if (isa(&island_op.GetBody().front())) + parallel_execute_op_islands.push_back(island_op); + } }); + + for (tf_executor::IslandOp island_op : parallel_execute_op_islands) { + auto parallel_execute_op = + cast(island_op.GetBody().front()); + CreateIslandsFromParallelExecute(island_op, parallel_execute_op); + } } } // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc index 527af0934ea..352604955c0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/parallelize_embedding_params_ops_pass.cc @@ -39,6 +39,10 @@ namespace { struct ParallelizeEmbeddingParamsOpsPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction() override; }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index d93d9ddccaf..a4ddb713ec0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -167,6 +167,12 @@ void PopulateLegalizeHloToTfPatterns(OwningRewritePatternList* patterns, // future these fusions may be codegen'd automatically. std::unique_ptr> CreateFusedKernelMatcherPass(); +// Fuses operations defining `ContractionFusableInterface` interface into the +// contraction operations (MatMul, Conv2D, etc...). This is a more general +// version of `CreateFusedKernelMatcherPass` that relies on codegen to compose +// contraction fusions together. +std::unique_ptr> CreateContractionFusionPass(); + // Creates function pass to select device index/fold tf.DeviceIndex. std::unique_ptr> CreateDeviceIndexSelectorPass(); @@ -276,6 +282,11 @@ namespace TFTPU { // `_tpu_replicate` attribute. std::unique_ptr> CreateTPUClusterFormationPass(); +// Creates a pass that cleans up `_tpu_replicate` attribute on operations +// that are inside a cluster. +std::unique_ptr> +CreateTPUClusterCleanupAttributesPass(); + // Creates a pass that removes Identity/IdentityN ops from a cluster. std::unique_ptr> CreateTPUIdentityPruningPass(); @@ -287,6 +298,10 @@ std::unique_ptr> CreateTPUDynamicLayoutPass(); // `tf_device.launch_func` `padding_map` attribute to its encapsulated function. std::unique_ptr> CreateTPUDynamicPaddingMapperPass(); +// Creates a pass that adds `tf.ReadVariableOp` to a TPU cluster for resources +// the cluster only writes to. +std::unique_ptr> CreateTPUResourceReadForWritePass(); + // Creates a pass that rewrites `tf_device.launch_func` on TPUs into TPU runtime // ops. std::unique_ptr> CreateTPURewritePass(); @@ -295,18 +310,29 @@ std::unique_ptr> CreateTPURewritePass(); // computation. std::unique_ptr> CreateTPUShardingIdentificationPass(); +// Creates a pass that moves `tf.AssignVariableOp` into a +// `tf_device.parallel_execute` region if the `tf.AssignVariableOp` is the +// only consumer of a `tf_device.parallel_execute` result. +std::unique_ptr> +CreateTPUParallelExecuteSinkResourceWritePass(); + // Creates a pass that merges device variable reads/updates into the surrounded // TPUExecute node. 
This allows the execute node to perform in-place variable // updates. std::unique_ptr> CreateTPUMergeVariablesWithExecutePass(); +// Creates a pass that wraps ReadVariableOp/AssignVariable op that consumes a +// packed tensor to have same device placement as underlying TPU device. +std::unique_ptr> CreateTPUColocateCompositeResourceOps(); + // Creates a pass that adds ops which perform formatting on variables at // run-time according to compilation result. std::unique_ptr> CreateTPUVariableReformattingPass(); // Creates a pass that groups outside compiled operations (CPU ops inside TPU // cluster) into clusters that can be extracted and run on the CPU. -std::unique_ptr> CreateTPUOutsideCompilationClusterPass(); +std::unique_ptr> +CreateTPUOutsideCompilationClusterPass(); // Creates a pass that extracts outside compilation (CPU ops inside TPU cluster) // at head/tail of TPU cluster to run before/after TPU computation. @@ -329,6 +355,7 @@ std::unique_ptr> CreateTPUExtractOutsideCompilationPass(); // Populates the supplied passmanager with the passes required to run the +// bridge. void CreateTPUBridgePipeline(OpPassManager& pm); // Populates the supplied passmanager with the passes required to run the diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc index 031d57e99ba..96ff2890558 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_invariant_op_hoisting.cc @@ -151,7 +151,7 @@ bool IsOpReplicateInvariant(Region* replicate_region, Operation* op) { // invariant. Shape ops are rewritten to be invariant when possible, prior to // hoisting ops. void HoistReplicateInvariantOps(tf_device::ReplicateOp replicate_op) { - const int num_replicas = replicate_op.n().getLimitedValue(); + const int num_replicas = replicate_op.n(); Block* replicate_block = &replicate_op.GetBody(); replicate_op.walk([&](TF::ShapeOp shape_op) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index d99279c0014..5b70729ee80 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -376,7 +376,7 @@ LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, tf_executor::IslandOp island_op, tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); - const int num_replicas = replicate_op.n().getLimitedValue(); + const int num_replicas = replicate_op.n(); // Create islands per replica. 
llvm::SmallVector replicas; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc index c1ca98bf1f1..648805febfe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -283,12 +283,13 @@ void ResourceDeviceInference::runOnOperation() { if (auto while_op = dyn_cast(op)) { if (failed(propagate_operands_to_callee_arguments( while_op, while_op.getOperands(), - {while_op.body_func(), while_op.cond_func()}, func_res))) + {while_op.body_function(), while_op.cond_function()}, + func_res))) return WalkResult::interrupt(); } else if (auto if_op = dyn_cast(op)) { if (failed(propagate_operands_to_callee_arguments( - if_op, if_op.input(), {if_op.then_func(), if_op.else_func()}, - func_res))) + if_op, if_op.input(), + {if_op.then_function(), if_op.else_function()}, func_res))) return WalkResult::interrupt(); } else if (auto call = dyn_cast(op)) { auto func = dyn_cast(call.resolveCallable()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 77f672f5ee4..c357abd10da 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -15,11 +15,15 @@ limitations under the License. // This pass lifts resource variable operations outside of device computation. +#include #include +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -32,20 +36,24 @@ limitations under the License. #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" @@ -137,14 +145,37 @@ struct ResourceOpLiftingPass void runOnOperation() override; }; -// Removes identity nodes in the block. 
The device computation does not need -// such nodes to carry information. -void RemoveIdentity(Block* block) { - for (auto& op : llvm::make_early_inc_range(*block)) { - if (isa(&op)) { - op.replaceAllUsesWith(op.getOperands()); - op.erase(); - } +bool IsResource(Value value) { + return getElementTypeOrSelf(value.getType()).isa(); +} + +// Get the type of the data contained in a resource. Returns null if there is +// no single type in the resource. +Type GetResourceSubtype(Value value) { + auto resource_type = + getElementTypeOrSelf(value.getType()).dyn_cast(); + auto subtypes = resource_type.getSubtypes(); + if (subtypes.size() == 1) return subtypes[0]; + return nullptr; +} + +// Replaces all `tf.VarIsInitializedOp` in a block with a constant true. +// TODO(b/171039585): Replace this with proper analysis of +// `tf.VarIsInitializedOp` in regards to resource writes and control flow. +void SetAllVarIsInitializedToTrue(Block* block) { + auto builder = OpBuilder::atBlockBegin(block); + TF::ConstOp const_true = nullptr; + for (auto op : + llvm::make_early_inc_range(block->getOps())) { + builder.setInsertionPoint(op); + if (!const_true) + const_true = builder.create( + op.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get(/*shape=*/{}, builder.getI1Type()), true)); + + op.is_initialized().replaceAllUsesWith(const_true); + op.erase(); } } @@ -187,157 +218,447 @@ void ForwardStoreToLoad(Block* block) { } } -// Moves resource load operations with the provided `move_load` function. This -// assumes load-store forwarding has been performed on this block such that -// all loads of same resource are on its initial values. A `skip_load` functions -// is used to indicate whether a load should be skipped. If there are multiple -// loads on the same resource, only the first one will be moved, and the later -// ones will be removed and replaced with the first one. -void HoistResourceLoads( - Block* block, llvm::function_ref skip_load, - llvm::function_ref move_load) { - llvm::SmallDenseMap resource_to_read_ops; +//===----------------------------------------------------------------------===// +// RegionResourceHoister +//===----------------------------------------------------------------------===// +// Helper class to hoist resource ops out of regions attached to an op. +class RegionResourceHoister { + public: + explicit RegionResourceHoister(Operation* op) : op_(op) {} + + // Analyzes attached regions to record resources read and written. + LogicalResult Analyze(); + + // Returns all resources accessed by the regions attached the op. + auto& GetResources() { return resources_; } + + // Returns if the given value is a resouce that needs lifting. + bool Contains(Value resource) const { + return resources_.find(resource) != resources_.end(); + } + + // Drops the given resource from lifting. + void DropResource(Value resource) { + resources_.erase(resource); + written_resources_.remove(resource); + } + + // Replaces all resource loads in all regions attached to the op. + void ReplaceResourceLoads(bool read_only) { + llvm::for_each(op_->getRegions(), [&](Region& region) { + ReplaceResourceLoads(region, read_only); + }); + } + + static LogicalResult ReplaceOpWithNewOp(Operation* op); + + private: + // Returns if any resources need lifting. + bool NeedsLifting() const { return !resources_.empty(); } + + // Returns the number of results generated by the lifted op. + int GetLiftedNumResults() const { return num_new_results_; } + + // Generates hoisted reads for resources that need them before the op. 
+ void GenerateHoistedReads(); + + // Replaces all resource loads in the given region with hoisted loads. If + // `read_only` is true, limit this replacement to read only resources. + void ReplaceResourceLoads(Region& region, bool read_only); + + // Appends final values writte to resources to the region returns for the + // given set of regions. + void AppendResourceStoreValueToReturn(RegionRange regions); + + // Performs the final replacement of the op. + void ReplaceOpWithNewOp(); + + // Returns is this resource was written to in any of the regions. + bool IsWritten(Value resource) const { + return written_resources_.contains(resource); + } + + static LogicalResult HoistResourcesOutOfIfCaseCluster(Operation* op); + static LogicalResult HoistResourcesOutOfWhileRegion(TF::WhileRegionOp op); + + Operation* op_; + + // Per resource information about accesses to that resource. + struct ResourceInfo { + // Is this resource read in any of the regions? + bool is_read; + // Is this resource written in any of the regions? + bool is_written; + // Is this resource written in all of the regions? + bool is_written_all; + // The hoisted read used to replace region reads. + Value hoisted_read; + // the type of the data held by the resource. + Type data_type; + // For written resources, the result # of the lifted op which will hold the + // value of the resource. This result will be used to generates writes to + // the resource after the lifted op. + int result_index; + // Attributes on the read operation. + DictionaryAttr read_attrs; + // Attributes on the write operation. + DictionaryAttr write_attrs; + + ResourceInfo() + : is_read(false), + is_written(false), + is_written_all(false), + hoisted_read(nullptr), + data_type(nullptr), + result_index(-1) {} + + bool IsResultIndexAssigned() { return result_index != -1; } + + // Refine the resource type using the given type `type`. + void RefineType(Type type) { + if (!data_type) { + data_type = type; + } else { + data_type = TF::GetCastCompatibleType(data_type, type, + /*may_ignore_ref_type_a=*/false); + assert(data_type != nullptr && "Resource used with incompatible types"); + } + } + }; + llvm::MapVector resources_; + llvm::SetVector written_resources_; + // number of new results after lifting. + int num_new_results_; +}; + +// Analyzes resources that are read or written within attached regions. +LogicalResult RegionResourceHoister::Analyze() { + // Hoisting of child regions might have created opportunity for store-load + // forwarding. + for (Region& region : op_->getRegions()) { + ForwardStoreToLoad(®ion.front()); + } + + llvm::SetVector all_resources; + bool is_func = false; + // For functions, the resources to analyze are the function arguments. + // Otherwise, its the region captures. + if (FuncOp func = dyn_cast(op_)) { + is_func = true; + Region& body = func.getBody(); + for (BlockArgument arg : body.getArguments()) { + if (IsResource(arg)) all_resources.insert(arg); + } + } else { + getUsedValuesDefinedAbove(op_->getRegions(), all_resources); + all_resources.remove_if([](Value value) { return !IsResource(value); }); + } + + num_new_results_ = op_->getNumResults(); + + for (auto resource : all_resources) { + ResourceInfo info; + info.data_type = GetResourceSubtype(resource); + llvm::BitVector written_regions(op_->getNumRegions()); + bool unsupported_use = false; + for (OpOperand& use : resource.getUses()) { + Operation* user = use.getOwner(); + // If the user is not in one of the regions, we are not interested in it. 
+ // Since all the sub-regions within this region (i.e., regions attached to + // op's in this region) have themselves gone through lifting, all resource + // users are expected to be operations in this region and and not embedded + // within other sub-regions attached to op's in this region. So the check + // for whether a user is in one of the regions attached to this op is + // straightforward. + if (user->getParentRegion()->getParentOp() != op_) continue; + + // For functions, if the resource is used as a return operand, use that + // as its result index. + if (is_func && isa(user)) { + assert(!info.IsResultIndexAssigned() && + "Expect resource argument to returned no more than once"); + info.result_index = use.getOperandNumber(); + continue; + } + + auto read = dyn_cast(user); + auto write = dyn_cast(user); + if (!read && !write) { + unsupported_use = true; + break; + } + + if (read && !info.is_read) { + info.is_read = true; + info.RefineType(read.value().getType()); + info.read_attrs = user->getAttrDictionary(); + } + + if (write) { + info.is_written = true; + info.RefineType(write.value().getType()); + info.write_attrs = user->getAttrDictionary(); + written_regions.set(user->getParentRegion()->getRegionNumber()); + } + } + + // If the resource is used in an op that we do not understand, skip + // lifting for that resource. + if (unsupported_use) continue; + + info.is_written_all = written_regions.count() == op_->getNumRegions(); + + // If the resource is written in some but not all regions, we would need + // a read for the value before these regions. Note that this is applicable + // only to multi-region ops: + // If/Case: If not all regions write to the resource, post hoisting the read + // value need to be routed through all paths that don't write. + // While: since while condition cannot write, any resource written in the + // while body will need to be read as well in case the while body is never + // executed. + // Both cases are handled by the condition below. + if (info.is_written && !info.is_written_all) info.is_read = true; + + // Allocate a result index for written resources that don't have one. + if (info.is_written) { + written_resources_.insert(resource); + if (!info.IsResultIndexAssigned()) info.result_index = num_new_results_++; + } + + resources_.insert({resource, info}); + } + return success(); +} + +// Generates hoisted reads for all resources that need them just before the op. +void RegionResourceHoister::GenerateHoistedReads() { + OpBuilder builder(op_); + for (auto& resource_it : GetResources()) { + Value resource = resource_it.first; + auto& info = resource_it.second; + + if (info.is_read) { + Operation* read = builder.create( + op_->getLoc(), info.data_type, resource); + read->setAttrs(info.read_attrs); + info.hoisted_read = read->getResult(0); + } + } +} + +// Replaces all resource reads with the hoisted read. +void RegionResourceHoister::ReplaceResourceLoads(Region& region, + bool read_only) { + assert(llvm::hasSingleElement(region) && "Expected single block region"); // Only iterate through ops directly in the body as we can't handle // ops nested deeper in regions. 
- for (Operation& op : llvm::make_early_inc_range(*block)) { - auto read_variable_op = dyn_cast(&op); - if (!read_variable_op) continue; - if (skip_load(read_variable_op)) continue; + auto all_reads = region.front().getOps(); + for (auto read_op : llvm::make_early_inc_range(all_reads)) { + Value resource = read_op.resource(); + if (!Contains(resource)) continue; - Value resource = read_variable_op.resource(); - auto p = resource_to_read_ops.insert({resource, read_variable_op}); - if (p.second) { - move_load(read_variable_op); - continue; + ResourceInfo& info = resources_[resource]; + // If replacing loads for read only resources, skip if the resource + // was written to. + if (read_only && info.is_written) continue; + + read_op.replaceAllUsesWith(info.hoisted_read); + read_op.erase(); + } +} + +// For written resources, add its value at the end of each region to that +// regions return value. For a region, its value at the end may be a value +// written to that resource in that region, or its hoisted read value if the +// resource is not written in that region. The return value can be vended out +// either as an existing return value, or a newly allocated return value. +void RegionResourceHoister::AppendResourceStoreValueToReturn( + RegionRange regions) { + for (Region* region : regions) { + assert(llvm::hasSingleElement(*region) && "Expected single block region"); + Block& front = region->front(); + auto old_return = front.getTerminator(); + assert(old_return->getNumOperands() == op_->getNumResults()); + auto new_return_operands = llvm::to_vector<4>(old_return->getOperands()); + new_return_operands.resize(num_new_results_); + + // initialize return values for written resources to be the hosited reads. + for (Value resource : written_resources_) { + const ResourceInfo& info = resources_[resource]; + new_return_operands[info.result_index] = info.hoisted_read; } - // Getting here means a load operation of this resource has been hoisted out - // before. Use hoisted load result to replace all uses of current op result - // and erase op. - op.replaceAllUsesWith(p.first->second); - op.erase(); - } -} + // Only iterate through ops directly in the body as op's embedded in child + // regions should have been lifted out. + auto assign_ops = front.getOps(); + for (auto assign_variable_op : llvm::make_early_inc_range(assign_ops)) { + Value resource = assign_variable_op.resource(); + if (!IsWritten(resource)) continue; -// If there are any stores to resource defined outside of the block then the -// stored values must be returned so that new values can be used by sunk -// resource stores. -// Returns true if any resource variable stored values are appended, otherwise -// false. -bool AppendResourceStoreValueToReturn(Block* body) { - bool has_resource_store = false; - auto old_return = body->getTerminator(); - - llvm::SmallVector new_return_operands(old_return->getOperands()); - - // Only iterate through ops directly in the body as we can't handle ops nested - // deeper in regions. - for (auto assign_variable_op : body->getOps()) { - Value resource = assign_variable_op.resource(); - if (!resource) continue; - - // Skip resources created inside of the body. - if (resource.getParentRegion() == body->getParent()) continue; - - // TODO(ycao): Prevent same value from being returned multiple times. - // TODO(ycao): Do not return resource store value if it is defined outside - // of cluster. 
- new_return_operands.push_back(assign_variable_op.value()); - has_resource_store = true; - } - - // If no resource stores are found, no need to update return op. - if (!has_resource_store) return false; - - OpBuilder builder(old_return); - builder.create(old_return->getLoc(), - new_return_operands); - old_return->erase(); - return true; -} - -// Moves resource store operations to after cluster. This assumes load-store -// forwarding has been performed on this cluster such that there is at most one -// resource store operation carrying its final value. -tf_device::ClusterOp SinkResourceStores(tf_device::ClusterOp cluster, - OpBuilder* builder) { - // Update ReturnOp inside cluster's body to output final values of updated - // external resources. - if (!AppendResourceStoreValueToReturn(&cluster.GetBody())) return cluster; - - auto new_return_op = cluster.GetBody().getTerminator(); - llvm::SmallVector new_return_types(new_return_op->getOperandTypes()); - - builder->setInsertionPoint(cluster); - auto new_cluster = builder->create( - cluster.getLoc(), new_return_types, - /*operands=*/llvm::SmallVector(), cluster.getAttrs()); - new_cluster.body().takeBody(cluster.body()); - - // Replace uses of old cluster results with those of new_cluster. - for (auto result : llvm::zip(cluster.getResults(), new_cluster.getResults())) - std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); - - // Create a mapping from operands of new_return_op operands to new_cluster - // results. - BlockAndValueMapping mapper; - for (auto operand_result : - llvm::zip(new_return_op->getOperands(), new_cluster.getResults())) - mapper.map(std::get<0>(operand_result), std::get<1>(operand_result)); - - // Clone all resource store ops and map their operands to values returned from - // new_cluster. - for (Operation& op : llvm::make_early_inc_range(new_cluster.GetBody())) { - if (isa(op)) { - builder->clone(op, mapper); - op.erase(); + // TODO(ycao): Prevent same value from being returned multiple times. + // TODO(ycao): Do not return resource store value if it is defined outside + // of cluster. Both of these can be post-resource-op-lifting cleanup + // passes. + int result_index = resources_[resource].result_index; + new_return_operands[result_index] = assign_variable_op.value(); + assign_variable_op.erase(); } + old_return->setOperands(new_return_operands); } - - cluster.erase(); - return new_cluster; } -// Hoists resource variable loads and sinks stores from cluster. -LogicalResult HoistResourceOpsFromCluster(tf_device::ClusterOp cluster, - ModuleOp module) { - OpBuilder builder(module); +// Replace the old op with a new op (with potentially additional results), and +// add stores to written resources after the new op. +void RegionResourceHoister::ReplaceOpWithNewOp() { + auto new_result_types = llvm::to_vector<4>(op_->getResultTypes()); + int result_region = isa(op_) ? 1 : 0; + Operation* terminator = op_->getRegion(result_region).front().getTerminator(); + auto extra_result_types = + terminator->getOperands().drop_front(op_->getNumResults()).getTypes(); + new_result_types.insert(new_result_types.end(), extra_result_types.begin(), + extra_result_types.end()); + OpBuilder builder(op_); + // Clone ths old operation but with new result types. + Operation* new_op = Operation::create( + op_->getLoc(), op_->getName(), new_result_types, op_->getOperands(), + op_->getAttrs(), op_->getSuccessors(), op_->getNumRegions()); + builder.insert(new_op); - // Remove identity nodes to avoid aliasing. 
- RemoveIdentity(&cluster.GetBody()); + // Move regions to the new op. + for (auto it : llvm::zip(op_->getRegions(), new_op->getRegions())) { + Region& old_region = std::get<0>(it); + Region& new_region = std::get<1>(it); + new_region.takeBody(old_region); + } - // Perform store-load forwarding. So that each resource is only loaded with - // its initial value and is only stored with its final value. - ForwardStoreToLoad(&cluster.GetBody()); + // Insert stores to all written resources. + for (Value resource : written_resources_) { + ResourceInfo& info = resources_[resource]; + Value value_to_write = new_op->getResult(info.result_index); + Operation* write = builder.create( + op_->getLoc(), resource, value_to_write); + write->setAttrs(info.write_attrs); + } - // Move loads of external resources, if any, to before cluster. - // (Skipping resources created inside of cluster.) - HoistResourceLoads( - &cluster.GetBody(), - /*skip_load=*/ - [&](TF::ReadVariableOp read) { - return read.resource().getParentRegion() == &cluster.body(); - }, - /*move_load=*/ - [&](TF::ReadVariableOp read) { - read.getOperation()->moveBefore(cluster); - }); + // As a part of lifting, we either reuse an existing slot for resource type + // results or add a new slot. Resource type results should not have any uses + // to begin with. So we can safely replace each old op result with the + // corresponding new op result. + int old_num_results = op_->getNumResults(); + op_->replaceAllUsesWith(new_op->getResults().take_front(old_num_results)); + op_->erase(); + op_ = nullptr; +} - // Move stores of external resources, if any, to after cluster. - auto new_cluster = SinkResourceStores(cluster, &builder); +// Lift resource load and stores out of regions attached to `op`, where op is +// an If/case/cluster op. +LogicalResult RegionResourceHoister::HoistResourcesOutOfIfCaseCluster( + Operation* op) { + RegionResourceHoister hoister(op); + if (failed(hoister.Analyze())) return failure(); - llvm::SetVector captured_values; - getUsedValuesDefinedAbove(new_cluster.body(), new_cluster.body(), - captured_values); + // If there are no resource region captures, then nothing to do. + if (!hoister.NeedsLifting()) return success(); + // Start the transformation. For each region, replace the resource read with + // the value read before the op. + hoister.GenerateHoistedReads(); + hoister.ReplaceResourceLoads(/*read_only=*/false); + hoister.AppendResourceStoreValueToReturn(op->getRegions()); + hoister.ReplaceOpWithNewOp(); return success(); } +// Lift resource loads and stores out of WhileRegion +LogicalResult RegionResourceHoister::HoistResourcesOutOfWhileRegion( + TF::WhileRegionOp op) { + // For WhileRegion, post canonicalization all resource used within the + // body and condition regions are replaced with captured values, so we do not + // need to take into account the body and condition region arguments. + RegionResourceHoister hoister(op); + + if (failed(hoister.Analyze())) return failure(); + + // If there are no resource region captures, then nothing to do. + if (!hoister.NeedsLifting()) return success(); + + // The resources captured for While loop fall into two categories: + // (a) read-only. These reads can be replaced by a hoisted read created + // before the WhileOp (similar to if and case). + // (b) written: since the value is written in the loop (which can only in + // loop body, all these will become loop variables. 
Since all resource + // variables are removed from the loop variabled during + // canonicalizationW, we need to create new operand/result slots. The + // input operands for these slots are the read values + // prior to the op, and all references to these are replaced by the + // corresponding slot argument. We need to generate writes following + // the while for these resources. + // + // Note that for WhileRegion ops, if a resource is written, it will be written + // only in the body and not the condition, so the hoister analysis will infer + // it as needing a read as well. + + // Generate hoisted reads before the while. + hoister.GenerateHoistedReads(); + + // Replace just the read-only resources with the hoisted reads. + hoister.ReplaceResourceLoads(/*read_only=*/true); + + // For written resources, add additional operands to the while op. + int num_old_results = op.getNumResults(); + int num_new_results = hoister.GetLiftedNumResults(); + int num_extra_results = num_new_results - num_old_results; + + SmallVector new_result_types; + SmallVector new_while_operands; + new_result_types.resize(num_extra_results); + new_while_operands.resize(num_extra_results); + + for (auto& it : hoister.GetResources()) { + if (!it.second.is_written) continue; + int index = it.second.result_index - num_old_results; + new_result_types[index] = it.second.data_type; + new_while_operands[index] = it.second.hoisted_read; + } + op.getOperation()->insertOperands(op.getNumOperands(), new_while_operands); + + // Patch the cond and body regions to have additional arguments, and replace + // the remaining resource reads (which will be resource reads for written + // resources) with these arguments. + for (Region* region : op.getRegions()) { + region->addArguments(new_result_types); + // Point hoisted read for written resources to the region's arguments. + for (auto& it : hoister.GetResources()) { + if (!it.second.is_written) continue; + it.second.hoisted_read = region->getArgument(it.second.result_index); + } + hoister.ReplaceResourceLoads(*region, /*read_only=*/false); + } + + // Add additional return values to body return. These correspond to values + // written to resources in the body region. + hoister.AppendResourceStoreValueToReturn(op.getRegions().drop_front()); + + // Finally, create a new while with additional return values. + hoister.ReplaceOpWithNewOp(); + return success(); +} + +// Lift resources out of the regions attached to `op` +LogicalResult RegionResourceHoister::ReplaceOpWithNewOp(Operation* op) { + if (auto while_op = dyn_cast(op)) + return HoistResourcesOutOfWhileRegion(while_op); + return HoistResourcesOutOfIfCaseCluster(op); +} + // Holds information about a function's use of a resource argument. struct ResourceArgUseInfo { + // Data type of the data contained in the resource. Type data_type; + // Is the resource argument used in an assign op? bool updated; + // Is the resource argument used in a read or assign op? 
bool used; }; @@ -348,12 +669,12 @@ struct ResourceArgUseInfo { LogicalResult FindResourceArgUseInfo( FuncOp func_op, llvm::SmallDenseMap* result) { auto return_op = func_op.front().getTerminator(); - for (auto arg : func_op.getArguments()) { - if (!getElementTypeOrSelf(arg.getType()).isa()) continue; + for (auto arg : TF::filter_resources(func_op.getArguments())) { ResourceArgUseInfo info; info.used = false; info.updated = false; bool read_or_assigned = false; + bool used_in_unsupported_op = false; for (auto user : arg.getUsers()) { if (user == return_op) continue; info.used = true; @@ -362,14 +683,21 @@ LogicalResult FindResourceArgUseInfo( info.data_type = read.getType(); continue; } + if (auto assign = llvm::dyn_cast(user)) { read_or_assigned = true; info.updated = true; info.data_type = assign.value().getType(); continue; } + + used_in_unsupported_op = true; + break; } - if (!info.used || read_or_assigned) (*result)[arg.getArgNumber()] = info; + + // If the arg is used in an unsupported op, skip lifting it. + if (used_in_unsupported_op) continue; + (*result)[arg.getArgNumber()] = info; } return success(); } @@ -455,59 +783,59 @@ void RemoveUnusedResourceArgumentsAndForwardedRetvals( // signature. resource_data_types is the (index, data type) pair for each // resource argument. handle_updated_arg_value is a caller-provided function // that handles the updated value for an resource argument. -void LiftArgRetResourcesForFunction( +LogicalResult LiftArgRetResourcesForFunction( FuncOp func_op, const llvm::SmallDenseMap& resource_data_types, llvm::function_ref handle_updated_arg_value) { - ForwardStoreToLoad(&func_op.front()); - // Maps a resource argument to the first read. - llvm::SmallDenseMap resource_arg_read; - // Maps a resource argument to the last write. - llvm::SmallDenseMap resource_arg_write; - // Use HoistResourceLoads to CSE loads and the `move_load` function only - // records the remaining load to resource_arg_read. - HoistResourceLoads( - &func_op.front(), - /*skip_load=*/ - [&](TF::ReadVariableOp read) { - return !read.resource().isa(); - }, - /*move_load=*/ - [&](TF::ReadVariableOp read) { - resource_arg_read[read.resource()] = read; - }); - // Record the stores in resource_arg_read. - for (auto& op : llvm::make_early_inc_range(func_op.front())) { - auto write = llvm::dyn_cast(&op); - if (!write) continue; - auto arg = write.resource().dyn_cast(); - if (!arg) continue; - // After ForwardStoreToLoad(), there should be just one store for each - // resource. - resource_arg_write[arg] = write; - } - // Now change the input types to non-resource and remove the internal loads. - auto new_types = llvm::to_vector<8>(func_op.getType().getInputs()); - for (auto& entry : resource_data_types) { - auto arg = func_op.getArgument(entry.getFirst()); - auto read_it = resource_arg_read.find(arg); - auto write_it = resource_arg_write.find(arg); - arg.setType(entry.getSecond()); - new_types[arg.getArgNumber()] = entry.getSecond(); - if (read_it != resource_arg_read.end()) { - read_it->getSecond().replaceAllUsesWith(arg); - read_it->getSecond().erase(); - } - if (write_it != resource_arg_write.end()) { - handle_updated_arg_value(arg.getArgNumber(), - write_it->getSecond().value()); - write_it->getSecond().erase(); + RegionResourceHoister hoister(func_op); + if (failed(hoister.Analyze())) return failure(); + + // Each of these resources could be read or written in the function. If its + // read, we need to replace the resource arg with a value arg to get the + // read value. 
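  // For example (an illustrative case, not taken from this change): a function
  // argument of type tensor<*x!tf.resource<tensor<f32>>> that is only read has
  // its type rewritten to tensor<f32>, and the tf.ReadVariableOp in the body is
  // replaced by the argument itself.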
If its written, we need to replace the write with an additional + // value to be written. + + // Now create read values that will be used to replace each resource that + // is read in the function body. These read vaulues are just the same argument + // with type replaced. + llvm::SmallVector skipped_args; + for (auto& it : hoister.GetResources()) { + BlockArgument arg = it.first.dyn_cast(); + assert(arg && "Expect resources for FuncOp to be its arguments"); + auto type_iter = resource_data_types.find(arg.getArgNumber()); + if (type_iter == resource_data_types.end()) { + // Skip lifting the resource if it's not present in the data type map. + // This indicates that the resource is not to be lifted because it is used + // in an unsupported op in some other function. + skipped_args.push_back(arg); + } else { + arg.setType(type_iter->second); + it.second.hoisted_read = arg; } } - func_op.setType(FunctionType::get( - new_types, - llvm::to_vector<4>(func_op.front().getTerminator()->getOperandTypes()), - func_op.getContext())); + + // Drop all the args that have to be skipped. + for (Value arg : skipped_args) hoister.DropResource(arg); + + hoister.ReplaceResourceLoads(/*read_only=*/false); + + // For writes, invoke the callback and then erase the write. + auto assign_ops = func_op.front().getOps(); + for (auto assign_variable_op : llvm::make_early_inc_range(assign_ops)) { + Value resource = assign_variable_op.resource(); + if (!hoister.Contains(resource)) continue; + + auto arg = resource.dyn_cast(); + handle_updated_arg_value(arg.getArgNumber(), assign_variable_op.value()); + assign_variable_op.erase(); + } + + func_op.setType( + FunctionType::get(func_op.front().getArgumentTypes(), + func_op.front().getTerminator()->getOperandTypes(), + func_op.getContext())); + + return success(); } // Returns a vector filtered from range where the unused elements (specified by @@ -556,29 +884,7 @@ void AddLoadsStoresOutsideControlFlowOp( // Lifts loads/stores from while loop's body and cond functions. LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { - // Remove identity nodes to avoid aliasing. - RemoveIdentity(&body.front()); - RemoveIdentity(&cond.front()); auto return_op = body.front().getTerminator(); - // Sanity check: body resource input/output should alias each other. - for (auto arg : body.getArguments()) { - if (!getElementTypeOrSelf(arg.getType()).isa()) continue; - if (return_op->getOperand(arg.getArgNumber()) != arg) { - return return_op->emitOpError( - "resource used in while loop is only supported when the ") - << "resource input and output alias each other in the loop body."; - } - } - // FindResourceArgUseInfo will check supported resource ops (read and assign), - // but loop condition has additional requirement that it cannot write - // resources. - if (cond.walk([&](TF::AssignVariableOp assign) { - assign.emitOpError("found resource write in loop condition."); - return WalkResult::interrupt(); - }) - .wasInterrupted()) { - return failure(); - } llvm::SmallDenseMap body_use_info; llvm::SmallDenseMap cond_use_info; if (failed(FindResourceArgUseInfo(body, &body_use_info)) || @@ -589,12 +895,7 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { auto resource_arg_uses = MergeArgResourceUseInfo(body_use_info, cond_use_info); if (resource_arg_uses.empty()) return success(); - for (const auto& entry : resource_arg_uses) { - // Replace output resource uses with the input, so that we can later freely - // change the output type. 
- while_op.getResult(entry.getFirst()) - .replaceAllUsesWith(while_op.getOperand(entry.getFirst())); - } + // Remove unused resources in functions. llvm::SmallVector old_to_new_indices; llvm::SmallDenseMap remaining_resource_data_types; @@ -647,50 +948,8 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { // Lifts loads/stores from an IfOp or CaseOp's branches. template LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { - // Remove identity nodes to avoid aliasing. - for (auto func : branches) RemoveIdentity(&func.front()); - - // Sanity check: branch return of resources should be aliases of inputs. If - // so, replace the output uses with the input so that we can remove these - // outputs. - for (OpResult result : op.getResults()) { - if (!getElementTypeOrSelf(result.getType()).isa()) - continue; - unsigned result_index = result.getResultNumber(); - constexpr unsigned kUnassigned = -1; - unsigned common_aliasing_arg_num = kUnassigned; - for (auto func : branches) { - auto retval = func.front().getTerminator()->getOperand(result_index); - assert(result.getType() == retval.getType()); - auto aliasing_arg = retval.dyn_cast(); - if (!aliasing_arg) - return op.emitOpError("unsupported output: ") - << "resource does not alias input"; - if (common_aliasing_arg_num == kUnassigned) - common_aliasing_arg_num = aliasing_arg.getArgNumber(); - if (aliasing_arg.getArgNumber() != common_aliasing_arg_num) - return op.emitOpError("unsupported output: ") - << "resource does not alias a single input"; - } - assert(common_aliasing_arg_num != kUnassigned); - result.replaceAllUsesWith(op.getOperand(common_aliasing_arg_num + 1)); - } - - // Erase the resource outputs from the branches. - int64_t non_resource_results = 0; - llvm::SmallVector old_to_new_output_indices; - bool output_removed = false; - for (auto result : op.getResults()) { - if (!getElementTypeOrSelf(result.getType()) - .template isa()) { - old_to_new_output_indices.push_back(non_resource_results++); - continue; - } - old_to_new_output_indices.push_back(-1); - for (auto func : branches) - func.front().getTerminator()->eraseOperand(non_resource_results); - output_removed = true; - } + // For canonicalized If/Case, there should not be any resource outputs + int64_t non_resource_results = op.getNumResults(); llvm::SmallDenseMap resource_arg_uses; if (failed(FindResourceArgUseInfo(branches.front(), &resource_arg_uses))) @@ -705,7 +964,7 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { MergeArgResourceUseInfo(resource_arg_uses, branch_use_info); } - if (resource_arg_uses.empty() && !output_removed) return success(); + if (resource_arg_uses.empty()) return success(); // Remove unused resources in functions. llvm::SmallDenseMap remaining_resource_data_types; RemoveUnusedResourceArgumentsAndForwardedRetvals( @@ -780,12 +1039,7 @@ LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef branches) { AddLoadsStoresOutsideControlFlowOp(new_op, arg_data_type_and_updated_output_index); // Replace uses. - for (int64_t i = 0, end = old_to_new_output_indices.size(); i < end; ++i) { - if (old_to_new_output_indices[i] >= 0) { - op.getResult(i).replaceAllUsesWith( - new_op.getResult(old_to_new_output_indices[i])); - } - } + op.replaceAllUsesWith(new_op.getResults().take_front(op.getNumResults())); op.erase(); return success(); } @@ -811,8 +1065,6 @@ struct PartitionedCallLiftingInfo { // happens on a clone, which will be stored in `result`. 
LogicalResult HandlePartitionedCallOpCallee( FuncOp callee, PartitionedCallLiftingInfo* result) { - // Remove identity nodes to avoid aliasing. - RemoveIdentity(&callee.front()); // Sanity check: return of resources should be aliases of inputs. Such outputs // will be removed later. int64_t non_resource_results = 0; @@ -932,8 +1184,8 @@ void UpdatePartitionedCallOpWithNewCallee( call_op.erase(); } -LogicalResult HoistForFunctionalControlFlow( - Block*, ModuleOp, +LogicalResult HoistForControlFlow( + Block*, ModuleOp, bool, llvm::SmallDenseMap*); // A templated routine for handling both PartitionedCallOp and @@ -942,14 +1194,17 @@ LogicalResult HoistForFunctionalControlFlow( // flow, then performs lifting on the callee. template LogicalResult HandlePartitionedCallOp( - CallOpType call_op, FuncOp callee, ModuleOp module, + CallOpType call_op, FuncOp callee, ModuleOp module, bool vars_initialized, llvm::SmallDenseMap* lifted_callees) { auto emplace_res = lifted_callees->try_emplace(callee.getName(), PartitionedCallLiftingInfo()); if (emplace_res.second) { // Unseen callee. Perform resource lifting on it. - HoistForFunctionalControlFlow(&callee.front(), module, lifted_callees); + if (failed(HoistForControlFlow(&callee.front(), module, vars_initialized, + lifted_callees))) + return failure(); + if (failed(HandlePartitionedCallOpCallee( callee, &emplace_res.first->getSecond()))) { return failure(); @@ -961,50 +1216,49 @@ LogicalResult HandlePartitionedCallOp( // Hoists resource loads/stores from control flow ops in `block` outside the // body/cond/branch/callee functions. -LogicalResult HoistForFunctionalControlFlow( - Block* block, ModuleOp module, +LogicalResult HoistForControlFlow( + Block* block, ModuleOp module, bool vars_initialized, llvm::SmallDenseMap* lifted_partitioned_call_callees) { - // Remove identity nodes to avoid aliasing. - RemoveIdentity(block); + if (vars_initialized) SetAllVarIsInitializedToTrue(block); + for (Operation& op : llvm::make_early_inc_range(*block)) { if (auto while_op = llvm::dyn_cast(&op)) { - auto body = while_op.body_func(); - auto cond = while_op.cond_func(); + auto body = while_op.body_function(); + auto cond = while_op.cond_function(); // Recursively handle the nested control flow. - HoistForFunctionalControlFlow(&body.front(), module, - lifted_partitioned_call_callees); - HoistForFunctionalControlFlow(&cond.front(), module, - lifted_partitioned_call_callees); + HoistForControlFlow(&body.front(), module, vars_initialized, + lifted_partitioned_call_callees); + HoistForControlFlow(&cond.front(), module, vars_initialized, + lifted_partitioned_call_callees); if (failed(HandleWhileLoop(while_op, body, cond))) return failure(); } else if (auto if_op = llvm::dyn_cast(&op)) { - auto then_branch = if_op.then_func(); - auto else_branch = if_op.else_func(); + auto then_branch = if_op.then_function(); + auto else_branch = if_op.else_function(); // Recursively handle the nested control flow. 
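      // (The branch bodies are processed first, so any control flow nested
      // inside them has already had its resource accesses hoisted by the time
      // HandleCaseOrIfOp lifts this If itself.)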
- HoistForFunctionalControlFlow(&then_branch.front(), module, - lifted_partitioned_call_callees); - HoistForFunctionalControlFlow(&else_branch.front(), module, - lifted_partitioned_call_callees); + HoistForControlFlow(&then_branch.front(), module, vars_initialized, + lifted_partitioned_call_callees); + HoistForControlFlow(&else_branch.front(), module, vars_initialized, + lifted_partitioned_call_callees); if (failed(HandleCaseOrIfOp(if_op, {then_branch, else_branch}))) return failure(); } else if (auto case_op = llvm::dyn_cast(&op)) { SmallVector branch_functions; - branch_functions.reserve(case_op.branches().size()); - for (const Attribute& branch : case_op.branches()) { - FuncOp func = - module.lookupSymbol(branch.cast()); + case_op.get_branch_functions(branch_functions); + for (FuncOp func : branch_functions) { // Recursively handle the nested control flow. - HoistForFunctionalControlFlow(&func.front(), module, - lifted_partitioned_call_callees); - branch_functions.push_back(func); + HoistForControlFlow(&func.front(), module, vars_initialized, + lifted_partitioned_call_callees); } if (failed(HandleCaseOrIfOp(case_op, branch_functions))) return failure(); } else if (auto call_op = llvm::dyn_cast(&op)) { auto callee = call_op.func(); - if (!callee) + if (!callee) { return call_op.emitOpError( "resource lifting does not support call with nested references."); + } if (failed(HandlePartitionedCallOp(call_op, callee, module, + vars_initialized, lifted_partitioned_call_callees))) { // Nested control flow handling is done in HandlePartitionedCallOp(). return failure(); @@ -1012,29 +1266,23 @@ LogicalResult HoistForFunctionalControlFlow( } else if (auto call_op = llvm::dyn_cast(&op)) { if (failed(HandlePartitionedCallOp(call_op, call_op.func(), module, + vars_initialized, lifted_partitioned_call_callees))) { return failure(); } + } else if (isa(op)) { + for (Region& region : op.getRegions()) + HoistForControlFlow(®ion.front(), module, vars_initialized, + lifted_partitioned_call_callees); + LogicalResult result = RegionResourceHoister::ReplaceOpWithNewOp(&op); + if (failed(result)) return failure(); } } - // Remove unused local variables. + // After we have hoisted operations in the block, we may have added new read + // and writes of resources to this block. Clean them up by doing store-load + // forwarding. 
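// A rough sketch of the forwarding referred to above, assuming a single block
// in which only tf.ReadVariableOp/tf.AssignVariableOp touch the resources (the
// real ForwardStoreToLoad defined earlier in this file is the authoritative,
// more careful version). The helper name is hypothetical.
void ForwardStoresToLoadsSketch(Block* block) {
  // Latest value written to each resource seen so far in program order.
  llvm::SmallDenseMap<Value, Value, 4> last_stored;
  for (Operation& op : llvm::make_early_inc_range(*block)) {
    if (auto write = llvm::dyn_cast<TF::AssignVariableOp>(&op)) {
      last_stored[write.resource()] = write.value();
    } else if (auto read = llvm::dyn_cast<TF::ReadVariableOp>(&op)) {
      auto it = last_stored.find(read.resource());
      if (it == last_stored.end()) continue;
      // The read would just return the previously stored value; forward it.
      read.replaceAllUsesWith(it->second);
      read.erase();
    }
  }
}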
ForwardStoreToLoad(block); - llvm::SmallVector local_vars; - for (Operation& op : *block) { - if (auto local_var = llvm::dyn_cast(&op)) { - local_vars.push_back(local_var); - } - } - for (auto local_var : local_vars) { - if (llvm::all_of(local_var.resource().getUsers(), - [](const Operation* user) { - return isa(user); - })) { - for (auto user : local_var.resource().getUsers()) user->erase(); - local_var.erase(); - } - } return success(); } @@ -1045,19 +1293,23 @@ void ResourceOpLiftingPass::runOnOperation() { llvm::SmallDenseMap lifted_partitioned_call_callees; ModuleOp module = getOperation(); - auto result = module.walk([&](FuncOp func_op) { + + if (failed(TF::CleanupAndCanonicalizeForResourceOpLifting(module))) + return signalPassFailure(); + + auto walk_result = module.walk([&](FuncOp func_op) { return func_op.walk([&](tf_device::ClusterOp cluster) { - if (failed(HoistForFunctionalControlFlow( - &cluster.GetBody(), module, &lifted_partitioned_call_callees)) || - failed(HoistResourceOpsFromCluster(cluster, module))) { - return WalkResult::interrupt(); - } + LogicalResult result = HoistForControlFlow( + &cluster.GetBody(), module, /*vars_initialized=*/true, + &lifted_partitioned_call_callees); + if (failed(result)) return WalkResult::interrupt(); + result = RegionResourceHoister::ReplaceOpWithNewOp(cluster); + if (failed(result)) return WalkResult::interrupt(); return WalkResult::advance(); }); }); - if (result.wasInterrupted()) { - signalPassFailure(); - } + + if (walk_result.wasInterrupted()) return signalPassFailure(); } struct ResourceOpLiftingForMainFunctionPass @@ -1107,11 +1359,20 @@ LogicalResult ResourceLiftingForFunctionalControlFlow(FuncOp function) { << function.getBlocks().size(); } + if (failed(TF::CleanupAndCanonicalizeForResourceOpLifting(function))) + return failure(); + llvm::SmallDenseMap lifted_partitioned_call_callees; - return HoistForFunctionalControlFlow(&function.front(), - cast(function.getParentOp()), - &lifted_partitioned_call_callees); + if (failed(HoistForControlFlow( + &function.front(), cast(function.getParentOp()), + /*vars_initialized=*/false, &lifted_partitioned_call_callees))) + return failure(); + + // Clean up and canonicalize to remove dead local variables as some local + // variables might be dead after hoisting resource loads/stores from control + // flow ops. + return TF::CleanupAndCanonicalizeForResourceOpLifting(function); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc new file mode 100644 index 00000000000..b635096cc9b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc @@ -0,0 +1,459 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h" + +#include "llvm/ADT/BitVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace { + +bool IsResource(Value value) { + return getElementTypeOrSelf(value.getType()).isa(); +} + +// Removes identity nodes in the block. The device computation does not need +// such nodes to carry information. +void RemoveIdentity(Block &block) { + for (auto &op : llvm::make_early_inc_range(block)) { + if (isa(&op)) { + op.replaceAllUsesWith(op.getOperands()); + op.erase(); + } + } +} + +// Eliminate local variables that are only assigned to but never read, and thus +// are dead. +void RemoveDeadLocalVariables(Block &block) { + llvm::SmallVector local_vars; + for (Operation &op : block) { + if (auto local_var = llvm::dyn_cast(&op)) { + local_vars.push_back(local_var); + } + } + for (auto local_var : local_vars) { + auto users = local_var.resource().getUsers(); + if (llvm::all_of(users, [](const Operation *user) { + return isa(user); + })) { + for (auto user : llvm::make_early_inc_range(users)) user->erase(); + local_var.erase(); + } + } +} + +LogicalResult CleanupAndCanonicalize(Operation *parent_op); + +// Eliminates unusued results from an operation `op` by cloning it with reduced +// result types and doing appropriate use replacements. `results_to_eliminate` +// is a bitvector of result positions to eliminate. If its null, then all unused +// results of the operation will be eliminated. +void EliminateUnusedResults( + Operation *op, const llvm::BitVector *results_to_eliminate = nullptr) { + auto can_eliminate = [&](OpResult &result) -> bool { + if (!result.use_empty()) return false; + if (results_to_eliminate) + return results_to_eliminate->test(result.getResultNumber()); + else + return true; + }; + SmallVector new_result_types; + for (OpResult result : op->getResults()) { + if (can_eliminate(result)) continue; + new_result_types.push_back(result.getType()); + } + + // Rebuild the new operation with lesser number of results. + OpBuilder builder(op); + Operation *new_op = Operation::create( + op->getLoc(), op->getName(), new_result_types, op->getOperands(), + op->getAttrs(), op->getSuccessors(), op->getNumRegions()); + builder.insert(new_op); + + // Move region bodies to the new operation. + for (auto it : llvm::zip(op->getRegions(), new_op->getRegions())) { + Region &old_region = std::get<0>(it); + Region &new_region = std::get<1>(it); + new_region.takeBody(old_region); + } + + // Replace used results and erase the old op. + int next_result_idx = 0; + for (OpResult result : op->getResults()) { + if (can_eliminate(result)) continue; + result.replaceAllUsesWith(new_op->getResult(next_result_idx++)); + } + op->erase(); +} + +// Clones a function if it cannot be patched in place. Clone if there are +// multiple uses or unknown uses (for external functions). The cloned function +// will be marked as private. 
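+// Callers that receive a clone are expected to repoint their function
+// reference attribute at it, e.g. (a sketch only; "then_branch" is tf.If's
+// attribute name and is used here purely for illustration):
+//
+//   op->setAttr("then_branch",
+//               FlatSymbolRefAttr::get(cloned.getName(), op->getContext()));
+//
+// EliminateUnusedResultsForIfCase below does this generically by scanning the
+// op's symbol reference attributes.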
+FuncOp CloneFunctionIfNeeded(FuncOp func) { + ModuleOp module = func.getParentOfType(); + auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); + if (func_uses.hasValue() && llvm::hasSingleElement(func_uses.getValue())) + return func; + FuncOp cloned = func.clone(); + cloned.setVisibility(SymbolTable::Visibility::Private); + cloned.setName(func.getName().str() + "_lifted"); + SymbolTable(module).insert(cloned); + return cloned; +} + +// Eliminates unused results for If/Case operations. Also patches up the +// branch functions to (a) drop the ununsed return values, and (b) as a result +// if some argument becomes unused in all branches, drop that argument and the +// corresponding if/case input operand. +void EliminateUnusedResultsForIfCase(Operation *op, ArrayRef branches) { + // Clone branch functions if needed since we will be mutating them. + SmallVector cloned_branches; + cloned_branches.reserve(branches.size()); + for (FuncOp func : branches) { + FuncOp cloned = CloneFunctionIfNeeded(func); + cloned_branches.push_back(cloned); + if (cloned == func) continue; + // Patch up the op attribute to point to the new function. + for (NamedAttribute attr : op->getAttrs()) { + auto symref = attr.second.dyn_cast(); + if (!symref) continue; + if (symref.getValue() != func.getName()) continue; + op->setAttr(attr.first, + FlatSymbolRefAttr::get(cloned.getName(), op->getContext())); + break; + } + } + + // Traverse results backward so that indices to be deleted stay unchanged. + for (OpResult result : llvm::reverse(op->getResults())) { + if (!result.use_empty()) continue; + int result_idx = result.getResultNumber(); + for (FuncOp func : cloned_branches) + func.front().getTerminator()->eraseOperand(result_idx); + } + + // Check which function arguments are unused in all branches. We can drop + // those as well. + int num_args = cloned_branches[0].getNumArguments(); + llvm::BitVector used_args(num_args); + for (FuncOp func : branches) { + for (BlockArgument arg : func.getArguments()) { + if (!arg.use_empty()) used_args.set(arg.getArgNumber()); + } + } + + // There are some unused args that we can drop. Also drop the corresponding + // input operand. + if (used_args.count() != num_args) { + // Traverse arguments backward so that indices to be deleted stay unchanged. + for (int idx = num_args - 1; idx >= 0; --idx) { + if (used_args.test(idx)) continue; + for (FuncOp func : cloned_branches) func.eraseArgument(idx); + // For if/case, arg #i of attached function corresponds to operand #i+1 + op->eraseOperand(idx + 1); + } + } + + // Patch up function types (with less number of return values and potentially + // less number of arguments) + for (FuncOp func : cloned_branches) { + func.setType(FunctionType::get( + func.front().getArgumentTypes(), + func.front().getTerminator()->getOperandTypes(), func.getContext())); + } + + EliminateUnusedResults(op); +} + +// Eliminated unused results from a functional while. 
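+// A while result can only be dropped when it is a genuine pass-through: for
+// example, if %while#2 is unused, the cond function never uses its argument
+// #2, and the body function's only use of argument #2 is to return it
+// unchanged in position #2, then operand/result #2 and the matching cond/body
+// arguments can all be erased without changing behavior.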
+void EliminateUnusedResultsForWhile(TF::WhileOp op) { + FuncOp cond = op.cond_function(); + FuncOp body = op.body_function(); + + llvm::BitVector can_eliminate(op.getNumResults()); + for (OpResult result : llvm::reverse(op.getResults())) { + if (!result.use_empty()) continue; + int result_idx = result.getResultNumber(); + BlockArgument cond_arg = cond.getArgument(result_idx); + BlockArgument body_arg = cond.getArgument(result_idx); + Operation *body_ret = body.front().getTerminator(); + // We can eliminate a result if its unused and the corresponding argument + // is unused in cond and the only use in body is use it as a return value. + if (cond_arg.use_empty() && body_arg.hasOneUse() && + body_arg.use_begin()->getOperandNumber() == result_idx && + body_arg.use_begin()->getOwner() == body_ret) { + can_eliminate.set(result_idx); + } + } + + if (can_eliminate.empty()) return; + + FuncOp cloned_cond = CloneFunctionIfNeeded(cond); + FuncOp cloned_body = CloneFunctionIfNeeded(body); + op.condAttr(FlatSymbolRefAttr::get(cloned_cond.getName(), op.getContext())); + op.bodyAttr(FlatSymbolRefAttr::get(cloned_body.getName(), op.getContext())); + + // Drop cond/body args and return value. WhileOp result will be dropped later + // in EliminateUnusedResults. Traverse in reverse order so that indices to be + // deleted stay unchanged. + for (int idx = op.getNumResults() - 1; idx >= 0; --idx) { + if (!can_eliminate.test(idx)) continue; + cloned_cond.eraseArgument(idx); + cloned_body.front().getTerminator()->eraseOperand(idx); + cloned_body.eraseArgument(idx); + } + + // Patch up branch function types. + for (FuncOp func : {cloned_cond, cloned_body}) { + func.setType(FunctionType::get( + func.front().getArgumentTypes(), + func.front().getTerminator()->getOperandTypes(), func.getContext())); + } + EliminateUnusedResults(op, &can_eliminate); +} + +// For resource results, replace all uses with the resource input to which the +// result is tied to. After this, resource outputs of this op are expected to be +// unused. +LogicalResult ForwardCommonArgToOutput(Operation *op, ArrayRef branches, + ValueRange branch_args, + bool &has_resource_result) { + // For while, the branch inputs and outputs need to match. + bool io_match = isa(op); + + has_resource_result = false; + // Check if the same input argument number is passed through all functions. + for (OpResult result : op->getResults()) { + if (!IsResource(result)) continue; + + has_resource_result = true; + int result_idx = result.getResultNumber(); + Optional common_arg_index; + for (FuncOp func : branches) { + auto ret = func.front().getTerminator(); + auto block_arg = ret->getOperand(result_idx).dyn_cast(); + if (!block_arg) { + return op->emitOpError("result #") + << result_idx << " not tied to function argument for branch @" + << func.getName(); + } + if (!common_arg_index.hasValue()) { + common_arg_index = block_arg.getArgNumber(); + } else if (common_arg_index.getValue() != block_arg.getArgNumber()) { + return op->emitError("result #") + << result_idx + << " is not tied to the same argument across all branches"; + } + } + + if (io_match && result_idx != common_arg_index.getValue()) { + return op->emitOpError("Result #") + << result_idx << " is tied to argument #" + << common_arg_index.getValue(); + } + + // Forward the corresponding input to the output + result.replaceAllUsesWith(branch_args[common_arg_index.getValue()]); + } + return success(); +} + +// Canonicalizes a function if. 
Forwards input argument to resource results and +// then deletes the resource results. +LogicalResult CanonicalizeFunctionalIfCase(Operation *op, + ArrayRef branches, + ValueRange branch_args) { + for (FuncOp func : branches) { + if (failed(CleanupAndCanonicalize(func))) return failure(); + } + + bool has_resource_result = false; + if (failed(ForwardCommonArgToOutput(op, branches, branch_args, + has_resource_result))) + return failure(); + + // If no resource type results were found, no further cleanup needed. + if (!has_resource_result) return success(); + + // Drop unused results. + EliminateUnusedResultsForIfCase(op, branches); + return success(); +} + +// Canonicalizes a functional while. Forwards common argument to results and +// drop resource results if posible. +LogicalResult CanonicalizeFunctionalWhile(TF::WhileOp op) { + for (FuncOp func : {op.cond_function(), op.body_function()}) { + if (failed(CleanupAndCanonicalize(func))) return failure(); + } + + // For while, just use the body function to forward operand to result. + bool has_resource_result = false; + if (failed(ForwardCommonArgToOutput(op, {op.body_function()}, + op.getOperands(), has_resource_result))) + return failure(); + // If no resource type results were found, no further cleanup needed. + if (!has_resource_result) return success(); + + // Drop unused results. + EliminateUnusedResultsForWhile(op); + return success(); +} + +// Canonicalizes region based if/case and cluster operations. If the same +// captured resource typed value is used for all region results, then that value +// is forwared to the result and the result is dropped. +LogicalResult CanonicalizeRegionIfCaseCluster(Operation *op) { + // Check if the same value is used for all region results for this output. + bool has_resource_result = false; + for (OpResult result : op->getResults()) { + if (!IsResource(result)) continue; + has_resource_result = true; + int result_idx = result.getResultNumber(); + + Value ret0 = + op->getRegion(0).front().getTerminator()->getOperand(result_idx); + for (Region ®ion : op->getRegions().drop_front()) { + Value ret = region.front().getTerminator()->getOperand(result_idx); + if (ret != ret0) { + return op->emitError("Result #") + << result_idx + << " not tied to the same capture across all regions"; + } + } + result.replaceAllUsesWith(ret0); + } + + if (!has_resource_result) return success(); + + // Eliminate unused region results. Traverse in reverse order so that + // indices to be deleted stay unchanged. + for (OpResult result : llvm::reverse(op->getResults())) { + if (!result.use_empty()) continue; + int result_idx = result.getResultNumber(); + for (Region ®ion : op->getRegions()) + region.front().getTerminator()->eraseOperand(result_idx); + } + EliminateUnusedResults(op); + return success(); +} + +// Canonicalizes a region based while. If the same value is passed through +// the body, the result is replaced with the operand and all argument/results +// and retuns values corresponding to that result are dropped. +LogicalResult CanonicalizeWhileRegion(TF::WhileRegionOp op) { + Region &body = op.body(); + Region &cond = op.cond(); + llvm::BitVector can_eliminate(op.getNumResults()); + + // Traverse in reverse order so that indices to be deleted stay unchanged. 
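+  // (Erasing a result or argument renumbers everything after it: with results
+  // {#0, #1, #2}, erasing #1 first turns the old #2 into #1, so a later erase
+  // of "#2" would hit the wrong value. Walking back-to-front keeps the indices
+  // still to be visited stable.)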
+ for (OpResult result : llvm::reverse(op.getResults())) { + if (!IsResource(result)) continue; + int result_idx = result.getResultNumber(); + auto body_arg = body.front() + .getTerminator() + ->getOperand(result_idx) + .dyn_cast(); + if (!body_arg || body_arg.getArgNumber() != result_idx) { + return op.emitOpError("Result #") << result_idx << " is not tied to arg #" + << result_idx << " of the body"; + } + body.getArgument(result_idx).replaceAllUsesWith(op.getOperand(result_idx)); + cond.getArgument(result_idx).replaceAllUsesWith(op.getOperand(result_idx)); + body.front().getTerminator()->eraseOperand(result_idx); + body.eraseArgument(result_idx); + cond.eraseArgument(result_idx); + result.replaceAllUsesWith(op.getOperand(result_idx)); + op.getOperation()->eraseOperand(result_idx); + can_eliminate.set(result_idx); + } + EliminateUnusedResults(op, &can_eliminate); + return success(); +} + +// Removes identities and canonicalizes all operations within `parent_op`. +LogicalResult CleanupAndCanonicalize(Operation *parent_op) { + auto walk_result = parent_op->walk([](Operation *op) { + // Cleanup code in attached regions. + for (Region ®ion : op->getRegions()) { + if (!llvm::hasSingleElement(region)) return WalkResult::interrupt(); + RemoveIdentity(region.front()); + RemoveDeadLocalVariables(region.front()); + } + + LogicalResult result = success(); + + // While condition cannot write to resource variables. + auto check_while_cond = [&](TF::AssignVariableOp assign) { + op->emitOpError("found resource write in loop condition."); + return WalkResult::interrupt(); + }; + + if (auto if_op = dyn_cast(op)) { + result = CanonicalizeFunctionalIfCase( + op, {if_op.then_function(), if_op.else_function()}, if_op.input()); + } else if (auto case_op = dyn_cast(op)) { + SmallVector branches; + case_op.get_branch_functions(branches); + result = CanonicalizeFunctionalIfCase(case_op, branches, case_op.input()); + } else if (auto while_op = dyn_cast(op)) { + if (while_op.cond_function().walk(check_while_cond).wasInterrupted()) + return WalkResult::interrupt(); + result = CanonicalizeFunctionalWhile(while_op); + } else if (isa( + op)) { + result = CanonicalizeRegionIfCaseCluster(op); + } else if (auto while_region = dyn_cast(op)) { + if (while_region.cond().walk(check_while_cond).wasInterrupted()) + return WalkResult::interrupt(); + // For while region, the body input and output arg should match. + CanonicalizeWhileRegion(while_region); + } else if (auto call = dyn_cast(op)) { + FuncOp func = dyn_cast(call.resolveCallable()); + if (!func) return WalkResult::interrupt(); + result = CleanupAndCanonicalize(func); + } + return failed(result) ? 
WalkResult::interrupt() : WalkResult::advance(); + }); + + return failure(walk_result.wasInterrupted()); +} + +} // anonymous namespace + +namespace TF { + +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(FuncOp func) { + return CleanupAndCanonicalize(func); +} + +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(ModuleOp module) { + auto walk_result = module.walk([](tf_device::ClusterOp cluster) { + if (failed(CleanupAndCanonicalize(cluster))) return WalkResult::interrupt(); + return WalkResult::advance(); + }); + return failure(walk_result.wasInterrupted()); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h new file mode 100644 index 00000000000..626ef91bcf6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ + +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +// Performs IR cleanup and canonicalization in preparation for Resource Op +// Lifting pass. It does several things: +// - Eliminate identity nodes to remove (most) of resource aliasing +// - Canonicalize functional control flow. For functional control flow we +// expect that any resource output of these ops matches the corresponding +// input, and then forward that input to the output. Fails if this is not the +// case. If successful, the following invariants will hold true: +// (a) For if/case, any resource type results will be deleted. +// (b) For while, any resource type results will be unused. +// - Canonicalize region based control flow. Again, any resource outputs are +// expected to be resolved to be one of the captured resource inputs. Fails +// if this is not the case. If successful, the following invariants will hold +// true: +// (a) For if/case, any resource type results will be deleted. +// (b) For while, any resource type results will be unused. 
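+// Typical use (a sketch mirroring ResourceOpLiftingPass::runOnOperation in
+// resource_op_lifting.cc): run the module-level cleanup before hoisting and
+// abort the pass if canonicalization fails.
+//
+//   if (failed(mlir::TF::CleanupAndCanonicalizeForResourceOpLifting(module)))
+//     return signalPassFailure();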
+namespace mlir { +namespace TF { +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(ModuleOp module); +LogicalResult CleanupAndCanonicalizeForResourceOpLifting(FuncOp func); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_RESOURCE_OP_LIFTING_CLEANUP_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 88ad787df3e..e802353b84c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -41,6 +42,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/FoldInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -51,10 +53,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" -#include "tensorflow/core/framework/op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/types.pb.h" @@ -68,7 +67,7 @@ using tensorflow::shape_inference::ShapeHandle; namespace mlir { namespace TF { namespace { -Optional> InferShapeForFunctionReturnType(FuncOp func) { +Optional InferShapeForFunctionReturnType(FuncOp func) { // Find any return ops. SmallVector return_ops; for (Block& block : func) { @@ -111,17 +110,17 @@ Optional> InferShapeForFunctionReturnType(FuncOp func) { } } - return llvm::to_vector<4>(return_op.getOperandTypes()); + return TypeRange(return_op.getOperandTypes()); } // Returns if the shape inference pass supports an op outside the TF dialect. bool IsSupportedNonTFOp(Operation* op) { - return isa(op); + return isa(op); } // Returns whether a cast back would need to be inserted, e.g., whether the @@ -156,57 +155,6 @@ void UpdateTypeAndInsertIncompatibleUseCasts(Dialect* tf_dialect, Type new_type, result.setType(new_type); } -// Extracts a PartialTensorShape from the MLIR type. -Optional GetShapeFromMlirType(Type t) { - if (auto ranked_type = t.dyn_cast()) { - // Convert the MLIR shape indices (int64_t) to TensorFlow indices - // (int64). - ArrayRef shape = ranked_type.getShape(); - SmallVector tf_shape(shape.begin(), shape.end()); - return tensorflow::PartialTensorShape({tf_shape.data(), tf_shape.size()}); - } - return None; -} - -// Gets the subtype's shape and data type for `type`. Templated to support both -// ResourceType and VariantType. 
-template -std::unique_ptr>> -GetSubtypesHelper(Type type) { - auto type_with_subtypes = - type.cast().getElementType().dyn_cast(); - if (!type_with_subtypes || type_with_subtypes.getSubtypes().empty()) { - return nullptr; - } - auto shapes_and_types = absl::make_unique>>(); - for (auto subtype : type_with_subtypes.getSubtypes()) { - auto shape = GetShapeFromMlirType(subtype); - // handle_shapes_and_types requires all shapes to be known. So if any - // subtype is unknown, clear the vector. - if (!shape) { - shapes_and_types = nullptr; - break; - } - tensorflow::DataType dtype; - auto status = - tensorflow::ConvertToDataType(subtype.getElementType(), &dtype); - assert(status.ok() && "Unknown element type"); - shapes_and_types->emplace_back(*shape, dtype); - } - return shapes_and_types; -} - -// Gets the subtype's shape and data type for `type`. -std::unique_ptr>> -GetSubtypes(Type type) { - auto subclasses = GetSubtypesHelper(type); - if (subclasses) return subclasses; - return GetSubtypesHelper(type); -} - // Returns whether type can be further refined. bool CanBeRefined(Type type) { auto shape_type = type.dyn_cast(); @@ -293,8 +241,8 @@ bool InferShapeForCast(CastOp op, Dialect* tf_dialect) { // function result types. bool InferShapeForIf(IfOp op) { bool changed = false; - auto then_results = op.then_func().getType().getResults(); - auto else_results = op.else_func().getType().getResults(); + auto then_results = op.then_function().getType().getResults(); + auto else_results = op.else_function().getType().getResults(); for (auto it : llvm::zip(op.getResults(), then_results, else_results)) { // If then and else types do not match, skip refinement for that result. if (std::get<1>(it) != std::get<2>(it)) continue; @@ -745,6 +693,11 @@ bool ShapeInference::InferShapeForNonTFDialectOperation(Operation* op) { return RefineTypeForPassThroughOperands(op, terminator->getOperands(), op->getResults()); } + if (auto cluster_op = dyn_cast(op)) { + auto terminator = cluster_op.GetBody().getTerminator(); + return RefineTypeForPassThroughOperands(op, terminator->getOperands(), + op->getResults()); + } if (op->hasTrait()) { return RefineShapeForPassThroughOps(op); } @@ -794,182 +747,54 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { if (auto if_region = dyn_cast(op)) return InferShapeForIfRegion(if_region); - StringRef op_name = op->getName().getStringRef(); - // Drop the `tf.` prefix to query TF registry. - auto node_name = - op_name.drop_front(TensorFlowDialect::getDialectNamespace().size() + 1); - - // Get information from the registry and check if we have a shape function for - // this op. - const tensorflow::OpRegistrationData* op_reg_data = - tensorflow::OpRegistry::Global()->LookUp(node_name.data()); - if (!op_reg_data) { - LLVM_DEBUG(llvm::dbgs() << "Skipping inference for unregistered op '" - << op->getName() << "'.\n"); - return false; - } - if (op_reg_data->shape_inference_fn == nullptr) { - LLVM_DEBUG(llvm::dbgs() - << "Skipping inference for op without shape function '" - << op->getName() << "'.\n"); - return false; - } - - // Convert the operation to a NodeDef to be able to use the InferenceContext - // and the TensorFlow shape function. 
- auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef( - op, node_name, /*ignore_unregistered_attrs=*/true); - if (!node_def_or.ok()) { - LLVM_DEBUG(llvm::dbgs() - << "Error converting op '" << *op << "' to NodeDef: " - << node_def_or.status().error_message() << "\n"); - return false; - } - std::unique_ptr node_def = - std::move(node_def_or).ValueOrDie(); - - // Collect an array with input values for constant operands and input shapes - // for all the operands. - std::vector input_tensors(op->getNumOperands()); - std::vector input_shapes( - op->getNumOperands()); - std::vector tensors(op->getNumOperands()); - std::vector>>> - handle_shapes_and_types(op->getNumOperands()); - for (auto it : llvm::enumerate(op->getOperands())) { - Value operand = it.value(); - size_t index = it.index(); - - // If the operand is constant, then convert it to Tensor. + // Return operand as a constant attribute. + auto operand_as_constant_fn = [&](Value operand) { ValuePort vp(operand); Attribute attr = ComputeOutputComponent(vp); if (!attr && matchPattern(operand, m_Constant(&attr))) RecordValue(vp, attr); - if (attr) { - tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = - tensorflow::ConvertToTensor(attr.cast(), input_tensor); - if (status.ok()) { - input_tensors[index] = input_tensor; - } else { - LLVM_DEBUG(llvm::dbgs() - << "Error converting input " << index << " of op '" << *op - << "' to Tensor: " << status.error_message() << "\n"); - } - } + return attr; + }; - Type operand_type = operand.getType(); - if (auto shape = GetShapeFromMlirType(operand_type)) { - input_shapes[index] = *shape; - } - // Collect the handle shapes and types for a resource/variant. - handle_shapes_and_types[index] = GetSubtypes(operand_type); - } + // Return op result as a shape. + auto op_result_as_shape_fn = [&](InferenceContext& context, + OpResult op_result) { + return ComputeOutputAsShape(op_result, &context); + }; - // Perform the shape inference using an InferenceContext with the input - // shapes. This object is abstracting the information that the ShapeInference - // function operates on. - InferenceContext c(graph_version_, *node_def, op_reg_data->op_def, - input_shapes, input_tensors, - /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); - auto status = c.Run(op_reg_data->shape_inference_fn); - if (!status.ok()) { - LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << *op - << "': " << status.error_message() << "\n"); + // Return result element type at `index`. + auto result_element_type_fn = [&](int index) { + return op->getResult(index).getType().cast().getElementType(); + }; + + llvm::SmallVector inferred_return_shapes; + if (failed(InferReturnTypeComponentsForTFOp( + /*location=*/None, op, graph_version_, operand_as_constant_fn, + op_result_as_shape_fn, result_element_type_fn, + inferred_return_shapes))) return false; - } - - // Determine if, during shape computation, the shape functions attempted to - // query an input operand as shape where the input was not known/constant. 
- bool requires_inputs = - any_of(llvm::seq(0, c.num_inputs()), [&](int input) { - return c.requested_input_tensor_as_partial_shape(input) && - !input_tensors[input]; - }); - if (requires_inputs) { - LLVM_DEBUG(llvm::dbgs() << "\trequired input\n"); - std::vector input_tensors_as_shapes; - for (int input : llvm::seq(0, c.num_inputs())) { - if (c.requested_input_tensor_as_partial_shape(input) && - !input_tensors[input]) { - LLVM_DEBUG(llvm::dbgs() << "Requesting " << input << " as shape\n"); - auto op_result = op->getOperand(input).dyn_cast(); - if (!op_result) continue; - // Resize on first valid shape computed. - input_tensors_as_shapes.resize(c.num_inputs()); - auto handle = ComputeOutputAsShape(op_result, &c); - LLVM_DEBUG(llvm::dbgs() << "Requested " << input << " as shape " - << (handle.Handle() ? "found" : "not found")); - if (handle.Handle()) input_tensors_as_shapes[input] = handle; - } - } - - // Attempt to compute the unknown operands as shapes. - // Note: in the case where no partial outputs could be computed, this would - // be empty. - if (!input_tensors_as_shapes.empty()) { - c.set_input_tensors_as_shapes(input_tensors_as_shapes); - auto status = c.Run(op_reg_data->shape_inference_fn); - if (!status.ok()) { - LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << *op - << "': " << status.error_message() << "\n"); - return false; - } - } - } - - assert(c.num_outputs() == op->getNumResults() && - "inference context matches the MLIR number of results."); // Update the shape for each of the operation result if the InferenceContext // has more precise shapes recorded. bool changed = false; - for (int output : llvm::seq(0, c.num_outputs())) { - // Skip already statically shaped results. - Value result = op->getResult(output); - if (!CanBeRefined(result.getType())) continue; - auto shaped_type = result.getType().cast(); + for (auto result : llvm::zip(op->getResults(), inferred_return_shapes)) { + Value op_result = std::get<0>(result); + if (!CanBeRefined(op_result.getType())) continue; - ShapeHandle shape_handle = c.output(output); - LLVM_DEBUG(llvm::dbgs() << "Inferred output " << output << " : " - << c.DebugString(shape_handle) << "\n"); - auto get_tensor_type = [&c](const ShapeHandle& sh, - Type element_type) -> TensorType { - if (!c.RankKnown(sh)) return UnrankedTensorType::get(element_type); - // Convert the shape from TensorFlow (int64) to MLIR (int64_t). - SmallVector shape; - for (int dim : llvm::seq(0, c.Rank(sh))) - shape.push_back(c.Value(c.Dim(sh, dim))); - return RankedTensorType::get(shape, element_type); - }; - auto new_element_type = shaped_type.getElementType(); - // Populate the handle shapes for a resource/variant. 
- if (new_element_type.isa()) { - auto handle_shapes_types = c.output_handle_shapes_and_types(output); - if (handle_shapes_types) { - SmallVector subtypes; - OpBuilder b(op); - for (const auto& shape_n_type : *handle_shapes_types) { - Type element_type; - auto status = - tensorflow::ConvertDataType(shape_n_type.dtype, b, &element_type); - assert(status.ok() && "Unknown element type"); - subtypes.push_back(get_tensor_type(shape_n_type.shape, element_type)); - } - if (new_element_type.isa()) { - new_element_type = TF::ResourceType::get(subtypes, op->getContext()); - } else { - new_element_type = TF::VariantType::get(subtypes, op->getContext()); - } - } - } - auto new_type = get_tensor_type(shape_handle, new_element_type); - if (result.getType() == new_type) continue; + ShapedTypeComponents inferred = std::get<1>(result); + TensorType inferred_type; + if (inferred.hasRank()) + inferred_type = + RankedTensorType::get(inferred.getDims(), inferred.getElementType()); + else + inferred_type = UnrankedTensorType::get(inferred.getElementType()); - UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, new_type, op, result); + if (op_result.getType() == inferred_type) continue; + UpdateTypeAndInsertIncompatibleUseCasts(tf_dialect_, inferred_type, op, + op_result); changed = true; } + if (changed) LLVM_DEBUG(llvm::dbgs() << "Modified after shape inference: '" << *op << "'\n"); @@ -980,7 +805,6 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, ArrayRef functions, int64_t max_iteration) { bool all_succeeded = true; - auto types = llvm::to_vector<4>(input_types); // If shape propagation fails for one function, return failure, but do not // early exit and attempt to propagate shapes for all provided functions to // have a best-effort propagation. @@ -997,8 +821,8 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( } FunctionType func_type = func.getType(); - func.setType( - FunctionType::get(types, func_type.getResults(), func.getContext())); + func.setType(FunctionType::get(input_types, func_type.getResults(), + func.getContext())); auto res = PropagateShapeToRegions(input_types, {&func.getBody()}, max_iteration); @@ -1009,7 +833,7 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( auto new_return_types = InferShapeForFunctionReturnType(func); if (new_return_types) - func.setType(FunctionType::get(types, new_return_types.getValue(), + func.setType(FunctionType::get(input_types, new_return_types.getValue(), func.getContext())); } return success(all_succeeded); @@ -1019,16 +843,17 @@ LogicalResult ShapeInference::PropagateShapeToRegions( Operation::operand_type_range input_types, ArrayRef regions, int64_t max_iteration) { bool all_succeeded = true; - auto types = llvm::to_vector<4>(input_types); // If shape propagation fails for one region, return failure, but do not // early exit and attempt to propagate shapes for all provided regions to // have a best-effort propagation. for (auto region : regions) { // Refine region arguments. Block& entry = region->front(); - assert(types.size() == entry.getNumArguments()); - for (auto arg_and_idx : llvm::enumerate(entry.getArguments())) { - arg_and_idx.value().setType(types[arg_and_idx.index()]); + assert(llvm::size(input_types) == entry.getNumArguments()); + for (auto it : llvm::zip(entry.getArguments(), input_types)) { + BlockArgument arg = std::get<0>(it); + Type type = std::get<1>(it); + arg.setType(type); } // Propagate shapes into the region. 
@@ -1099,20 +924,17 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( module, drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_func(), if_op.else_func()}, max_iteration); + {if_op.then_function(), if_op.else_function()}, max_iteration); } else if (auto case_op = dyn_cast(op)) { SmallVector branches; - for (Attribute branch : case_op.branches()) { - auto sym = branch.cast(); - branches.push_back(SymbolTable::lookupNearestSymbolFrom(op, sym)); - } + case_op.get_branch_functions(branches); return PropagateShapeToFunctions(module, drop_begin(case_op.getOperandTypes(), 1), branches, max_iteration); } else if (auto while_op = dyn_cast(op)) { return PropagateShapeToFunctions( module, while_op.getOperandTypes(), - {while_op.cond_func(), while_op.body_func()}, max_iteration); + {while_op.cond_function(), while_op.body_function()}, max_iteration); } else if (auto call_op = dyn_cast(op)) { if (auto func = dyn_cast(call_op.resolveCallable())) { PropagateConstantToCallee(call_op, func, module); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc index d3755a4a7d0..05eef4d5045 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/stack_ops_decomposition.cc @@ -139,8 +139,7 @@ void ModifyFunctionSignature( handle_new_size_vars(func.getArguments().drop_front(original_arg_count)); } func.setType(FunctionType::get( - new_input_types, - llvm::to_vector<8>(func.front().getTerminator()->getOperandTypes()), + new_input_types, func.front().getTerminator()->getOperandTypes(), func.getContext())); } @@ -163,7 +162,7 @@ LogicalResult HandleWhileOp( const llvm::SmallDenseMap& data_var_to_size_var, llvm::StringMap* decomposed_partitioned_call_callees) { - auto body = while_op.body_func(); + auto body = while_op.body_function(); llvm::SmallDenseMap body_map; auto find_arg_stack_type = [&](int64_t index) -> llvm::Optional { auto it = data_var_to_size_var.find(while_op.getOperand(index)); @@ -187,7 +186,7 @@ LogicalResult HandleWhileOp( return failure(); } // Cond should not change stacks in the arguments, so use an empty map. 
- auto cond = while_op.cond_func(); + auto cond = while_op.cond_function(); ModifyFunctionSignature(cond, nullptr, find_arg_stack_type); llvm::SmallDenseMap empty_map; if (failed(DecomposeStackOpsInternal(&cond.front(), module, &empty_map, @@ -231,8 +230,8 @@ LogicalResult HandleIfOp( const llvm::SmallDenseMap& data_var_to_size_var, llvm::StringMap* decomposed_partitioned_call_callees) { - auto then_func = if_op.then_func(); - auto else_func = if_op.else_func(); + auto then_func = if_op.then_function(); + auto else_func = if_op.else_function(); llvm::SmallDenseMap then_map; llvm::SmallDenseMap else_map; @@ -465,6 +464,38 @@ LogicalResult HandleStackPopV2Op( return success(); } +LogicalResult HandleRegionControlFlowOps( + Operation& op, ModuleOp module, + llvm::SmallDenseMap* data_var_to_size_var, + llvm::StringMap* + decomposed_partitioned_call_callees) { + for (OpOperand& operand : op.getOpOperands()) { + if (getElementTypeOrSelf(operand.get().getType()).isa()) { + return op.emitOpError() + << "found unexpected type " << operand.get().getType() + << " of operand #" << operand.getOperandNumber() + << ", resource type operands are expected to have been " + "canonicalized away for region based control flow ops"; + } + } + for (OpResult result : op.getResults()) { + if (getElementTypeOrSelf(result.getType()).isa()) { + return op.emitOpError() + << "found unexpected type " << result.getType() << " of result #" + << result.getResultNumber() + << ", resource type results are expected to have been " + "canonicalized away for region based control flow ops"; + } + } + for (Region& region : op.getRegions()) { + if (failed(DecomposeStackOpsInternal(®ion.front(), module, + data_var_to_size_var, + decomposed_partitioned_call_callees))) + return failure(); + } + return success(); +} + // Decomposes stack ops on a region and recursively decomposes called functions. // data_var_to_size_var: a mapping from stacks' buffer local variables to size // local variables. @@ -506,6 +537,13 @@ LogicalResult DecomposeStackOpsInternal( decomposed_partitioned_call_callees))) { return failure(); } + } else if (llvm::isa(op) || + llvm::isa(op) || + llvm::isa(op)) { + if (failed( + HandleRegionControlFlowOps(op, module, data_var_to_size_var, + decomposed_partitioned_call_callees))) + return failure(); } else if (auto pcall = llvm::dyn_cast(&op)) { if (!pcall.func()) { return pcall.emitOpError( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc index b3a05c06a67..680d5334ceb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_array_ops_decomposition.cc @@ -132,6 +132,15 @@ llvm::Optional> GetTensorArrayElementShape( return llvm::None; } return t; + } else if (auto scatter = + llvm::dyn_cast(user)) { + // TensorArrayScatter writes vector of tensors to TensorArray. We can + // deduce the shape of TensorArray by dropping the 0th dim of + // TensorArrayScatter `value`. 
+ auto t = scatter.value().getType().dyn_cast(); + if (!t || t.getShape().empty()) return llvm::None; + return RankedTensorType::get(t.getShape().drop_front(), + t.getElementType()); } return llvm::None; }); @@ -139,6 +148,26 @@ llvm::Optional> GetTensorArrayElementShape( return llvm::to_vector<8>(elem_type->getShape()); } +void ReplaceAllUsesWithCast(Value old_val, Value new_val) { + if (old_val.use_empty()) return; + auto cast_op = + OpBuilder(old_val.getDefiningOp()) + .create(old_val.getLoc(), old_val.getType(), new_val); + old_val.replaceAllUsesWith(cast_op); +} + +void ReplaceAllUsesExceptTerminator(Value old_val, Value new_val) { + if (old_val.getType() == new_val.getType()) { + old_val.replaceAllUsesWith(new_val); + return; + } + Operation* old_op = old_val.getDefiningOp(); + Operation* terminator_op = + old_op->getParentOfType().front().getTerminator(); + llvm::SmallPtrSet exceptions = {terminator_op}; + old_val.replaceAllUsesExcept(new_val, exceptions); +} + struct TensorArrayStats { // Whether a write op should accumulate with the old value. Set to true if // this is a gradient. @@ -195,7 +224,8 @@ LogicalResult HandleTensorArrayReadV3Op( auto index_reshape = cutil::ReshapeScalarToSizeType(builder, read.index(), read.getLoc()); auto elem = cutil::GetElement(index_reshape, buffer, builder, read.getLoc()); - read.value().replaceAllUsesWith(elem); + ReplaceAllUsesExceptTerminator(read.value(), elem); + ReplaceAllUsesWithCast(read.value(), elem); read.erase(); // The clear_after_read attribute does not mean setting the tensor to 0 after // read; instead it does not allow a second read before the next write. We @@ -260,7 +290,8 @@ LogicalResult HandleTensorArrayConcatV3Op( RankedTensorType::get(shape, buffer_type.getElementType())}, ArrayRef{buffer, cutil::GetR1Const(shape, builder, concat.getLoc())}); - concat.value().replaceAllUsesWith(buffer); + ReplaceAllUsesExceptTerminator(concat.value(), buffer); + ReplaceAllUsesWithCast(concat.value(), buffer); // Create the lengths as a list of the same value (element size). 
tensorflow::Tensor lengths_tensor(tensorflow::DT_INT64, @@ -389,7 +420,8 @@ LogicalResult HandleTensorArrayGatherV3Op( auto buffer = cutil::ReadLocalVariable(local_var, builder, gather.getLoc()); auto result = cutil::GatherElements(gather.indices(), buffer, builder, gather.getLoc()); - gather.value().replaceAllUsesWith(result); + ReplaceAllUsesExceptTerminator(gather.value(), result); + ReplaceAllUsesWithCast(gather.value(), result); gather.erase(); return success(); } @@ -443,12 +475,12 @@ llvm::SmallDenseMap> AccessedGradients( insert(grad.handle(), grad.source().str()); } else if (auto while_op = llvm::dyn_cast(&op)) { for (const auto& entry : AccessedGradients( - {while_op.body_func(), while_op.cond_func()}, module)) + {while_op.body_function(), while_op.cond_function()}, module)) for (const string& source : entry.getSecond()) insert(while_op.getOperand(entry.getFirst()), source); } else if (auto if_op = llvm::dyn_cast(&op)) { - for (const auto& entry : - AccessedGradients({if_op.then_func(), if_op.else_func()}, module)) + for (const auto& entry : AccessedGradients( + {if_op.then_function(), if_op.else_function()}, module)) for (const string& source : entry.getSecond()) insert(if_op.getOperand(entry.getFirst() + 1), source); } else if (auto call = llvm::dyn_cast(&op)) { @@ -509,8 +541,8 @@ LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, llvm::SmallDenseMap* stats, llvm::StringMap* decomposed_partitioned_call_callees) { - auto body = while_op.body_func(); - auto cond = while_op.cond_func(); + auto body = while_op.body_function(); + auto cond = while_op.cond_function(); auto grads = AccessedGradients({body, cond}, module); auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(while_op.getOperand(index)); @@ -570,6 +602,7 @@ LogicalResult HandleWhileOp(TF::WhileOp while_op, ModuleOp module, } stat.grads[source] = grad_var; operands.push_back(grad_var); + (*stats)[grad_var].accumulate_on_write = true; } } } @@ -592,8 +625,8 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, llvm::SmallDenseMap* stats, llvm::StringMap* decomposed_partitioned_call_callees) { - auto then_branch = if_op.then_func(); - auto else_branch = if_op.else_func(); + auto then_branch = if_op.then_function(); + auto else_branch = if_op.else_function(); auto grads = AccessedGradients({then_branch, else_branch}, module); auto ta_arg_buffer_type = [&](int64_t index) -> Type { auto it = stats->find(if_op.getOperand(index + 1)); @@ -636,6 +669,7 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, } stat.grads[source] = grad_var; operands.push_back(grad_var); + (*stats)[grad_var].accumulate_on_write = true; } } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 9634e4a8be3..f7c0357a212 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -73,8 +73,7 @@ void UpdateFuncType(FuncOp func) { llvm::SmallVector arg_types; for (auto arg : func.getArguments()) arg_types.push_back(arg.getType()); func.setType(FunctionType::get( - arg_types, - llvm::to_vector<8>(func.front().getTerminator()->getOperandTypes()), + arg_types, func.front().getTerminator()->getOperandTypes(), func.getContext())); } @@ -125,26 +124,39 @@ LogicalResult DecomposeTensorListOpsInternal( Block*, ModuleOp, llvm::SmallDenseMap*, llvm::StringMap*); 
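The ReplaceAllUsesExceptTerminator / ReplaceAllUsesWithCast pair used above swaps the decomposed buffer in everywhere except the enclosing function's terminator, which keeps seeing the original type through an inserted cast, presumably so the function's return signature stays consistent until later passes update it. The following is a simplified, self-contained analog of that two-step rewrite over a toy use list; every name in it is made up for illustration and none of it is the MLIR API.

#include <string>
#include <vector>

// Toy use list: each entry says which op reads which value.
struct UseSketch {
  std::string owner;  // consuming op
  std::string value;  // value currently consumed
};

// Step 1: point every non-terminator user at the new value.
// Step 2: if the terminator still uses the old value, route it through a
// freshly created cast back to the old type instead of retyping it directly.
void ReplaceUsesKeepingTerminatorType(std::vector<UseSketch>& uses,
                                      const std::string& old_value,
                                      const std::string& new_value,
                                      const std::string& terminator,
                                      std::vector<std::string>& created_casts) {
  bool terminator_uses_old = false;
  for (UseSketch& use : uses) {
    if (use.value != old_value) continue;
    if (use.owner == terminator) {
      terminator_uses_old = true;
      continue;
    }
    use.value = new_value;
  }
  if (!terminator_uses_old) return;
  created_casts.push_back("cast_to_old_type(" + new_value + ")");
  for (UseSketch& use : uses)
    if (use.owner == terminator && use.value == old_value)
      use.value = created_casts.back();
}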
+// Adds the corresponding sizes of tensor list buffers in block's terminator +// to the list of return values. Returns the mapping from the buffer +// indices to the added size indices, which is a list of tuples +// (buffer_return_index, size_return_index, fixed_size). +template +llvm::SmallVector, 8> +AddTensorListSizesToTerminator( + Block& block, const llvm::SmallDenseMap& buffer_to_size) { + auto old_terminator = block.getTerminator(); + auto new_outputs = llvm::to_vector<8>(old_terminator->getOperands()); + llvm::SmallVector, 8> + output_buffer_to_size; + for (auto retval : llvm::enumerate(old_terminator->getOperands())) { + auto it = buffer_to_size.find(retval.value()); + if (it == buffer_to_size.end()) continue; + output_buffer_to_size.emplace_back(retval.index(), new_outputs.size(), + it->getSecond().fixed); + new_outputs.push_back(it->getSecond().size); + } + OpBuilder(old_terminator) + .create(old_terminator->getLoc(), new_outputs); + old_terminator->erase(); + return output_buffer_to_size; +} + // Adds the corresponding sizes of tensor list buffers in func's return values // to the list of return values. Returns the mapping from the buffer indices to // the added size indices, which is a list of tuples (buffer_return_index, // size_return_index, fixed_size). -llvm::SmallVector, 8> -AddTensorListSizesToReturn( +llvm::SmallVector, 8> ModifyFunctionReturn( FuncOp func, const llvm::SmallDenseMap& buffer_to_size) { - auto old_return = func.front().getTerminator(); - auto new_returns = llvm::to_vector<8>(old_return->getOperands()); - llvm::SmallVector, 8> - output_buffer_to_size; - for (auto retval : llvm::enumerate(old_return->getOperands())) { - auto it = buffer_to_size.find(retval.value()); - if (it == buffer_to_size.end()) continue; - output_buffer_to_size.emplace_back(retval.index(), new_returns.size(), - it->getSecond().fixed); - new_returns.push_back(it->getSecond().size); - } - OpBuilder(old_return).create(old_return->getLoc(), new_returns); - old_return->erase(); + auto output_buffer_to_size = + AddTensorListSizesToTerminator(func.front(), buffer_to_size); UpdateFuncType(func); return output_buffer_to_size; } @@ -155,7 +167,7 @@ LogicalResult HandleWhileOp( llvm::StringMap* decomposed_partitioned_call_callees) { // Rewrite body. - auto body = while_op.body_func(); + auto body = while_op.body_function(); llvm::SmallDenseMap body_map; auto find_arg_tensor_list_type = [&](int64_t index) -> llvm::Optional { auto it = buffer_to_size->find(while_op.getOperand(index)); @@ -173,10 +185,10 @@ LogicalResult HandleWhileOp( decomposed_partitioned_call_callees))) { return failure(); } - auto output_buffer_to_size = AddTensorListSizesToReturn(body, body_map); + auto output_buffer_to_size = ModifyFunctionReturn(body, body_map); // Rewrite cond. 
- auto cond = while_op.cond_func(); + auto cond = while_op.cond_function(); llvm::SmallDenseMap cond_map; ModifyFunctionSignature(cond, cutil::GetSizeType(builder), &cond_map, find_arg_tensor_list_type, arg_buffer_size_is_fixed); @@ -241,9 +253,9 @@ LogicalResult HandleCaseOrIfOp( const bool arg_no_changed = branch_maps.front().empty(); auto output_buffer_to_size = - AddTensorListSizesToReturn(branches.front(), branch_maps.front()); + ModifyFunctionReturn(branches.front(), branch_maps.front()); for (const auto& pair : llvm::drop_begin(llvm::zip(branches, branch_maps), 1)) - AddTensorListSizesToReturn(std::get<0>(pair), std::get<1>(pair)); + ModifyFunctionReturn(std::get<0>(pair), std::get<1>(pair)); if (output_buffer_to_size.empty() && arg_no_changed) return success(); @@ -267,6 +279,158 @@ LogicalResult HandleCaseOrIfOp( return success(); } +LogicalResult HandleWhileRegionOp( + TF::WhileRegionOp while_op, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::StringMap* + decomposed_partitioned_call_callees) { + OpBuilder builder(while_op); + auto modify_region_arguments = [&](Region& region) { + int64_t original_arg_count = region.getNumArguments(); + for (int64_t i = 0; i < original_arg_count; ++i) { + auto operand = while_op.getOperand(i); + auto it = buffer_to_size->find(operand); + if (it == buffer_to_size->end()) continue; + auto buffer_type = it->getFirst().getType(); + region.getArgument(i).setType(buffer_type); + auto size_arg = region.addArgument(cutil::GetSizeType(builder)); + (*buffer_to_size)[region.getArgument(i)] = {size_arg, + it->getSecond().fixed}; + } + }; + + // Rewrite body. + Region& body_region = while_op.body(); + modify_region_arguments(body_region); + if (failed(DecomposeTensorListOpsInternal( + &body_region.front(), module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + auto output_buffer_to_size = AddTensorListSizesToTerminator( + body_region.front(), *buffer_to_size); + + // Rewrite cond. + Region& cond_region = while_op.cond(); + modify_region_arguments(cond_region); + if (failed(DecomposeTensorListOpsInternal( + &cond_region.front(), module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + + if (output_buffer_to_size.empty()) return success(); + + // Create the new while op. + auto new_while_operands = llvm::to_vector<8>(while_op.getOperands()); + for (int64_t i = 0; i < while_op.getNumResults(); ++i) { + auto it = buffer_to_size->find(while_op.getOperand(i)); + if (it == buffer_to_size->end()) continue; + new_while_operands.push_back(it->getSecond().size); + } + auto new_while = builder.create( + while_op.getLoc(), body_region.front().getTerminator()->getOperandTypes(), + new_while_operands, while_op.getAttrs()); + new_while.body().takeBody(body_region); + new_while.cond().takeBody(cond_region); + for (const auto& entry : output_buffer_to_size) { + (*buffer_to_size)[new_while.getResult(std::get<0>(entry))] = { + new_while.getResult(std::get<1>(entry)), std::get<2>(entry)}; + } + while_op.replaceAllUsesWith( + new_while.getResults().take_front(while_op.getNumResults())); + while_op.erase(); + return success(); +} + +LogicalResult HandleIfRegionOp( + TF::IfRegionOp if_op, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::StringMap* + decomposed_partitioned_call_callees) { + // Rewrite the branches. 
+ Region& then_branch = if_op.then_branch(); + Region& else_branch = if_op.else_branch(); + if (failed(DecomposeTensorListOpsInternal( + &then_branch.front(), module, buffer_to_size, + decomposed_partitioned_call_callees))) + return failure(); + if (failed(DecomposeTensorListOpsInternal( + &else_branch.front(), module, buffer_to_size, + decomposed_partitioned_call_callees))) + return failure(); + + auto output_buffer_to_size = AddTensorListSizesToTerminator( + then_branch.front(), *buffer_to_size); + AddTensorListSizesToTerminator(else_branch.front(), + *buffer_to_size); + + if (output_buffer_to_size.empty()) return success(); + + // Recreate the op. + auto new_op = OpBuilder(if_op).create( + if_op.getLoc(), then_branch.front().getTerminator()->getOperandTypes(), + if_op.getOperand(), if_op.getAttrs()); + for (const auto& entry : output_buffer_to_size) { + (*buffer_to_size)[new_op.getResult(std::get<0>(entry))] = { + new_op.getResult(std::get<1>(entry)), std::get<2>(entry)}; + } + + new_op.then_branch().takeBody(if_op.then_branch()); + new_op.else_branch().takeBody(if_op.else_branch()); + + if_op.replaceAllUsesWith( + new_op.getResults().take_front(if_op.getNumResults())); + if_op.erase(); + return success(); +} + +LogicalResult HandleCaseRegionOp( + TF::CaseRegionOp case_op, ModuleOp module, + llvm::SmallDenseMap* buffer_to_size, + llvm::StringMap* + decomposed_partitioned_call_callees) { + // Rewrite the branches. + RegionRange branches = case_op.getRegions(); + + for (Region* branch : branches) { + if (failed(DecomposeTensorListOpsInternal( + &branch->front(), module, buffer_to_size, + decomposed_partitioned_call_callees))) + return failure(); + } + + // Get the output buffer index to size index mapping one of the branches. It + // should be same for all the branches so we only get it for the first branch. + Region* first_branch = branches.front(); + auto output_buffer_to_size = AddTensorListSizesToTerminator( + first_branch->front(), *buffer_to_size); + for (Region* branch : branches.drop_front()) { + AddTensorListSizesToTerminator(branch->front(), + *buffer_to_size); + } + + if (output_buffer_to_size.empty()) return success(); + + // Recreate the op. + auto new_op = OpBuilder(case_op).create( + case_op.getLoc(), + first_branch->front().getTerminator()->getOperandTypes(), + case_op.getOperand(), case_op.getAttrs(), case_op.getNumRegions()); + for (const auto& entry : output_buffer_to_size) { + (*buffer_to_size)[new_op.getResult(std::get<0>(entry))] = { + new_op.getResult(std::get<1>(entry)), std::get<2>(entry)}; + } + + for (auto pair : llvm::zip(new_op.getRegions(), case_op.getRegions())) { + std::get<0>(pair)->takeBody(*std::get<1>(pair)); + } + case_op.replaceAllUsesWith( + new_op.getResults().take_front(case_op.getNumResults())); + case_op.erase(); + return success(); +} + template LogicalResult HandlePartitionedCallOp( CallOp call, FuncOp callee, ModuleOp module, @@ -337,7 +501,7 @@ LogicalResult HandlePartitionedCallOp( return failure(); } info.buffer_ret_to_size_ret = - AddTensorListSizesToReturn(lowered_callee, callee_map); + ModifyFunctionReturn(lowered_callee, callee_map); info.decomposed_callee = lowered_callee; if (args_no_changed && info.buffer_ret_to_size_ret.empty()) { // Signature is not modified. We do not need to keep two copies. 
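ModifyFunctionReturn and the AddTensorListSizesToTerminator template above return (buffer_return_index, size_return_index, fixed) tuples, which the callers then use to associate each extra size result of the recreated While/If/Case region op with its buffer result. Here is a small self-contained sketch of that bookkeeping, with strings standing in for terminator operands; the names are illustrative only.

#include <cstdint>
#include <cstdio>
#include <string>
#include <tuple>
#include <unordered_map>
#include <vector>

// A tracked tensor-list buffer: its size value and whether the size is fixed.
struct SizeInfo {
  std::string size;
  bool fixed = false;
};

// For every returned value that is a known buffer, append its size to the
// return list and record (buffer_return_index, size_return_index, fixed).
std::vector<std::tuple<int64_t, int64_t, bool>> AppendBufferSizes(
    std::vector<std::string>& returns,
    const std::unordered_map<std::string, SizeInfo>& buffer_to_size) {
  std::vector<std::tuple<int64_t, int64_t, bool>> mapping;
  const size_t original_count = returns.size();
  for (size_t i = 0; i < original_count; ++i) {
    auto it = buffer_to_size.find(returns[i]);
    if (it == buffer_to_size.end()) continue;
    mapping.emplace_back(static_cast<int64_t>(i),
                         static_cast<int64_t>(returns.size()),
                         it->second.fixed);
    returns.push_back(it->second.size);
  }
  return mapping;
}

int main() {
  std::vector<std::string> returns = {"%buffer", "%flag"};
  std::unordered_map<std::string, SizeInfo> buffer_to_size = {
      {"%buffer", {"%buffer_size", /*fixed=*/true}}};
  auto mapping = AppendBufferSizes(returns, buffer_to_size);
  for (const auto& [buf_idx, size_idx, fixed] : mapping)
    std::printf("result #%lld carries its size at result #%lld (fixed=%d)\n",
                static_cast<long long>(buf_idx),
                static_cast<long long>(size_idx), static_cast<int>(fixed));
  return 0;
}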
@@ -701,17 +865,14 @@ LogicalResult DecomposeTensorListOpsInternal( return failure(); } } else if (auto if_op = llvm::dyn_cast(&op)) { - if (failed(HandleCaseOrIfOp(if_op, {if_op.then_func(), if_op.else_func()}, - module, buffer_to_size, - decomposed_partitioned_call_callees))) { + if (failed(HandleCaseOrIfOp( + if_op, {if_op.then_function(), if_op.else_function()}, module, + buffer_to_size, decomposed_partitioned_call_callees))) { return failure(); } } else if (auto case_op = llvm::dyn_cast(&op)) { SmallVector branches; - for (auto branch_symbol : case_op.branches()) { - branches.push_back(module.lookupSymbol( - branch_symbol.cast())); - } + case_op.get_branch_functions(branches); if (failed(HandleCaseOrIfOp(case_op, branches, module, buffer_to_size, decomposed_partitioned_call_callees))) { return failure(); @@ -734,6 +895,21 @@ LogicalResult DecomposeTensorListOpsInternal( decomposed_partitioned_call_callees))) { return failure(); } + } else if (auto while_op = llvm::dyn_cast(&op)) { + if (failed(HandleWhileRegionOp(while_op, module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + } else if (auto if_op = llvm::dyn_cast(&op)) { + if (failed(HandleIfRegionOp(if_op, module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } + } else if (auto case_op = llvm::dyn_cast(&op)) { + if (failed(HandleCaseRegionOp(case_op, module, buffer_to_size, + decomposed_partitioned_call_callees))) { + return failure(); + } } } return success(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc index 786c4b74b34..f2321df9823 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_data_optimization.cc @@ -58,7 +58,7 @@ struct FuseParallelMapAndBatch : public OpRewritePattern { void PopulateTFDataOptimizationPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { patterns->insert(context); - populateWithGenerated(context, patterns); + populateWithGenerated(context, *patterns); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc index 1e4caaf5dd6..52ac87ecf71 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_graph_optimization_pass.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "mlir/IR/Identifier.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" @@ -43,6 +44,10 @@ namespace tensorflow { class GraphOptPass : public mlir::PassWrapper> { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + mlir::RegisterAllTensorFlowDialects(registry); + } + public: explicit GraphOptPass(std::vector passes) : passes_(std::move(passes)) {} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_cleanup_attributes.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_cleanup_attributes.cc new file mode 100644 index 00000000000..93098acdc9d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_cleanup_attributes.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +// This pass eliminate `_tpu_replicate` and `device` attribute on operations +// that are contained in a tf_device.cluster op. + +namespace mlir { +namespace TFTPU { + +namespace { + +constexpr char kTPUReplicateAttr[] = "_tpu_replicate"; +constexpr char kDeviceAttr[] = "device"; + +class TPUCleanupClusterAttributesPass + : public PassWrapper> { + public: + void runOnOperation() override { + getOperation().walk([](tf_device::ClusterOp cluster) { + cluster.walk([](Operation *op) { + if (isa(op)) return; + for (StringRef attr : {kTPUReplicateAttr, kDeviceAttr}) + op->removeAttr(attr); + }); + }); + } +}; + +PassRegistration pass( + "tf-tpu-cleanup-cluster-attributes", + "Eliminate _tpu_replicate and other attributes from ops in a cluster"); + +} // namespace + +std::unique_ptr> +CreateTPUClusterCleanupAttributesPass() { + return std::make_unique(); +} + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc index f5bdd08d980..46bc094e5ed 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc @@ -71,13 +71,19 @@ constexpr char kBadTPUReplicateAttrMsg[] = using MetadataMap = llvm::SmallDenseMap; +// A set of operations in a cluster. 
+using ClusterOps = llvm::SmallSetVector; + // Mapping for `_tpu_replicate` attribute to ops of a cluster. -using ClusterMap = llvm::SmallDenseMap, 8>; +using ClusterMap = llvm::SmallDenseMap; struct TPUClusterFormation : public TF::PerFunctionAggregateAnalysisConsumerPass< TPUClusterFormation, TF::ResourceAliasAnalysis> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + void runOnFunction( FuncOp func, const TF::ResourceAliasAnalysis::Info& resource_alias_analysis); @@ -87,42 +93,40 @@ struct TPUClusterFormation // attribute to its attributes and removes the ops. If multiple // TPUReplicateMetadata ops have the same `_tpu_replicate` attribute, an error // will be returned. -LogicalResult CollectMetadata(Operation* op, MetadataMap* metadata_map) { - auto result = - op->walk([&](TF::TPUReplicateMetadataOp metadata_op) -> WalkResult { - MutableDictionaryAttr attrs = metadata_op.getAttrs(); +LogicalResult CollectMetadata(Block* block, MetadataMap* metadata_map) { + // Just look at top-level operations in the block (not nested ones) + for (Operation& op : llvm::make_early_inc_range(*block)) { + auto metadata_op = dyn_cast(op); + if (!metadata_op) continue; - // Missing or bad `_tpu_replicate` attribute. - auto tpu_replicate_attr = attrs.get(kTPUReplicateAttr); - if (!tpu_replicate_attr) - return metadata_op.emitError() << kBadTPUReplicateAttrMsg; + MutableDictionaryAttr attrs = metadata_op.getAttrs(); - auto tpu_replicate_attr_str = tpu_replicate_attr.dyn_cast(); - if (!tpu_replicate_attr_str || - tpu_replicate_attr_str.getValue().empty()) - return metadata_op.emitError() << kBadTPUReplicateAttrMsg; + // Missing or bad `_tpu_replicate` attribute. + auto tpu_replicate_attr = attrs.get(kTPUReplicateAttr); + if (!tpu_replicate_attr) + return metadata_op.emitError() << kBadTPUReplicateAttrMsg; - // Remove `name` attribute. - attrs.remove(Identifier::get(kNameAttr, metadata_op.getContext())); + auto tpu_replicate_attr_str = tpu_replicate_attr.dyn_cast(); + if (!tpu_replicate_attr_str || tpu_replicate_attr_str.getValue().empty()) + return metadata_op.emitError() << kBadTPUReplicateAttrMsg; - auto it = metadata_map->try_emplace(tpu_replicate_attr_str.getValue(), - std::move(attrs)); + // Remove `name` attribute. + attrs.remove(Identifier::get(kNameAttr, metadata_op.getContext())); - // There are multiple TPUReplicateMetadata ops with the same - // `_tpu_replicate` attribute. - if (!it.second) { - return metadata_op.emitError() - << "multiple TPUReplicateMetadata ops with the same '" - << kTPUReplicateAttr << "' attribute '" - << tpu_replicate_attr_str.getValue() << "' found"; - } + auto it = metadata_map->try_emplace(tpu_replicate_attr_str.getValue(), + std::move(attrs)); - metadata_op.erase(); - return WalkResult::advance(); - }); - - // Return failure if the walk was interrupted. - return failure(result.wasInterrupted()); + // There are multiple TPUReplicateMetadata ops with the same + // `_tpu_replicate` attribute. + if (!it.second) { + return metadata_op.emitError() + << "multiple TPUReplicateMetadata ops with the same '" + << kTPUReplicateAttr << "' attribute '" + << tpu_replicate_attr_str.getValue() << "' found"; + } + metadata_op.erase(); + } + return success(); } // Collects and clusters ops with the same `_tpu_replicate` attribute. 
This will @@ -150,12 +154,12 @@ void CollectResourceIdsFromOp( op.walk([&](Operation* inner_op) { for (Value operand : TF::filter_resources(inner_op->getOperands())) { if (resource_alias_analysis.IsUnknownResource(operand)) continue; - auto ids = resource_alias_analysis.GetResourceUniqueIds(operand); + const auto& ids = resource_alias_analysis.GetResourceUniqueIds(operand); observed_resource_ids.insert(ids.begin(), ids.end()); } for (Value result : TF::filter_resources(inner_op->getResults())) { if (resource_alias_analysis.IsUnknownResource(result)) continue; - auto ids = resource_alias_analysis.GetResourceUniqueIds(result); + const auto& ids = resource_alias_analysis.GetResourceUniqueIds(result); observed_resource_ids.insert(ids.begin(), ids.end()); } }); @@ -164,13 +168,13 @@ void CollectResourceIdsFromOp( // Checks if an op should be moved after a cluster. There may be users of a // cluster interleaved among the cluster ops. bool ShouldMoveOpAfterCluster( - Block* block, Operation* op, - const llvm::SmallSetVector& cluster_ops, + Block* block, Operation* op, const ClusterOps& cluster_ops, const llvm::SmallSetVector& preceding_users, const TF::ResourceAliasAnalysis::Info& resource_alias_analysis, const llvm::SmallDenseSet& observed_resource_ids) { - auto result = op->walk([&](Operation* op) { - for (Value operand : op->getOperands()) { + const bool is_replicate = llvm::isa(op); + auto result = op->walk([&](Operation* inner_op) { + for (Value operand : inner_op->getOperands()) { Operation* def = operand.getDefiningOp(); // Operands may not have a defining op (BlockArgument) or is from a // different block. @@ -183,8 +187,13 @@ bool ShouldMoveOpAfterCluster( } } + // Don't visit replicate op inner op operands as new resource + // values/arguments may have been created but are not known in + // `resource_alias_analysis`. + if (is_replicate && inner_op != op) return WalkResult::advance(); + // Check for uses of any resource in or after cluster. - for (Value operand : TF::filter_resources(op->getOperands())) { + for (Value operand : TF::filter_resources(inner_op->getOperands())) { if (resource_alias_analysis.IsUnknownResource(operand)) continue; auto ids = resource_alias_analysis.GetResourceUniqueIds(operand); for (const auto& id : ids) @@ -204,13 +213,14 @@ bool ShouldMoveOpAfterCluster( // TODO(lyandy): Extend this to handle all side effecting ops while handling // transitive data dependencies. llvm::SmallSetVector CollectClusterPrecedingUsers( - Block* block, const llvm::SmallSetVector& cluster_ops, + Block* block, const ClusterOps& cluster_ops, const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { llvm::SmallSetVector preceding_users; llvm::SmallDenseSet observed_resource_ids; - for (Operation& op : llvm::make_range(Block::iterator(cluster_ops.front()), - Block::iterator(cluster_ops.back()))) { + auto front = Block::iterator(cluster_ops.front()); + auto back = Block::iterator(cluster_ops.back()); + for (Operation& op : llvm::make_range(front, back)) { if (cluster_ops.contains(&op)) { CollectResourceIdsFromOp(op, resource_alias_analysis, observed_resource_ids); @@ -232,7 +242,7 @@ llvm::SmallSetVector CollectClusterPrecedingUsers( // outside of the cluster (i.e. results of ops in the cluster are only consumed // by other ops in the cluster) are pruned. 
llvm::SmallVector CollectClusterResults( - Block* block, const llvm::SmallSetVector& cluster_ops) { + Block* block, const ClusterOps& cluster_ops) { llvm::SmallVector results; for (Operation* op : cluster_ops) { @@ -251,61 +261,52 @@ llvm::SmallVector CollectClusterResults( } // Creates a `tf_device.cluster` to wrap cluster ops. -tf_device::ClusterOp CreateOpForCluster(Operation* last_cluster_op, - llvm::ArrayRef results) { +tf_device::ClusterOp CreateClusterOp( + Block* block, const ClusterOps& cluster_ops, llvm::ArrayRef results, + llvm::ArrayRef preceding_users) { // `tf_device.cluster` will be placed at where the last op of the cluster is. + Operation* last_cluster_op = cluster_ops.back(); OpBuilder builder(last_cluster_op); llvm::SmallVector result_types; for (Value result : results) result_types.push_back(result.getType()); - auto cluster = builder.create(last_cluster_op->getLoc(), result_types); - cluster.body().push_back(new Block); + Block* body = new Block; + cluster.body().push_back(body); + + // Move cluster ops to the cluster body. Also remove `_tpu_replicate` and + // `device` attribute from ops in the cluster as that information will be + // present in the `tf_device.cluster`. Do this for all ops including nested + // ops. + for (Operation* cluster_op : cluster_ops) { + cluster_op->moveBefore(body, body->end()); + cluster_op->walk([&](Operation* inner_op) { + inner_op->removeAttr(kTPUReplicateAttr); + inner_op->removeAttr(kDeviceAttr); + }); + } // Add terminator. - builder.setInsertionPointToEnd(&cluster.GetBody()); + builder.setInsertionPointToEnd(body); builder.create(last_cluster_op->getLoc(), results); - return cluster; -} - -// Moves cluster ops to associated `tf_device.cluster` body. -void MoveClusterOpsToCluster( - tf_device::ClusterOp cluster, - const llvm::SmallSetVector& cluster_ops) { - MLIRContext* context = cluster.getContext(); - Operation* terminator = cluster.GetBody().getTerminator(); - - for (Operation* cluster_op : cluster_ops) { - // Remove `_tpu_replicate` and `device` attribute from ops in the cluster - // as that information will be present in the `tf_device.cluster`. - cluster_op->removeAttr(Identifier::get(kTPUReplicateAttr, context)); - cluster_op->removeAttr(Identifier::get(kDeviceAttr, context)); - cluster_op->moveBefore(terminator); - } -} - -// Replaces uses of cluster ops results outside of cluster with the associated -// `tf_device.cluster` results. -void UpdateClusterResultExternalUses(tf_device::ClusterOp cluster, - llvm::ArrayRef results) { - Block& cluster_block = cluster.GetBody(); + // Replaces uses of cluster ops results outside of cluster with the associated + // `tf_device.cluster` results. for (auto ret_vals : llvm::zip(results, cluster.getResults())) { Value old_ret = std::get<0>(ret_vals); Value new_ret = std::get<1>(ret_vals); - for (auto& use : llvm::make_early_inc_range(old_ret.getUses())) - if (!cluster_block.findAncestorOpInBlock(*use.getOwner())) - use.set(new_ret); + for (auto& use : llvm::make_early_inc_range(old_ret.getUses())) { + Operation* user = use.getOwner(); + if (!body->findAncestorOpInBlock(*user)) use.set(new_ret); + } } -} -// Moves users of cluster that are before the cluster to after the cluster. -void MovePrecedingClusterUsers(tf_device::ClusterOp cluster, - llvm::ArrayRef preceding_users) { + // Move users of cluster that are before the cluster to after the cluster. 
Operation* op_after_cluster = cluster.getOperation()->getNextNode(); for (Operation* user : preceding_users) user->moveBefore(op_after_cluster); + return cluster; } // Sorts `tf.TPUReplicatedInput` ops by `index` attribute. Ops with an `index` @@ -318,8 +319,7 @@ LogicalResult SortTPUReplicatedInputsByIndex( llvm::SmallVectorImpl* sorted_inputs) { llvm::SmallDenseSet unique_indices; for (Operation* input : inputs) { - int64_t index = - llvm::cast(input).index().getSExtValue(); + int64_t index = llvm::cast(input).index(); if (index < -1) return input->emitOpError() << "requires index to be at least -1, but got " << index; @@ -338,10 +338,8 @@ LogicalResult SortTPUReplicatedInputsByIndex( std::stable_sort( sorted_inputs->begin(), sorted_inputs->end(), [](Operation* l, Operation* r) { - int64_t l_index = - llvm::cast(l).index().getSExtValue(); - int64_t r_index = - llvm::cast(r).index().getSExtValue(); + int64_t l_index = llvm::cast(l).index(); + int64_t r_index = llvm::cast(r).index(); if (l_index == -1 && r_index != -1) return false; if (r_index == -1 && l_index != -1) return true; return l_index < r_index; @@ -385,8 +383,7 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { // Check if number of operands of each used TPUReplicatedInput op matches // `num_replicas` or 1. Collect all their operands and associated type for // creating the replicate op. - llvm::SmallVector, 8> - replicated_inputs; + llvm::SmallVector, 8> replicated_inputs; llvm::SmallVector packed_inputs; for (auto& pos_and_input : llvm::enumerate(replicated_input_ops)) { auto input = pos_and_input.value(); @@ -397,8 +394,7 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { return input->emitOpError() << "requires " << num_inputs << " operands"; auto tpu_replicated_input = llvm::cast(input); - int64_t tpu_replicated_input_index = - tpu_replicated_input.index().getSExtValue(); + int64_t tpu_replicated_input_index = tpu_replicated_input.index(); if (is_packed) { packed_inputs.push_back(input->getOperand(0)); packed_input_indices.push_back(tpu_replicated_input_index); @@ -434,20 +430,24 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { for (auto result_and_idx : llvm::enumerate(cluster.getResults())) { Value result = result_and_idx.value(); int idx = result_and_idx.index(); - for (auto& use : result.getUses()) { - Operation* def = use.getOwner(); - if (!def || !llvm::isa(def)) - return cluster.emitError() - << "requires output of " << cluster.getOperationName() - << " to lead to a 'tf.TPUReplicatedOutput' op"; + auto replicate_outputs = llvm::make_range( + std::next(replicate_op.result_begin(), idx * num_replicas), + std::next(replicate_op.result_begin(), (idx + 1) * num_replicas)); - const int def_NumResults = def->getNumResults(); - if (def_NumResults != num_replicas) + for (auto& use : llvm::make_early_inc_range(result.getUses())) { + Operation* def = use.getOwner(); + if (!llvm::isa(def)) { + // If user is not a `tf.TPUReplicatedOutput`, simply forward the first + // replica output. Certain Graphs under V1 create `tf.Identity` users of + // replicated ops to pin the TPU computation for execution. 
+ use.set(*replicate_outputs.begin()); + continue; + } + + const int def_num_results = def->getNumResults(); + if (def_num_results != num_replicas) return def->emitOpError() << "requires " << num_replicas << " results"; - auto replicate_outputs = llvm::make_range( - std::next(replicate_op.result_begin(), idx * num_replicas), - std::next(replicate_op.result_begin(), (idx + 1) * num_replicas)); def->replaceAllUsesWith(replicate_outputs); } } @@ -490,10 +490,29 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas) { // attribute `num_replicas` is greater than 1. // 9. Copy over TPUReplicateMetadata attributes to `tf_device.cluster`. LogicalResult FormClustersInBlock( - Block* block, const MetadataMap& metadata_map, + Block* block, const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { + MetadataMap metadata_map; + LogicalResult result = CollectMetadata(block, &metadata_map); + if (failed(result)) return result; + + // If there is no TPUReplicateMetadata op in this block, process blocks in + // regions attached to the op's in the block. + if (metadata_map.empty()) { + for (Operation& op : *block) { + for (Region& region : op.getRegions()) { + if (!llvm::hasSingleElement(region)) + return op.emitOpError("Expected single block region"); + if (failed( + FormClustersInBlock(®ion.front(), resource_alias_analysis))) + return failure(); + } + } + return success(); + } + ClusterMap clusters; - LogicalResult result = CollectAndGroupClusterOps(block, &clusters); + result = CollectAndGroupClusterOps(block, &clusters); if (failed(result)) return result; for (const auto& cluster_metadata_and_ops : clusters) { @@ -518,14 +537,8 @@ LogicalResult FormClustersInBlock( llvm::SmallVector results = CollectClusterResults(block, cluster_ops); - tf_device::ClusterOp cluster = - CreateOpForCluster(cluster_ops.back(), results); - - MoveClusterOpsToCluster(cluster, cluster_ops); - - UpdateClusterResultExternalUses(cluster, results); - - MovePrecedingClusterUsers(cluster, preceding_users.getArrayRef()); + tf_device::ClusterOp cluster = CreateClusterOp( + block, cluster_ops, results, preceding_users.getArrayRef()); auto num_replicas = cluster_metadata->getSecond().get(kNumReplicasAttr); if (!num_replicas || !num_replicas.isa()) @@ -548,13 +561,13 @@ LogicalResult FormClustersInBlock( void TPUClusterFormation::runOnFunction( FuncOp func, const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { - MetadataMap metadata_map; - if (failed(CollectMetadata(func, &metadata_map))) return signalPassFailure(); + if (!llvm::hasSingleElement(func)) { + func.emitOpError("Expecting a single block function"); + return signalPassFailure(); + } - for (Block& block : func) - if (failed( - FormClustersInBlock(&block, metadata_map, resource_alias_analysis))) - return signalPassFailure(); + if (failed(FormClustersInBlock(&func.front(), resource_alias_analysis))) + return signalPassFailure(); // Remove TPUReplicatedInput and TPUReplicatedOutput nodes. auto remove_result = func.walk([&](Operation* op) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_composite_resource_ops.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_composite_resource_ops.cc new file mode 100644 index 00000000000..b4889f6e52c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_colocate_composite_resource_ops.cc @@ -0,0 +1,137 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" + +namespace mlir { +namespace TFTPU { +namespace { + +// Pass that co-locates resource ops that use composite device resources +// (packed tensors) with the underlying physical TPU device. +struct TPUColocateCompositeResourceOps + : public PassWrapper { + void runOnFunction() override; +}; + +// Wraps single op in `tf_device.launch` for explicit device assignment. +void WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op, + llvm::StringRef device) { + builder->setInsertionPoint(op); + auto launch = builder->create( + loc, builder->getStringAttr(device), op->getResultTypes()); + launch.body().push_back(new Block); + op->replaceAllUsesWith(launch); + + builder->setInsertionPointToEnd(&launch.GetBody()); + builder->create(loc, op->getResults()); + + // Move op inside cluster. + op->moveBefore(launch.GetBody().getTerminator()); +} + +llvm::SmallVector GetResourceOpsUsingCompositeArgsInReplicate( + tf_device::ReplicateOp replicate) { + llvm::SmallVector resource_users; + const auto add_resource_op_to_list = [&resource_users](Operation* op) { + if (!llvm::isa(op)) return; + + resource_users.emplace_back(op); + }; + + llvm::SmallVector resource_users_to_visit; + for (auto composite_arguments : replicate.GetPackedBlockArguments()) { + for (auto resource_user : composite_arguments.getUsers()) + resource_users_to_visit.emplace_back(resource_user); + } + + while (!resource_users_to_visit.empty()) { + llvm::SmallVector new_resource_users; + + for (auto resource_user : resource_users_to_visit) { + add_resource_op_to_list(resource_user); + + // Account for pass-through identity ops. 
+ if (auto pass_through_identity = + llvm::dyn_cast(resource_user)) { + for (auto identity_user : pass_through_identity.output().getUsers()) { + new_resource_users.emplace_back(identity_user); + } + } + } + resource_users_to_visit.swap(new_resource_users); + } + + return resource_users; +} + +void ColocateCompositeResourceOpsInReplicate( + tf_device::ReplicateOp replicate_op, OpBuilder* builder) { + auto devices = replicate_op.devices(); + if (!devices) return; + if (!devices.getValue().get(tensorflow::GetDeviceAliasForLogicalCore(0))) + return; + + const auto composite_resource_users = + GetResourceOpsUsingCompositeArgsInReplicate(replicate_op); + for (auto resource_user : composite_resource_users) { + WrapOpInLaunch(builder, resource_user->getLoc(), resource_user, + tensorflow::GetDeviceAliasForLogicalCore(0)); + } +} + +void TPUColocateCompositeResourceOps::runOnFunction() { + // Find all the executes first, since we will mutate the nodes around each + // execute in the same tf_device.replicate op. + llvm::SmallVector execute_launches; + getFunction().walk([&](tf_device::LaunchOp op) { + if (op.WrapsSingleOp() && + llvm::isa( + op.GetBody().front())) + execute_launches.push_back(op); + }); + + OpBuilder builder(&getContext()); + for (auto execute_launch : execute_launches) { + auto replicate = execute_launch.getParentOfType(); + if (!replicate) continue; + + ColocateCompositeResourceOpsInReplicate(replicate, &builder); + } +} + +} // namespace + +std::unique_ptr> CreateTPUColocateCompositeResourceOps() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-colocate-composite-resource-ops", + "Colocate resource with composite device assignment to TPU device."); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc index 41362465cd9..59f36e03fbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_dynamic_layout_pass.cc @@ -185,7 +185,7 @@ bool HandleReplicatedInputs( const TF::ResourceAliasAnalysis::Info& resource_alias_analysis) { // We need to know the devices to copy to. if (!replicate.devices()) return false; - int64_t num_replicas = replicate.n().getZExtValue(); + int64_t num_replicas = replicate.n(); auto inputs = replicate.getOperands() .drop_front(replicate_arg_index * num_replicas) .take_front(num_replicas); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index fed4002bfcf..6e106b278fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -23,10 +23,10 @@ limitations under the License. 
#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project @@ -34,7 +34,9 @@ limitations under the License. #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" @@ -113,12 +115,23 @@ tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, return launch; } +// Checks if an operation is a supported TPU embedding op. +bool IsEmbeddingOp(Operation* op) { + return isa(op); +} + // Returns a set of ops that are outside compiled and can be extracted to before // the TPU computation. These ops are either connected to the inputs of the TPU // computation or other ops that can be extracted, and have no operands from // other ops in the TPU computation that cannot be extracted. llvm::SmallVector FindOutsideCompiledOpsAtHead( + const TF::SideEffectAnalysis& side_effect_analysis, tf_device::ClusterOp cluster) { + const auto& analysis = side_effect_analysis.GetAnalysisForFunc( + cluster.getParentOfType()); Region* cluster_region = &cluster.body(); llvm::SmallSetVector head_outside_compiled_ops; @@ -127,6 +140,24 @@ llvm::SmallVector FindOutsideCompiledOpsAtHead( if (!HasOutsideCompilationAttribute(&cluster_op)) continue; // An outside compiled op can be extracted if its operands are not from // other ops in the cluster that cannot be extracted. + + // Check if the side effecting op right before this side effecting op, if + // it is side effecting, can be head extracted. Because of op ordering due + // to side effects, if this is not true, this op cannot be head extracted. + // TODO(lyandy): Remove special handling of embedding ops. Currently the IR + // is in a topological sort order and depending on that ordering, embedding + // ops may prevent other ops from being head extracted. + auto predecessors = analysis.DirectControlPredecessors(&cluster_op); + if (!predecessors.empty() && !IsEmbeddingOp(&cluster_op)) { + bool skip = false; + for (Operation* predecessor : llvm::reverse(predecessors)) { + if (IsEmbeddingOp(predecessor)) continue; + skip = !head_outside_compiled_ops.contains(predecessor); + break; + } + if (skip) continue; + } + auto walk_result = cluster_op.walk([&](Operation* op) { for (Value operand : op->getOperands()) { Operation* operand_op = GetOpOfValue(operand); @@ -168,11 +199,11 @@ void CreateHeadComputation(OpBuilder* builder, tf_device::ClusterOp cluster, // Extracts and move outside compiled ops that have no dependencies in the // cluster to before the cluster. 
mlir::LogicalResult LiftHeadOutsideCompiledOps( - OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, - tf_device::ClusterOp cluster, std::string* host_device, - bool* cluster_updated) { + OpBuilder* builder, const TF::SideEffectAnalysis& side_effect_analysis, + const mlir::TF::RuntimeDevices& devices, tf_device::ClusterOp cluster, + std::string* host_device, bool* cluster_updated) { llvm::SmallVector head_outside_compiled_ops = - FindOutsideCompiledOpsAtHead(cluster); + FindOutsideCompiledOpsAtHead(side_effect_analysis, cluster); if (head_outside_compiled_ops.empty()) return success(); if (failed(tensorflow::GetHostDeviceOutsideComputation(devices, cluster, host_device))) @@ -191,9 +222,12 @@ mlir::LogicalResult LiftHeadOutsideCompiledOps( // TPU computation or other ops that can be extracted, and have no results used // by other ops in the TPU computation that cannot be extracted. void FindOutsideCompiledOpsAtTailAndClusterResults( + const TF::SideEffectAnalysis& side_effect_analysis, tf_device::ClusterOp cluster, llvm::SmallVectorImpl* tail_outside_compiled_ops, llvm::SmallVectorImpl* cluster_results) { + const auto& analysis = side_effect_analysis.GetAnalysisForFunc( + cluster.getParentOfType()); Region* cluster_region = &cluster.body(); llvm::SmallSetVector tail_outside_compiled_ops_set; Operation* terminator = cluster.GetBody().getTerminator(); @@ -205,6 +239,24 @@ void FindOutsideCompiledOpsAtTailAndClusterResults( for (Operation& cluster_op : cluster_ops) { if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + // Check if the side effecting op right after this side effecting op, if + // it is side effecting, can be tail extracted. Because of op ordering due + // to side effects, if this is not true, this op cannot be tail extracted. + // TODO(lyandy): Remove special handling of embedding ops. Currently the IR + // is in a topological sort order and depending on that ordering, embedding + // ops may prevent other ops from being tail extracted. + auto successors = analysis.DirectControlSuccessors( + &cluster_op, [&terminator](Operation* op) { return op != terminator; }); + if (!successors.empty() && !IsEmbeddingOp(&cluster_op)) { + bool skip = false; + for (Operation* successor : successors) { + if (IsEmbeddingOp(successor)) continue; + skip = !tail_outside_compiled_ops_set.contains(successor); + break; + } + if (skip) continue; + } + llvm::SmallVector results_to_forward; bool can_be_extracted = llvm::all_of(cluster_op.getUsers(), [&](Operation* op) { @@ -293,13 +345,14 @@ tf_device::ClusterOp UpdateClusterResults( // Extracts and move outside compiled ops that do not create dependencies in the // cluster to after the cluster. 
mlir::LogicalResult LiftTailOutsideCompiledOps( - OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, - std::string host_device, tf_device::ClusterOp* cluster, - bool* cluster_updated) { + OpBuilder* builder, const TF::SideEffectAnalysis& side_effect_analysis, + const mlir::TF::RuntimeDevices& devices, std::string host_device, + tf_device::ClusterOp* cluster, bool* cluster_updated) { llvm::SmallVector tail_outside_compiled_ops; llvm::SmallVector cluster_results; - FindOutsideCompiledOpsAtTailAndClusterResults( - *cluster, &tail_outside_compiled_ops, &cluster_results); + FindOutsideCompiledOpsAtTailAndClusterResults(side_effect_analysis, *cluster, + &tail_outside_compiled_ops, + &cluster_results); if (tail_outside_compiled_ops.empty()) return success(); if (host_device.empty()) @@ -365,6 +418,7 @@ struct TPUExtractHeadTailOutsideCompilation }; void TPUExtractHeadTailOutsideCompilation::runOnOperation() { + auto& side_effect_analysis = getAnalysis(); // Get runtime devices information from the closest parent module. auto module = getOperation(); mlir::TF::RuntimeDevices devices; @@ -379,10 +433,12 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { for (tf_device::ClusterOp cluster : clusters) { std::string host_device; bool cluster_updated = false; - if (failed(LiftHeadOutsideCompiledOps(&builder, devices, cluster, - &host_device, &cluster_updated)) || - failed(LiftTailOutsideCompiledOps(&builder, devices, host_device, - &cluster, &cluster_updated))) + if (failed(LiftHeadOutsideCompiledOps(&builder, side_effect_analysis, + devices, cluster, &host_device, + &cluster_updated)) || + failed(LiftTailOutsideCompiledOps(&builder, side_effect_analysis, + devices, host_device, &cluster, + &cluster_updated))) return signalPassFailure(); if (cluster_updated) RemoveClusterAliasedOutputs(&builder, cluster); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index b141a7dc792..65490716cf0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -88,22 +88,30 @@ struct TPUExtractOutsideCompilation }; // Holds information about control flow operations that wrap outside compiled -// op. Currently only tf.If op is supported. +// op. Currently only tf.IfRegion and tf.WhileRegion ops are supported. class ControlFlowStackInfo { public: - enum ControlFlowBranchType { kIfThen, kIfElse }; + enum ControlFlowBranchType { kIfThen, kIfElse, kWhileCond, kWhileBody }; explicit ControlFlowStackInfo(Operation* wrapping_op, Operation* nested_op) : callsite_op_(wrapping_op) { - // Only tf.IfRegion op is supported for now. 
- auto control_flow_op = llvm::cast(callsite_op_); - assert(control_flow_op); - - auto parent_region = nested_op->getParentRegion(); - if (&control_flow_op.then_branch() == parent_region) { - type_ = ControlFlowBranchType::kIfThen; + if (auto control_flow_op = llvm::dyn_cast(callsite_op_)) { + auto parent_region = nested_op->getParentRegion(); + if (&control_flow_op.then_branch() == parent_region) { + type_ = ControlFlowBranchType::kIfThen; + } else { + type_ = ControlFlowBranchType::kIfElse; + } + } else if (auto control_flow_op = + llvm::dyn_cast(callsite_op_)) { + auto parent_region = nested_op->getParentRegion(); + if (&control_flow_op.cond() == parent_region) { + type_ = ControlFlowBranchType::kWhileCond; + } else { + type_ = ControlFlowBranchType::kWhileBody; + } } else { - type_ = ControlFlowBranchType::kIfElse; + assert(false); } } @@ -116,6 +124,10 @@ class ControlFlowStackInfo { Operation* GetCallSiteOp() const { return callsite_op_; } + bool operator==(const ControlFlowStackInfo& other) const { + return type_ == other.type_ && callsite_op_ == other.callsite_op_; + } + private: ControlFlowBranchType type_; @@ -133,7 +145,7 @@ llvm::SmallVector GetControlFlowStackForOp( Operation* op_in_stack = op; while (op_in_stack != tpu_cluster.getOperation()) { auto parent_op = op_in_stack->getParentOp(); - if (llvm::isa(parent_op)) { + if (llvm::isa(parent_op)) { controlflow_stack.insert(controlflow_stack.begin(), ControlFlowStackInfo(parent_op, op_in_stack)); } @@ -166,7 +178,7 @@ TF::IfRegionOp CloneEmptyIfWithPredicate(Value predicate, bool is_stateless, // Replicates tf.IfRegion op to host side computation. Operation* ReplicateIf(const ControlFlowStackInfo& controlflow_info, - llvm::StringRef outside_cluster_name, ModuleOp module, + llvm::StringRef outside_cluster_name, Value compilation_key, OpBuilder* builder, int* send_recv_counter) { // Create XlaSendToHostOp to send predicate value from device to host. @@ -200,6 +212,64 @@ Operation* ReplicateIf(const ControlFlowStackInfo& controlflow_info, if_callsite_op.getLoc(), builder); } +// Creates a WhileRegionOp cond and body regions with yield op and +// an empty body. +TF::WhileRegionOp CloneEmptyWhile(bool is_stateless, + uint64_t parallel_iterations, Location loc, + OpBuilder* builder) { + auto host_side_while = builder->create( + loc, /*output=*/ArrayRef{}, /*input=*/ArrayRef{}, + is_stateless, parallel_iterations); + + // Create empty else branch region. + auto& body = host_side_while.body(); + body.push_back(new Block); + builder->setInsertionPointToEnd(&body.front()); + builder->create(loc, /*operands=*/ArrayRef{}); + return host_side_while; +} + +// Replicates tf.WhileRegion op to host side computation. +Operation* ReplicateWhile(const ControlFlowStackInfo& controlflow_info, + llvm::StringRef outside_cluster_name, + Value compilation_key, OpBuilder* builder, + int* send_recv_counter) { + // Create XlaSendToHostOp to send cond region output from device to host. 
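The new ControlFlowStackInfo constructor classifies the nested op by which region of the wrapping tf.IfRegion or tf.WhileRegion contains it. A small self-contained sketch of that classification, using toy structs (IfRegionOp/WhileRegionOp here are illustrative stand-ins, not the MLIR ops):

```cpp
#include <iostream>

enum class ControlFlowBranchType { kIfThen, kIfElse, kWhileCond, kWhileBody };

// Toy stand-ins: each region carries an id so we can tell which region
// contains the nested op.
struct Region { int id; };
struct IfRegionOp { Region then_branch, else_branch; };
struct WhileRegionOp { Region cond, body; };

ControlFlowBranchType Classify(const IfRegionOp& op, const Region& parent) {
  return parent.id == op.then_branch.id ? ControlFlowBranchType::kIfThen
                                        : ControlFlowBranchType::kIfElse;
}

ControlFlowBranchType Classify(const WhileRegionOp& op, const Region& parent) {
  return parent.id == op.cond.id ? ControlFlowBranchType::kWhileCond
                                 : ControlFlowBranchType::kWhileBody;
}

int main() {
  WhileRegionOp w{/*cond=*/{0}, /*body=*/{1}};
  IfRegionOp i{/*then_branch=*/{2}, /*else_branch=*/{3}};
  std::cout << (Classify(w, w.body) == ControlFlowBranchType::kWhileBody)   // 1
            << (Classify(i, i.else_branch) == ControlFlowBranchType::kIfElse)  // 1
            << "\n";
  return 0;
}
```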
+ OpBuilder::InsertPoint insert_point = builder->saveInsertionPoint(); + auto while_callsite_op = + llvm::cast(controlflow_info.GetCallSiteOp()); + builder->setInsertionPoint(while_callsite_op.cond().front().getTerminator()); + + const auto condition_send_recv_key = + llvm::formatv("while_condition_channel_{0}_{1}", outside_cluster_name, + *send_recv_counter) + .str(); + *send_recv_counter += 1; + auto condition = + while_callsite_op.cond().front().getTerminator()->getOperand(0); + builder->create(while_callsite_op.getLoc(), condition, + condition_send_recv_key); + builder->restoreInsertionPoint(insert_point); + + auto host_side_while = CloneEmptyWhile( + while_callsite_op.is_stateless(), while_callsite_op.parallel_iterations(), + while_callsite_op.getLoc(), builder); + + // Create cond region and yield the condition from the device. + auto& cond = host_side_while.cond(); + cond.push_back(new Block); + builder->setInsertionPointToEnd(&cond.front()); + auto recv_condition_at_host = builder->create( + while_callsite_op.getLoc(), llvm::ArrayRef{condition.getType()}, + /*dynamic_key=*/compilation_key, + builder->getStringAttr(condition_send_recv_key), + /*device_ordinal=*/builder->getI64IntegerAttr(0)); + builder->create(while_callsite_op.getLoc(), + recv_condition_at_host.getResults()); + + return host_side_while; +} + // TODO(b/157054714): Use a better abstraction instead of // _TPUCompileMlirOp and _XlaRecvAtHostOp and _XlaSendFromHostOp. // Creates a compilation key as placeholder. A placeholder compilation cache key @@ -214,45 +284,97 @@ Value CreateCompilationKeyPlaceholder(Location loc, OpBuilder* builder) { loc, /*program=*/result_type, llvm::ArrayRef{}); } +// Retrieves terminator of branch specified by `control_flow_info` of replicated +// control flow op. +Operation* GetControlFlowBranchRegionTerminator( + const ControlFlowStackInfo& controlflow_info, Operation* controlflow_op) { + if (auto inner_most_if = llvm::dyn_cast(controlflow_op)) { + if (controlflow_info.GetBranchType() == ControlFlowStackInfo::kIfThen) { + return inner_most_if.then_branch().front().getTerminator(); + } else { + return inner_most_if.else_branch().front().getTerminator(); + } + } else if (auto inner_most_while = + llvm::dyn_cast(controlflow_op)) { + if (controlflow_info.GetBranchType() == ControlFlowStackInfo::kWhileCond) { + return &inner_most_while.cond().front().front(); + } else { + return inner_most_while.body().front().getTerminator(); + } + } + assert(false); + return nullptr; +} + // Replicates the control flow operations that wraps outside compiled ops to // `destination_block`. 
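ReplicateWhile mirrors only the loop structure on the host: loop-carried state stays on the device, and the single value that crosses the boundary each iteration is the loop condition, sent over a dedicated channel and received by the host loop's cond region. The toy simulation below sketches that protocol with a std::queue in place of the XlaSendToHost/_XlaRecvAtHost pair; it illustrates why both loops run the same number of iterations and is not the generated IR.

```cpp
#include <iostream>
#include <queue>

// Toy "channel": the device pushes the loop condition, the host pops it.
std::queue<bool> while_condition_channel;

// Device-side loop: computes the next condition and sends it to the host
// before testing it (analogue of the XlaSendToHost in the cond region).
void DeviceLoop() {
  int i = 0;
  while (true) {
    bool keep_going = i < 3;                   // cond region result
    while_condition_channel.push(keep_going);  // send to host
    if (!keep_going) break;
    ++i;                                       // device body work
  }
}

// Host-side replicated loop: its cond region only receives the device's
// condition, so both loops iterate in lockstep.
void HostLoop() {
  int host_iterations = 0;
  while (true) {
    bool keep_going = while_condition_channel.front();  // recv at host
    while_condition_channel.pop();
    if (!keep_going) break;
    ++host_iterations;  // host body work (outside compiled ops go here)
  }
  std::cout << "host ran " << host_iterations << " iterations\n";  // 3
}

int main() {
  DeviceLoop();
  HostLoop();
  return 0;
}
```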
-Block* ReplicateControlFlowStack( +Operation* GetOrReplicateControlFlowStack( llvm::StringRef outside_cluster_name, const llvm::SmallVectorImpl& stack_info, tf_device::ClusterOp tpu_cluster, ModuleOp module, Value compilation_key, - Block* destination_block, int* send_recv_counter) { - assert(stack_info.size()); + Block* destination_block, int* send_recv_counter, + llvm::SmallDenseMap* replicated_controlflow_map) { + assert(!stack_info.empty()); + const auto& controlflow_info = stack_info.back(); + auto it = replicated_controlflow_map->find(controlflow_info.GetCallSiteOp()); + if (it != replicated_controlflow_map->end()) + return GetControlFlowBranchRegionTerminator(controlflow_info, it->second); + OpBuilder builder = OpBuilder::atBlockTerminator(destination_block); Operation* previous_replicated_controlflow_op = nullptr; for (const auto& controlflow_stack_info : stack_info) { + // If controlflow operation has already been created, reuse the cached + // controlflow operation. + auto it = replicated_controlflow_map->find( + controlflow_stack_info.GetCallSiteOp()); + if (it != replicated_controlflow_map->end()) { + previous_replicated_controlflow_op = it->second; + builder.setInsertionPoint(GetControlFlowBranchRegionTerminator( + controlflow_stack_info, previous_replicated_controlflow_op)); + continue; + } + // Create control flow op given provided insertion point and // ControlFlowStackInfo. - previous_replicated_controlflow_op = - ReplicateIf(controlflow_stack_info, outside_cluster_name, module, - compilation_key, &builder, send_recv_counter); - auto if_op = llvm::cast(previous_replicated_controlflow_op); - auto type = controlflow_stack_info.GetBranchType(); + if (auto control_flow_op = llvm::dyn_cast( + controlflow_stack_info.GetCallSiteOp())) { + previous_replicated_controlflow_op = + ReplicateIf(controlflow_stack_info, outside_cluster_name, + compilation_key, &builder, send_recv_counter); + auto if_op = + llvm::cast(previous_replicated_controlflow_op); + auto type = controlflow_stack_info.GetBranchType(); - // Update the insertion point to proper region inside the newly created - // control flow op. - if (type == ControlFlowStackInfo::kIfThen) { - builder.setInsertionPoint(&if_op.then_branch().front().front()); - } else { - builder.setInsertionPoint(&if_op.else_branch().front().front()); + // Update the insertion point to proper region inside the newly created + // control flow op. + if (type == ControlFlowStackInfo::kIfThen) { + builder.setInsertionPoint(&if_op.then_branch().front().front()); + } else { + builder.setInsertionPoint(&if_op.else_branch().front().front()); + } + } else if (auto control_flow_op = llvm::dyn_cast( + controlflow_stack_info.GetCallSiteOp())) { + previous_replicated_controlflow_op = + ReplicateWhile(controlflow_stack_info, outside_cluster_name, + compilation_key, &builder, send_recv_counter); + auto while_op = + llvm::cast(previous_replicated_controlflow_op); + auto type = controlflow_stack_info.GetBranchType(); + if (type == ControlFlowStackInfo::kWhileCond) { + builder.setInsertionPoint(&while_op.cond().front().front()); + } else { + builder.setInsertionPoint(&while_op.body().front().front()); + } } } - // Return the inner most branch at which outside compiled op is located. - // This block will later be used as insertion point to create send/recv ops. 
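GetOrReplicateControlFlowStack memoizes host-side clones keyed on the device-side callsite op, so two outside-compiled sections wrapped by the same control flow share a single replicated op on the host instead of producing duplicates. A simplified memoization sketch (toy structs in place of Operation*):

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Stand-ins: a device-side control flow op and its host-side clone.
struct DeviceControlFlow { std::string name; };
struct HostControlFlow { std::string name; };

// Clones the device control flow on the host the first time it is seen and
// returns the cached clone on subsequent requests.
HostControlFlow* GetOrReplicate(
    const DeviceControlFlow* device_op,
    std::unordered_map<const DeviceControlFlow*, HostControlFlow*>* cache) {
  auto it = cache->find(device_op);
  if (it != cache->end()) return it->second;  // Reuse the existing host clone.
  auto* clone = new HostControlFlow{"host_" + device_op->name};
  cache->emplace(device_op, clone);
  return clone;
}

int main() {
  std::unordered_map<const DeviceControlFlow*, HostControlFlow*> cache;
  DeviceControlFlow if_op{"if0"};
  HostControlFlow* a = GetOrReplicate(&if_op, &cache);
  HostControlFlow* b = GetOrReplicate(&if_op, &cache);
  std::cout << (a == b ? "reused" : "duplicated") << "\n";  // "reused"
  for (auto& kv : cache) delete kv.second;  // cleanup for the sketch only
  return 0;
}
```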
- auto inner_most_controlflow_stack = stack_info.back(); - auto inner_most_if = - llvm::cast(previous_replicated_controlflow_op); - if (inner_most_controlflow_stack.GetBranchType() == - ControlFlowStackInfo::kIfThen) { - return &inner_most_if.then_branch().front(); - } else { - return &inner_most_if.else_branch().front(); - } + replicated_controlflow_map->try_emplace(stack_info.back().GetCallSiteOp(), + previous_replicated_controlflow_op); + + // Return operation which should be used to as the insertion point to create + // send/recv ops. + return GetControlFlowBranchRegionTerminator( + stack_info.back(), previous_replicated_controlflow_op); } // Collects and clusters ops in `block` with the same `_xla_outside_compilation` @@ -279,18 +401,17 @@ LogicalResult CollectAndGroupOutsideClusterOps(Block* block, return failure(walk_result.wasInterrupted()); } -// Moves `cluster_ops` to associated `block`. -void MoveOutsideClusterOpsToBlock(Block& block, - llvm::ArrayRef cluster_ops, - MLIRContext* context) { - Operation* terminator = block.getTerminator(); +// Moves `cluster_ops` before `op`. +void MoveOutsideClusterOpsBeforeOp(Operation* op, + llvm::ArrayRef cluster_ops, + MLIRContext* context) { for (Operation* cluster_op : cluster_ops) { // Remove `_xla_outside_compilation` and `device` attribute from ops in the // cluster as that information will be present in the `launch_op`. cluster_op->removeAttr( Identifier::get(kXlaOutsideCompilationAttr, context)); cluster_op->removeAttr(Identifier::get(kDeviceAttr, context)); - cluster_op->moveBefore(terminator); + cluster_op->moveBefore(op); } } @@ -330,19 +451,46 @@ llvm::SmallSetVector GetExternalOperands( // in `host_cluster_ops`. for (Value v : op->getOperands()) { Operation* defining_op = v.getDefiningOp(); - if (!defining_op) continue; - bool is_external = llvm::none_of( - host_cluster_ops, - [&](Operation* cluster_op) { return defining_op == cluster_op; }); + bool is_external = false; + if (defining_op) { + if (!tpu_cluster.getOperation()->isAncestor(defining_op)) continue; + is_external = + llvm::none_of(host_cluster_ops, [&](Operation* cluster_op) { + return defining_op == cluster_op; + }); + } else { + if (auto block_arg = v.dyn_cast()) { + if (block_arg.getParentRegion() == cluster_op_parent_region) + is_external = true; + } + } if (is_external) external_values.insert(v); } } else { llvm::SetVector external_captured_inputs; visitUsedValuesDefinedAbove(*region, *region, [&](OpOperand* operand) { - Region* parent_region = operand->get().getParentRegion(); - if (!tpu_cluster.body().isAncestor(parent_region)) return; + const bool captured_value_from_host = + llvm::find(host_cluster_ops, operand->get().getDefiningOp()) != + host_cluster_ops.end(); + if (captured_value_from_host) return; + Region* operand_defined_region = operand->get().getParentRegion(); + if (!tpu_cluster.body().isAncestor(operand_defined_region)) return; + // If the host_cluster_op is regional control flow (if, while), + // then check if the operand_defined_region is an ancestor of the + // control flow regions. 
+ if (auto if_op = llvm::dyn_cast(host_cluster_op)) { + if (if_op.then_branch().isAncestor(operand_defined_region) || + if_op.else_branch().isAncestor(operand_defined_region)) + return; + } + if (auto while_op = + llvm::dyn_cast(host_cluster_op)) { + if (while_op.cond().isAncestor(operand_defined_region) || + while_op.body().isAncestor(operand_defined_region)) + return; + } external_captured_inputs.insert(operand->get()); }); external_values.insert(external_captured_inputs.begin(), @@ -355,15 +503,21 @@ llvm::SmallSetVector GetExternalOperands( } // Extracts all externally used outputs of `cluster_ops`. -llvm::SmallVector GetExternalOutputs( +llvm::SmallSetVector GetExternalOutputs( llvm::ArrayRef cluster_ops) { llvm::SmallSetVector external_outputs; + llvm::SmallPtrSet host_cluster_ops_set; + for (auto op : cluster_ops) { + op->walk([&](Operation* host_cluster_op) { + host_cluster_ops_set.insert(host_cluster_op); + }); + } for (Operation* op : cluster_ops) { for (Operation* user : op->getUsers()) { - bool is_external = llvm::none_of(cluster_ops, [&](Operation* cluster_op) { - return user == cluster_op; - }); + bool is_external = llvm::none_of( + host_cluster_ops_set, + [&](Operation* cluster_op) { return user == cluster_op; }); if (!is_external) continue; for (Value v : user->getOperands()) { if (v.getDefiningOp() == op) external_outputs.insert(v); @@ -371,7 +525,7 @@ llvm::SmallVector GetExternalOutputs( } } - return external_outputs.takeVector(); + return external_outputs; } // Sets the insertion point on `builder` for HostCompute op. Sets insertion @@ -390,6 +544,12 @@ void SetHostComputeInsertion( } } } + + // If no operand usage can be found, this means that external input is + // implicitly captured inputs for ops inside internal regions of one of the + // `cluster_ops`. In that case, set the insertion point to the last op of the + // `cluster_ops` in the IR. + builder->setInsertionPoint(cluster_ops.back()); } // Creates the HostCompute with `inputs` and `outputs` @@ -412,53 +572,62 @@ TF::_XlaHostComputeMlirOp CreateHostCompute( return host_compute; } -void MoveOutsideCompiledOps( +// Represents a set of ops inside host computation that is wrapped inside the +// same control flow. +struct HostCluster { + // List of control flow that wraps host computation operations. May be empty. + llvm::SmallVector controlflow_stack; + + // Set of operations that will run on host wrapped around same stack of + // control flow. + llvm::SmallVector section_ops; +}; + +HostCluster* FindHostCluster( + llvm::SmallVectorImpl& host_cluster_sections, + const llvm::SmallVector& control_flows) { + for (auto& section : host_cluster_sections) + if (control_flows == section.controlflow_stack) return §ion; + return nullptr; +} + +void MoveOutsideCompiledOpsInsideControlFlow( ModuleOp module, tf_device::ClusterOp tpu_cluster, - llvm::StringRef outside_cluster_name, tf_device::LaunchOp host_launch_op, - llvm::ArrayRef cluster_ops, - const llvm::SmallSetVector& external_inputs, - llvm::ArrayRef external_outputs) { - // Since ops in `cluster_ops` do not cross function/control flow boundary, it - // is sufficient to identify the control flow that wraps `cluster_ops` by - // looking at any arbitary op inside `cluster_ops`. 
- auto controlflow_stack = - GetControlFlowStackForOp(tpu_cluster, cluster_ops.front()); - - Value compilation_key; - if (!controlflow_stack.empty() || !external_inputs.empty() || - !external_outputs.empty()) { - OpBuilder builder(&host_launch_op.GetBody().front()); - compilation_key = - CreateCompilationKeyPlaceholder(tpu_cluster.getLoc(), &builder); - } - - Block* block_to_move_host_cluster = nullptr; + llvm::StringRef host_cluster_section_name, + tf_device::LaunchOp host_launch_op, Value compilation_key, + llvm::ArrayRef cluster_section_ops, + const llvm::SmallVectorImpl& controlflow_stack, + const llvm::SmallSetVector& section_external_inputs, + llvm::ArrayRef section_external_outputs, + llvm::SmallDenseMap* replicated_controlflow_map) { + Operation* insertion_op = nullptr; if (controlflow_stack.empty()) { - block_to_move_host_cluster = &host_launch_op.GetBody(); + insertion_op = host_launch_op.GetBody().getTerminator(); } else { int send_recv_counter = 0; - block_to_move_host_cluster = ReplicateControlFlowStack( - outside_cluster_name, controlflow_stack, tpu_cluster, module, - compilation_key, &host_launch_op.GetBody(), &send_recv_counter); + insertion_op = GetOrReplicateControlFlowStack( + host_cluster_section_name, controlflow_stack, tpu_cluster, module, + compilation_key, &host_launch_op.GetBody(), &send_recv_counter, + replicated_controlflow_map); } MLIRContext* context = host_launch_op.getContext(); - if (external_inputs.empty() && external_outputs.empty()) { - MoveOutsideClusterOpsToBlock(*block_to_move_host_cluster, cluster_ops, - context); + if (section_external_inputs.empty() && section_external_outputs.empty()) { + MoveOutsideClusterOpsBeforeOp(insertion_op, cluster_section_ops, context); return; } - OpBuilder builder(block_to_move_host_cluster->getTerminator()); + OpBuilder builder(insertion_op); llvm::SmallVector host_output_types; - for (const auto& external_input : external_inputs) + for (const auto& external_input : section_external_inputs) host_output_types.push_back(external_input.getType()); std::string args_communication_key = - llvm::formatv("host_compute_channel_{0}_args", outside_cluster_name) + llvm::formatv("host_compute_channel_{0}_args", host_cluster_section_name) .str(); std::string retvals_communication_key = - llvm::formatv("host_compute_channel_{0}_retvals", outside_cluster_name) + llvm::formatv("host_compute_channel_{0}_retvals", + host_cluster_section_name) .str(); auto recv_at_host = builder.create( @@ -467,26 +636,105 @@ void MoveOutsideCompiledOps( builder.getStringAttr(args_communication_key), /*device_ordinal=*/builder.getI64IntegerAttr(0)); - auto host_compute = CreateHostCompute( - &builder, tpu_cluster, cluster_ops, external_inputs, external_outputs, - args_communication_key, retvals_communication_key); - MoveOutsideClusterOpsToBlock(*block_to_move_host_cluster, cluster_ops, - context); + auto host_compute = + CreateHostCompute(&builder, tpu_cluster, cluster_section_ops, + section_external_inputs, section_external_outputs, + args_communication_key, retvals_communication_key); + MoveOutsideClusterOpsBeforeOp(insertion_op, cluster_section_ops, context); - builder.setInsertionPoint(block_to_move_host_cluster->getTerminator()); + builder.setInsertionPoint(insertion_op); builder.create( - tpu_cluster.getLoc(), external_outputs, + tpu_cluster.getLoc(), section_external_outputs, /*dynamic_key=*/compilation_key, builder.getStringAttr(retvals_communication_key), /*device_ordinal=*/builder.getI64IntegerAttr(0)); - for (auto result : 
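Each host cluster section gets its own pair of communication keys, host_compute_channel_&lt;section&gt;_args and host_compute_channel_&lt;section&gt;_retvals, and replicated control flow uses counter-suffixed keys such as while_condition_channel_&lt;section&gt;_&lt;n&gt;. A small sketch of that naming scheme (plain string concatenation here; the pass itself uses llvm::formatv):

```cpp
#include <iostream>
#include <string>

std::string ArgsKey(const std::string& section) {
  return "host_compute_channel_" + section + "_args";
}

std::string RetvalsKey(const std::string& section) {
  return "host_compute_channel_" + section + "_retvals";
}

// Counter-based key for condition/predicate sends; the counter keeps keys
// unique when one section replicates several control flow ops.
std::string ConditionKey(const std::string& section, int* counter) {
  return "while_condition_channel_" + section + "_" +
         std::to_string((*counter)++);
}

int main() {
  int send_recv_counter = 0;
  std::cout << ArgsKey("cluster0_0") << "\n"
            << RetvalsKey("cluster0_0") << "\n"
            << ConditionKey("cluster0_0", &send_recv_counter) << "\n";
  return 0;
}
```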
llvm::zip(external_inputs, recv_at_host.getResults())) + for (auto result : + llvm::zip(section_external_inputs, recv_at_host.getResults())) { mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), - host_launch_op.body()); + *insertion_op->getParentRegion()); + } - for (auto result : llvm::zip(external_outputs, host_compute.getResults())) - mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), - tpu_cluster.body()); + for (auto result : + llvm::zip(section_external_outputs, host_compute.getResults())) { + for (auto& result_use : std::get<0>(result).getUses()) { + Operation* result_using_op = result_use.getOwner(); + const bool inside_device_cluster = + tpu_cluster.body().isAncestor(result_using_op->getParentRegion()); + if (inside_device_cluster) result_use.set(std::get<1>(result)); + } + } +} + +void MoveOutsideCompiledOps( + ModuleOp module, tf_device::ClusterOp tpu_cluster, + llvm::StringRef outside_cluster_name, tf_device::LaunchOp host_launch_op, + llvm::ArrayRef cluster_ops, + const llvm::SmallSetVector& external_inputs, + const llvm::SmallSetVector& external_outputs) { + // Identify and groups ops in `cluster_ops` by ops wrapped inside the same + // control flows. + llvm::SmallVector host_cluster_sections; + for (Operation* host_cluster_op : cluster_ops) { + auto controlflow_stack = + GetControlFlowStackForOp(tpu_cluster, host_cluster_op); + auto host_cluster_section = + FindHostCluster(host_cluster_sections, controlflow_stack); + if (!host_cluster_section) { + host_cluster_sections.emplace_back( + HostCluster{controlflow_stack, {host_cluster_op}}); + } else { + host_cluster_section->section_ops.emplace_back(host_cluster_op); + } + } + + const bool has_control_flow = + llvm::any_of(host_cluster_sections, [](const auto host_cluster_section) { + return !host_cluster_section.controlflow_stack.empty(); + }); + + Value compilation_key; + if (has_control_flow || !external_inputs.empty() || + !external_outputs.empty()) { + OpBuilder builder(&host_launch_op.GetBody().front()); + compilation_key = + CreateCompilationKeyPlaceholder(tpu_cluster.getLoc(), &builder); + } + + // Maintains a map of control flow callsite operation in TPU device side + // and an replicated control flow operation on host cluster. + llvm::SmallDenseMap replicated_controlflows; + + // Move `cluster_op` to host cluster, replicating control flow if ops are + // wrapped inside a control flow. 
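MoveOutsideCompiledOps now splits one outside-compilation cluster into HostCluster sections keyed by the control-flow stack of each op, with FindHostCluster doing a linear find-or-create over the existing sections. A standalone sketch of that grouping, with strings standing in for ControlFlowStackInfo entries:

```cpp
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// A section of host ops that share the same wrapping control flow.
struct HostCluster {
  std::vector<std::string> controlflow_stack;  // e.g. {"while0", "if1"}
  std::vector<std::string> section_ops;
};

HostCluster* FindHostCluster(std::vector<HostCluster>& sections,
                             const std::vector<std::string>& stack) {
  for (auto& section : sections)
    if (section.controlflow_stack == stack) return &section;
  return nullptr;
}

int main() {
  // (op, its control-flow stack inside the TPU cluster)
  std::vector<std::pair<std::string, std::vector<std::string>>> cluster_ops = {
      {"opA", {}}, {"opB", {"while0"}}, {"opC", {"while0"}}, {"opD", {"if1"}}};

  std::vector<HostCluster> sections;
  for (const auto& [op, stack] : cluster_ops) {
    if (HostCluster* section = FindHostCluster(sections, stack))
      section->section_ops.push_back(op);
    else
      sections.push_back(HostCluster{stack, {op}});
  }
  std::cout << sections.size() << " sections\n";  // 3
  return 0;
}
```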
+ for (const auto& host_cluster_section_and_index : + llvm::enumerate(host_cluster_sections)) { + const auto& host_cluster_section = host_cluster_section_and_index.value(); + const int index = host_cluster_section_and_index.index(); + + const auto& controlflow_stack = host_cluster_section.controlflow_stack; + const auto& cluster_section_ops = host_cluster_section.section_ops; + auto section_external_inputs = + GetExternalOperands(tpu_cluster, cluster_section_ops); + for (auto input : section_external_inputs) { + if (!external_inputs.contains(input)) + section_external_inputs.remove(input); + } + auto section_external_outputs = GetExternalOutputs(cluster_section_ops); + for (auto output : section_external_outputs) { + if (!external_outputs.contains(output)) + section_external_outputs.remove(output); + } + + const std::string host_cluster_section_name = + llvm::formatv("{0}_{1}", outside_cluster_name, index).str(); + + MoveOutsideCompiledOpsInsideControlFlow( + module, tpu_cluster, host_cluster_section_name, host_launch_op, + compilation_key, cluster_section_ops, controlflow_stack, + section_external_inputs, section_external_outputs.takeVector(), + &replicated_controlflows); + } } // Creates a `parallel_execute` op in place of launch with 'clusters` and diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc index be01b7644ea..63bb53f52b5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_outside_compilation_cluster.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -22,6 +23,7 @@ limitations under the License. #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" @@ -33,10 +35,26 @@ namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; struct TPUOutsideCompilationCluster - : public PassWrapper { - void runOnFunction() override; + : public TF::PerFunctionAggregateAnalysisConsumerPass< + TPUOutsideCompilationCluster, TF::SideEffectAnalysis> { + void runOnFunction(FuncOp func, + const TF::SideEffectAnalysis::Info& side_effect_analysis); }; +bool IsVariant(Value value) { + return getElementTypeOrSelf(value.getType()).isa(); +} + +bool HasOutsideCompiledAncestor(Operation* op) { + Operation* parent = op->getParentOp(); + while (parent) { + if (parent->getAttrOfType(kXlaOutsideCompilationAttr)) + return true; + parent = parent->getParentOp(); + } + return false; +} + // Represents an outside compiled cluster. All ops that are added to the same // cluster will be extracted together in a later pass. class OutsideCompiledCluster { @@ -44,81 +62,141 @@ class OutsideCompiledCluster { explicit OutsideCompiledCluster(int number) : cluster_name_(llvm::formatv("cluster{0}", number).str()) {} - // Attempts to add an op to this cluster. 
- // This function requires all ops to be added before their uses. - bool AddOp(Operation* op) { + // Attempts to add an op to this cluster. Ops can be grouped to the same + // cluster if they have data dependency and are inside the same block. + bool AddOp(Operation* op, + const TF::SideEffectAnalysis::Info& side_effect_analysis) { // Check if the op is safe to add before adding it. - bool add = IsSafeToAdd(op); - if (add) { - // Set the ops kXlaOutsideCompilationAttr to the cluster name. + if (IsSafeToAdd(op, side_effect_analysis)) { op->setAttr(kXlaOutsideCompilationAttr, StringAttr::get(cluster_name_, op->getContext())); - - // Since we are adding the op to the cluster, the op is no longer - // considered a user of this cluster. - users_.erase(op); + host_cluster_ops_.insert(op); + return true; } + return false; + } - // Add this op's users to the cluster users. - users_.insert(op->user_begin(), op->user_end()); - return add; + // If any tf.variants are inputs/outputs to the cluster, add them to the + // cluster unless they are already marks with outside compilation attribute. + bool AddVariantInputsOutputs() { + bool added_op = false; + llvm::SmallPtrSet expanded_cluster_ops(host_cluster_ops_); + for (Operation* cluster_op : host_cluster_ops_) { + // Walk the clustered operations to handle nested ops. + cluster_op->walk([&](Operation* op) { + // Add any operations that provide variant inputs to the cluster. + for (auto value : op->getOperands()) { + auto input_defining_op = value.getDefiningOp(); + if (IsVariant(value) && input_defining_op && + !HasOutsideCompiledAncestor(input_defining_op) && + !input_defining_op->getAttrOfType( + kXlaOutsideCompilationAttr)) { + expanded_cluster_ops.insert(input_defining_op); + input_defining_op->setAttr( + kXlaOutsideCompilationAttr, + StringAttr::get(cluster_name_, + input_defining_op->getContext())); + added_op = true; + } + } + // Add any operations that consume variant outputs to the cluster. + for (auto value : op->getResults()) { + if (IsVariant(value)) { + for (auto user : value.getUsers()) { + if (!host_cluster_ops_.contains(user) && + !HasOutsideCompiledAncestor(user) && + !user->getAttrOfType( + kXlaOutsideCompilationAttr)) { + expanded_cluster_ops.insert(user); + user->setAttr( + kXlaOutsideCompilationAttr, + StringAttr::get(cluster_name_, user->getContext())); + added_op = true; + } + } + } + } + }); + } + host_cluster_ops_.swap(expanded_cluster_ops); + + return added_op; } private: // Checks if it is safe for an op to be merged into this cluster. - bool IsSafeToAdd(Operation* op) { + bool IsSafeToAdd(Operation* op, + const TF::SideEffectAnalysis::Info& side_effect_analysis) { + if (closed_) return false; // If the op is not marked for outside compilation it doesn't belong in a // cluster. - if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) + if (!op->getAttrOfType(kXlaOutsideCompilationAttr)) { + auto successors = side_effect_analysis.DirectControlSuccessors(op); + // If non outside compiled op with side effect successors is encountered, + // close this cluster to additions so that no cluster cyclic dependencies + // can be created. + if (!successors.empty()) { + closed_ = true; + } return false; - - // Checks to see if the op's operands are related to this - // clusters users. If they are related, then there is an op between this - // op and the cluster. 
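AddVariantInputsOutputs grows a host cluster until it is closed under tf.variant producers and consumers, since variant values cannot be passed through the device boundary. The fixed-point sketch below models that expansion over a toy value graph; it omits the checks for existing outside-compilation attributes and outside-compiled ancestors that the real pass performs.

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Toy op: the variant-typed values it consumes and produces.
struct ToyOp {
  std::string name;
  std::vector<std::string> variant_inputs;
  std::vector<std::string> variant_outputs;
};

bool SharesVariant(const std::vector<std::string>& a,
                   const std::vector<std::string>& b) {
  for (const auto& x : a)
    for (const auto& y : b)
      if (x == y) return true;
  return false;
}

// Pulls variant producers/consumers into the cluster until nothing changes.
void AddVariantInputsOutputs(const std::vector<ToyOp>& all_ops,
                             std::set<std::string>* cluster) {
  bool changed = true;
  while (changed) {
    changed = false;
    for (const ToyOp& candidate : all_ops) {
      if (cluster->count(candidate.name)) continue;
      for (const ToyOp& member : all_ops) {
        if (!cluster->count(member.name)) continue;
        // candidate feeds a variant into the cluster, or consumes one from it.
        if (SharesVariant(candidate.variant_outputs, member.variant_inputs) ||
            SharesVariant(candidate.variant_inputs, member.variant_outputs)) {
          cluster->insert(candidate.name);
          changed = true;
          break;
        }
      }
    }
  }
}

int main() {
  // list_new -> list_push (clustered) -> list_pop
  std::vector<ToyOp> ops = {{"list_new", {}, {"v0"}},
                            {"list_push", {"v0"}, {"v1"}},
                            {"list_pop", {"v1"}, {}}};
  std::set<std::string> cluster = {"list_push"};
  AddVariantInputsOutputs(ops, &cluster);
  std::cout << cluster.size() << "\n";  // 3: both neighbors are pulled in
  return 0;
}
```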
Since ops are added before their uses, there - // is no way for the op in-between to ever be added to this cluster - // therefore there is no way this op can ever be added to the cluster. - for (const Value& value : op->getOperands()) { - Operation* op_operand = value.getDefiningOp(); - if (op_operand && users_.find(op_operand) != users_.end()) return false; } - return true; + + if (host_cluster_ops_.empty()) return true; + + // Checks to see if there is data dependency between ops in + // `host_cluster_ops_` and `op`. + const bool contains_data_dependency = llvm::any_of( + op->getUsers(), + [&](Operation* user) { return host_cluster_ops_.contains(user); }); + + return contains_data_dependency; } - // users_ stores the direct and indirect users of the outside compiled ops in - // this cluster. It does NOT store the outside compiled ops that are a part - // of this cluster that will be collectively extracted and run on the cpu. - // users_ is consulted when attempting to add a new outside compiled to the - // cluster. If the new op's operand(s) are already in users_, it means that - // the operand(s) were not added to the cluster so it is not safe to add the - // new op to the cluster either. - llvm::SmallPtrSet users_; + // `host_cluster_op_` stores a set of ops that will be grouped and computed + // on host as single XlaHostCompute op. An outside compiled op can be grouped + // to a single cluster if it has data dependency to another op already in the + // cluster. + llvm::SmallPtrSet host_cluster_ops_; std::string cluster_name_; + bool closed_ = false; // Cluster is closed to further additions. }; -void TPUOutsideCompilationCluster::runOnFunction() { +void TPUOutsideCompilationCluster::runOnFunction( + FuncOp func, const TF::SideEffectAnalysis::Info& side_effect_analysis) { llvm::SmallVector clusters; int cluster_counter = 0; - getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { - for (Operation& op : tpu_cluster.GetBody()) { + func.walk([&](tf_device::ClusterOp tpu_cluster) { + llvm::SmallVector tpu_cluster_ops; + tpu_cluster_ops.reserve(tpu_cluster.getBody()->getOperations().size()); + + tpu_cluster.walk([&](Operation* op) { tpu_cluster_ops.emplace_back(op); }); + + // In order to cluster ops feeding results to the same operation, traverse + // the ops in reverse order. + for (Operation* op : llvm::reverse(tpu_cluster_ops)) { // Try to add the op to existing clusters. bool added = false; for (auto& cluster : clusters) - if ((added = cluster.AddOp(&op))) break; + if ((added = cluster.AddOp(op, side_effect_analysis))) break; // If the op cannot be added to existing clusters, create a new cluster. if (!added) { OutsideCompiledCluster new_cluster(cluster_counter++); - new_cluster.AddOp(&op); + new_cluster.AddOp(op, side_effect_analysis); clusters.push_back(new_cluster); } } }); + for (auto& cluster : clusters) { + bool variants_to_add = true; + while (variants_to_add) variants_to_add = cluster.AddVariantInputsOutputs(); + } } } // anonymous namespace -std::unique_ptr> +std::unique_ptr> CreateTPUOutsideCompilationClusterPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc new file mode 100644 index 00000000000..45773a128fd --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_parallel_execute_sink_resource_write.cc @@ -0,0 +1,166 @@ +/* Copyright 2020 The TensorFlow Authors. 
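The reworked clustering pass visits the TPU cluster's ops in reverse order, adds an outside-compiled op to an existing cluster only when one of its users is already in that cluster (a data dependency), and closes a cluster once it crosses a non-extractable op with side-effecting successors so that no cyclic dependency can be created. A simplified standalone sketch (it only creates clusters for outside-compiled ops, a minor deviation from the pass):

```cpp
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

struct ToyOp {
  std::string name;
  bool outside_compiled;            // has _xla_outside_compilation
  bool has_side_effect_successors;  // per side-effect analysis
  std::vector<std::string> users;   // ops using this op's results
};

struct Cluster {
  std::unordered_set<std::string> ops;
  bool closed = false;

  bool AddOp(const ToyOp& op) {
    if (closed) return false;
    if (!op.outside_compiled) {
      // A side-effecting op we cannot extract closes the cluster, so later
      // additions cannot create a cyclic dependency around it.
      if (op.has_side_effect_successors) closed = true;
      return false;
    }
    if (ops.empty()) { ops.insert(op.name); return true; }
    for (const std::string& user : op.users)
      if (ops.count(user)) { ops.insert(op.name); return true; }
    return false;  // No data dependency with this cluster.
  }
};

int main() {
  // Ops listed in IR order; clustering visits them in reverse.
  std::vector<ToyOp> tpu_cluster_ops = {
      {"a", true, false, {"b"}},
      {"barrier", false, true, {}},
      {"b", true, false, {"c"}},
      {"c", true, false, {}},
  };

  std::vector<Cluster> clusters;
  for (auto it = tpu_cluster_ops.rbegin(); it != tpu_cluster_ops.rend(); ++it) {
    bool added = false;
    for (Cluster& cluster : clusters)
      if ((added = cluster.AddOp(*it))) break;
    if (!added && it->outside_compiled) {
      Cluster new_cluster;
      new_cluster.AddOp(*it);
      clusters.push_back(new_cluster);
    }
  }
  std::cout << clusters.size() << " clusters\n";  // 2: {c, b} and {a}
  return 0;
}
```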
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +// A pass that moves `tf.AssignVariableOp` into a `tf_device.parallel_execute` +// region if the `tf.AssignVariableOp` is the only consumer of a +// `tf_device.parallel_execute` result. This will allow +// TPUMergeVariablesWithExecute to merge resource writes without special +// handling for `tf_device.parallel_execute`. +struct TPUParallelExecuteSinkResourceWrite + : public PassWrapper { + void runOnFunction() override; +}; + +// Finds an AssignVariableOp that can be moved into the parallel_execute region. +// These AssignVariableOps must be the only consumer of the respective +// parallel_execute result, and the resource handle producer must be from an op +// before or above the parallel_execute. +TF::AssignVariableOp GetSingleUseResourceWrite( + tf_device::ParallelExecuteOp parallel_execute, Value result) { + if (!result.hasOneUse()) return nullptr; + + OpOperand& use = *result.getUses().begin(); + auto assign_var = dyn_cast(use.getOwner()); + if (!assign_var) return nullptr; + + if (use.get() != assign_var.value()) return nullptr; + + auto* resource_handle_op = assign_var.resource().getDefiningOp(); + if (resource_handle_op == parallel_execute) return nullptr; + + if (resource_handle_op && + resource_handle_op->getBlock() == + parallel_execute.getOperation()->getBlock() && + parallel_execute.getOperation()->isBeforeInBlock(resource_handle_op)) + return nullptr; + + return assign_var; +} + +// Finds AssignVariableOps that can be moved into a parallel_execute region and +// moves them. Leftover parallel_execute results that were used by the +// such AssignVariableOp are also pruned. +void SinkResourceWritesIntoParallelExecute( + tf_device::ParallelExecuteOp parallel_execute) { + bool rewrite = false; + const int num_regions = parallel_execute.getNumRegions(); + llvm::SmallVector results_to_remap; + + // Go through each region and find AssignVariableOps that can be moved into + // the parallel_execute region. Result indices by region index are collected, + // so they can be removed afterwards. 
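GetSingleUseResourceWrite accepts a tf.AssignVariableOp only when it is the sole user of the parallel_execute result, the result feeds the op's value operand rather than its resource operand, and the resource handle is produced before the parallel_execute, so sinking the write into the region cannot place it ahead of its handle. A predicate sketch over toy block positions (the real check compares blocks and uses isBeforeInBlock, and also accepts handles with no defining op, such as function arguments):

```cpp
#include <iostream>

// Toy positions in a block; smaller means earlier.
struct ToyResult {
  int num_uses;
  bool use_is_assign_value_operand;  // result feeds AssignVariableOp's value
  int resource_def_position;         // where the resource handle is defined
};

// Decides whether the write can be sunk into the parallel_execute region.
bool CanSinkResourceWrite(const ToyResult& result,
                          int parallel_execute_position) {
  if (result.num_uses != 1) return false;
  if (!result.use_is_assign_value_operand) return false;
  // The resource handle must be defined before the parallel_execute;
  // otherwise the moved write would use a value defined after it.
  return result.resource_def_position < parallel_execute_position;
}

int main() {
  ToyResult ok{1, true, /*resource_def_position=*/2};
  ToyResult bad{1, true, /*resource_def_position=*/7};
  std::cout << CanSinkResourceWrite(ok, /*parallel_execute_position=*/5)   // 1
            << CanSinkResourceWrite(bad, /*parallel_execute_position=*/5)  // 0
            << "\n";
  return 0;
}
```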
+ llvm::SmallVector, 4> results_to_remove_by_region; + results_to_remove_by_region.resize(num_regions); + for (int i = 0; i < num_regions; ++i) { + Block& block = parallel_execute.GetRegionBlockWithIndex(i); + auto results = parallel_execute.GetRegionOutputs(i); + auto& results_to_remove = results_to_remove_by_region[i]; + results_to_remove.reserve(results.size()); + Operation* terminator = block.getTerminator(); + for (auto result : llvm::enumerate(results)) { + TF::AssignVariableOp assign_var = + GetSingleUseResourceWrite(parallel_execute, result.value()); + if (!assign_var) { + results_to_remap.push_back(result.value()); + continue; + } + + // Move AssignVariableOp and update the value to be written to the + // resource variable to be the non forwarded value from within the + // parallel_execute region. + assign_var.getOperation()->moveBefore(terminator); + assign_var.valueMutable().assign(terminator->getOperand(result.index())); + results_to_remove.push_back(result.index()); + } + + rewrite |= !results_to_remove.empty(); + } + + if (!rewrite) return; + + // Remove leftover unused results (terminator operands) from moving + // AssignVariabeOps into the parallel_execute region. + for (auto results_to_remove : llvm::enumerate(results_to_remove_by_region)) { + Block& block = + parallel_execute.GetRegionBlockWithIndex(results_to_remove.index()); + Operation* terminator = block.getTerminator(); + for (int index_to_remove : llvm::reverse(results_to_remove.value())) + terminator->eraseOperand(index_to_remove); + } + + // Replace old parallel_execute with new parallel_execute by moving the + // regions to a new parallel_execute and remapping the results. + llvm::SmallVector new_result_types; + new_result_types.reserve(results_to_remap.size()); + for (Value old_result : results_to_remap) + new_result_types.push_back(old_result.getType()); + + OpBuilder builder(parallel_execute); + auto new_parallel_execute = builder.create( + parallel_execute.getLoc(), num_regions, new_result_types); + + for (auto region : llvm::zip(new_parallel_execute.getRegions(), + parallel_execute.getRegions())) + std::get<0>(region)->takeBody(*std::get<1>(region)); + + for (auto result : + llvm::zip(results_to_remap, new_parallel_execute.getResults())) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); + + parallel_execute.erase(); +} + +void TPUParallelExecuteSinkResourceWrite::runOnFunction() { + llvm::SmallVector parallel_executes; + getFunction().walk([&](tf_device::ParallelExecuteOp parallel_execute) { + parallel_executes.push_back(parallel_execute); + }); + + for (tf_device::ParallelExecuteOp parallel_execute : parallel_executes) + SinkResourceWritesIntoParallelExecute(parallel_execute); +} + +} // anonymous namespace + +std::unique_ptr> +CreateTPUParallelExecuteSinkResourceWritePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-parallel-execute-sink-resource-write", + "Moves tf.AssignVariableOp consumers of tf_device.parallel_execute into " + "tf_device.parallel_execute regions"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc new file mode 100644 index 00000000000..cccd528da1d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_resource_read_for_write.cc @@ -0,0 +1,140 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TFTPU { + +// A pass that finds TPU clusters with write only resource access and adds an +// associated resource read, so the resource can later be fused into TPUExecute. +namespace { +struct TPUResourceReadForWrite + : public PassWrapper> { + void runOnOperation() override; +}; + +// Helper struct holding a resource value and its associated type. +struct ResourceValueAndSubtype { + Value resource; + Type subtype; +}; + +// Finds resource handle and type for result if result writes to a resource. +ResourceValueAndSubtype GetResourceWriteResult( + tf_device::ClusterFuncOp cluster_func, Value result) { + ResourceValueAndSubtype resource; + if (!result.hasOneUse()) return resource; + Operation* result_user = *result.getUsers().begin(); + auto assign_var = dyn_cast(result_user); + if (!assign_var) return resource; + + auto handle = assign_var.resource(); + // Skip result if cluster writes to the same variable via multiple results. + for (Operation* handle_user : handle.getUsers()) { + if (handle_user == assign_var) continue; + auto assign_var_user = dyn_cast(handle_user); + if (!assign_var_user) continue; + if (assign_var_user.value().getDefiningOp() == cluster_func) + return resource; + } + + resource.resource = assign_var.resource(); + resource.subtype = assign_var.value().getType(); + return resource; +} + +// Checks if resource is read by TPU cluster. +bool ClusterFuncHasResourceRead(tf_device::ClusterFuncOp cluster_func, + Value resource) { + for (Operation* resource_user : resource.getUsers()) + if (auto read = dyn_cast(resource_user)) + for (Operation* read_user : read.value().getUsers()) + if (read_user == cluster_func) return true; + + return false; +} + +void TPUResourceReadForWrite::runOnOperation() { + SmallVector cluster_funcs; + getOperation().walk([&](tf_device::ClusterFuncOp cluster_func) { + cluster_funcs.push_back(cluster_func); + }); + + OpBuilder builder(&getContext()); + // Add resource reads for resource writes from TPU cluster where for such + // resources the TPU cluster does not read from. + for (tf_device::ClusterFuncOp cluster_func : cluster_funcs) { + builder.setInsertionPoint(cluster_func); + + SmallVector read_operands; + for (Value result : cluster_func.getResults()) { + // TODO(lyandy): Update pass to use resource alias analysis. 
+ auto resource_and_type = GetResourceWriteResult(cluster_func, result); + if (!resource_and_type.resource) continue; + if (ClusterFuncHasResourceRead(cluster_func, resource_and_type.resource)) + continue; + auto new_read = builder.create( + resource_and_type.resource.getLoc(), resource_and_type.subtype, + resource_and_type.resource); + read_operands.push_back(new_read.value()); + } + + if (read_operands.empty()) continue; + + // Update caller and function types with new read operands. + auto operands = llvm::to_vector<4>(cluster_func.getOperands()); + operands.append(read_operands.begin(), read_operands.end()); + + auto new_cluster_func = builder.create( + cluster_func.getLoc(), cluster_func.getResultTypes(), operands, + cluster_func.getAttrs()); + cluster_func.replaceAllUsesWith(new_cluster_func); + FuncOp func = cluster_func.getFunc(); + Block& block = func.front(); + for (Value read_operand : read_operands) + block.addArgument(read_operand.getType()); + + func.setType(FunctionType::get(block.getArgumentTypes(), + func.getCallableResults(), &getContext())); + cluster_func.erase(); + } +} + +} // namespace + +std::unique_ptr> CreateTPUResourceReadForWritePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-resource-read-for-write", + "Inserts tf.ReadVariableOp inputs to a TPU cluster for resource writes " + "with no reads"); + +} // namespace TFTPU +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 21ad457a7a6..86aeec81150 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -25,7 +25,6 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project @@ -42,6 +41,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include "tensorflow/compiler/xla/xla.pb.h" @@ -154,11 +154,8 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, symbol_table.insert(clone); } - // Serialize module and return. 
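The read-for-write pass adds a tf.ReadVariableOp operand for every resource that the cluster writes (through a single-use AssignVariableOp) but never reads, so TPUMergeVariablesWithExecute can later fuse the variable access. A small decision sketch over toy resource sets:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct ToyCluster {
  std::set<std::string> resources_written;  // via single-use AssignVariableOp
  std::set<std::string> resources_read;     // via ReadVariableOp feeding cluster
};

// Returns the resources for which a read should be inserted and appended as a
// new cluster operand.
std::vector<std::string> ResourcesNeedingRead(const ToyCluster& cluster) {
  std::vector<std::string> result;
  for (const std::string& resource : cluster.resources_written)
    if (!cluster.resources_read.count(resource)) result.push_back(resource);
  return result;
}

int main() {
  ToyCluster cluster{{"var_a", "var_b"}, {"var_b"}};
  for (const auto& r : ResourcesNeedingRead(cluster))
    std::cout << "insert ReadVariableOp for " << r << "\n";  // var_a only
  return 0;
}
```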
- { - llvm::raw_string_ostream os(*serialized_func_module); - module_for_func.get().print(os); - } + *serialized_func_module = + tensorflow::SerializeMlirModule(module_for_func.get()); return success(); } @@ -647,7 +644,7 @@ LogicalResult Rewrite( int num_replicas = 1; tf_device::ReplicateOp replicate = cluster_func.getParentOfType(); - if (replicate) num_replicas = replicate.n().getLimitedValue(); + if (replicate) num_replicas = replicate.n(); auto num_cores_per_replica_attr = cluster_func.getAttrOfType( tensorflow::kNumCoresPerReplicaAttr); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc index 0b9eaba8c97..35ad3d21b30 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" @@ -47,133 +48,47 @@ struct TPUShardingIdentificationPass void runOnOperation() override; }; -// Sets `sharding_op` if `op` is XlaShardingOp or if XlaSharding op is adjacent -// to `op`. XlaSharding op may be direct user of inputs but it may also be -// followed by an Identity op and, in the case where bfloat16 type is used, Cast -// op may be added right after the input. As so, parse the users of the -// operation to access connected XlaSharding op. +// Finds XlaSharding op connected to an argument value. If value is a resource +// type then XlaSharding op will be connected to a ReadVariable op. XlaSharding +// op may be direct user of inputs but it may also be followed by an Identity op +// and, in the case where bfloat16 type is used, Cast op may be added right +// after the input. // +// TODO(hongjunchoi): Add logic to parse XlaSharding op inside control flow (If, +// Case, While) ops and Caller return values. // TODO(hongjunchoi): Consider explicitly checking op patterns to detect sharded // inputs. -void GetAdjacentXlaShardingOp(Operation* op, - llvm::Optional* sharding_op) { - // TODO(hongjunchoi): Detect the case when sharding configuration is ambiguous - // for a single input (i.e. multiple different XlaSharding ops with different - // configuration policies are connected). 
- if (sharding_op->hasValue()) return; +llvm::Optional GetXlaShardingFromArg(const Value& value) { + llvm::SmallPtrSet visited_values; + llvm::SmallVector values_to_visit{value}; + while (!values_to_visit.empty()) { + llvm::SmallVector next_values_to_visit; + for (Value value_to_visit : values_to_visit) { + if (!visited_values.insert(value_to_visit).second) continue; - if (auto sharding = llvm::dyn_cast(op)) { - sharding_op->emplace(sharding); - return; - } + for (auto& use : value_to_visit.getUses()) { + Operation* owner = use.getOwner(); + if (auto sharding = llvm::dyn_cast(owner)) + return sharding._XlaSharding(); - if (llvm::isa(op)) { - for (auto user : op->getUsers()) - GetAdjacentXlaShardingOp(user, sharding_op); - } -} + if (llvm::isa(owner)) { + next_values_to_visit.push_back(use.getOwner()->getResult(0)); + continue; + } -// Parses XlaSharding op connected to input args. If Input to -// tf_device.ClusterFunc op is of resource type, then XlaSharding op will be -// connected to following ReadVariable op. -// -// TODO(hongjunchoi): Add logic to parse XlaSharding op inside a Call op or -// If/While op. -llvm::Optional ParseInputSharding(const Value& arg) { - llvm::Optional parsed_sharding_op; - for (auto user : arg.getUsers()) { - if (parsed_sharding_op) continue; - - GetAdjacentXlaShardingOp(user, &parsed_sharding_op); - if (parsed_sharding_op) continue; - - if (llvm::isa(user)) - for (auto read_variable_user : user->getUsers()) - GetAdjacentXlaShardingOp(read_variable_user, &parsed_sharding_op); - } - - if (!parsed_sharding_op) return llvm::Optional(); - return parsed_sharding_op.getValue()._XlaSharding(); -} - -// Returns the provided sharding configuration if operand of return value of -// tf_device.ClusterFunc op is directly from XlaSharding op, -llvm::Optional ParseReturnValueSharding(FuncOp func, - const int output_index, - const OpOperand& operand) { - if (auto sharding_op = llvm::dyn_cast_or_null( - operand.get().getDefiningOp())) - return sharding_op._XlaSharding(); - - return llvm::Optional(); -} - -// Includes information on Func op and argument index of the input value. This -// is used to trace Value that is fed into function call ops. -struct FunctionAndArgumentInfo { - FuncOp func; - int argument_index; -}; - -// Adds tf.PartitionedCall op or tf.StatefulPartitionedCall op to `list`. If -// `op` is a function call op, then find the func op from provided `module` and -// add the func op with `arg_index` to `list`. `list` will later be used to -// trace mlir::Value that is fed into (potentially nested) function call ops. -void AddFunctionalOpsToList( - const int arg_index, ModuleOp module, Operation* op, - llvm::SmallVectorImpl* list) { - if (auto pcall_op = llvm::dyn_cast(op)) { - if (!pcall_op.f().isa()) return; - - auto pcall_func = llvm::cast( - module.lookupSymbol(pcall_op.f().getRootReference())); - assert(pcall_func); - list->emplace_back(FunctionAndArgumentInfo{pcall_func, arg_index}); - - } else if (auto spcall_op = - llvm::dyn_cast(op)) { - auto sp_call_func = llvm::cast(module.lookupSymbol(spcall_op.f())); - assert(sp_call_func); - list->emplace_back(FunctionAndArgumentInfo{sp_call_func, arg_index}); - } -} - -// Walks the MLIR graph from `arg` and return a list of all function call ops to -// which the `arg` op is directly connected. -// -// For example: -// argument0 -> PartitionedCallOp -> StatefulPartitionedCallOp -> AddOp -// -// For above case, PartitionedCall op and StatefulPartitionedCallOp will be -// returned. 
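GetXlaShardingFromArg replaces the old per-function traversal with a forward breadth-first search over uses: it follows Identity, Cast, and ReadVariableOp results, steps into callee arguments at call sites, and stops at the first XlaSharding op. The worklist sketch below models that search over a toy use graph; value names and the sharding string are illustrative placeholders.

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <string>
#include <vector>

enum class Kind { kSharding, kPassThrough /*Identity, Cast, ReadVariable*/, kCall, kOther };

struct ToyUse {
  Kind kind;
  std::string sharding;                  // set when kind == kSharding
  std::vector<std::string> next_values;  // values to keep following
};

// value name -> its uses
using UseGraph = std::map<std::string, std::vector<ToyUse>>;

std::optional<std::string> GetShardingFromArg(const UseGraph& graph,
                                              const std::string& arg) {
  std::set<std::string> visited;
  std::vector<std::string> worklist{arg};
  while (!worklist.empty()) {
    std::vector<std::string> next;
    for (const std::string& value : worklist) {
      if (!visited.insert(value).second) continue;
      auto it = graph.find(value);
      if (it == graph.end()) continue;
      for (const ToyUse& use : it->second) {
        if (use.kind == Kind::kSharding) return use.sharding;
        if (use.kind == Kind::kPassThrough || use.kind == Kind::kCall)
          next.insert(next.end(), use.next_values.begin(),
                      use.next_values.end());
        // kOther: stop following this edge.
      }
    }
    worklist.swap(next);
  }
  return std::nullopt;
}

int main() {
  // arg0 -> ReadVariableOp -> %read -> (call into callee arg) -> XlaSharding
  UseGraph graph = {
      {"arg0", {{Kind::kPassThrough, "", {"%read"}}}},
      {"%read", {{Kind::kCall, "", {"%callee_arg"}}}},
      {"%callee_arg", {{Kind::kSharding, "sharding_proto_bytes", {}}}},
  };
  auto sharding = GetShardingFromArg(graph, "arg0");
  std::cout << (sharding ? *sharding : std::string("<none>")) << "\n";
  return 0;
}
```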
-llvm::SmallVector ExtractFunctionsConnectedToArg( - BlockArgument arg, ModuleOp module) { - llvm::SmallVector functions_connected_to_arg; - for (auto& arg_use : arg.getUses()) - AddFunctionalOpsToList(arg_use.getOperandNumber(), module, - arg_use.getOwner(), &functions_connected_to_arg); - - llvm::SmallVector functions_to_parse{ - functions_connected_to_arg.begin(), functions_connected_to_arg.end()}; - - while (!functions_to_parse.empty()) { - llvm::SmallVector newly_discovered_functions; - for (auto function_info : functions_to_parse) { - Block& func_entry_block = function_info.func.front(); - auto argument = - func_entry_block.getArgument(function_info.argument_index); - - for (auto& arg_use : argument.getUses()) - AddFunctionalOpsToList(arg_use.getOperandNumber(), module, - arg_use.getOwner(), &newly_discovered_functions); + if (auto call_op = llvm::dyn_cast(owner)) { + FuncOp func = llvm::dyn_cast(call_op.resolveCallable()); + if (!func) continue; + next_values_to_visit.push_back( + func.getArgument(use.getOperandNumber())); + } + } } - functions_connected_to_arg.append(newly_discovered_functions.begin(), - newly_discovered_functions.end()); - std::swap(functions_to_parse, newly_discovered_functions); + values_to_visit.swap(next_values_to_visit); } - return functions_connected_to_arg; + return llvm::None; } // Walks the graph from the arguments of the `cluster_func_op` and extracts @@ -186,7 +101,6 @@ void IdentifyXlaShardingForComputationInputs( FuncOp cluster_function, Builder* builder) { // Look up function definition from module. Block& cluster_function_block = cluster_function.front(); - ModuleOp module = cluster_func_op.getParentOfType(); llvm::SmallVector sharding_for_args( cluster_function_block.getNumArguments(), logical_core_0_sharding); @@ -202,31 +116,17 @@ void IdentifyXlaShardingForComputationInputs( // Sharding configurations are added to the tf_device.ClusterFunc as an // attribute and the function as an argument attribute. for (auto& arg : cluster_function_block.getArguments()) { - auto arg_sharding = ParseInputSharding(arg); - const int arg_index_to_tpu_computation = arg.getArgNumber(); - - if (!arg_sharding.hasValue()) { - auto connected_functions_to_arg = - ExtractFunctionsConnectedToArg(arg, module); - for (auto& function_arg_info : connected_functions_to_arg) { - if (arg_sharding.hasValue()) break; - - const int function_argument_index = function_arg_info.argument_index; - auto& parsed_function = function_arg_info.func; - Block& parsed_function_block = parsed_function.front(); - arg_sharding = ParseInputSharding( - parsed_function_block.getArgument(function_argument_index)); - } - } + auto arg_sharding = GetXlaShardingFromArg(arg); + const int index = arg.getArgNumber(); if (arg_sharding) { - sharding_for_args[arg_index_to_tpu_computation] = arg_sharding.getValue(); + sharding_for_args[index] = arg_sharding.getValue(); cluster_function.setArgAttr( - arg_index_to_tpu_computation, kShardingAttr, + index, kShardingAttr, builder->getStringAttr(arg_sharding.getValue())); } else { cluster_function.setArgAttr( - arg_index_to_tpu_computation, kShardingAttr, + index, kShardingAttr, builder->getStringAttr(logical_core_0_sharding)); } } @@ -235,6 +135,44 @@ void IdentifyXlaShardingForComputationInputs( builder->getStrArrayAttr(sharding_for_args)); } +// Finds XlaSharding op connected to a result value. 
XlaSharding op may be +// direct user of inputs but it may also be followed by an Identity op and, in +// the case where bfloat16 type is used, Cast op may be added right after the +// input. +// +// TODO(hongjunchoi): Add logic to parse XlaSharding op inside control flow (If, +// Case, While) ops and Caller argument values. +// TODO(hongjunchoi): Consider explicitly checking op patterns to detect sharded +// inputs. +llvm::Optional GetXlaShardingFromRetval(const Value& value) { + llvm::SmallPtrSet visited_values; + Value value_to_visit = value; + while (value_to_visit) { + if (!visited_values.insert(value_to_visit).second) return llvm::None; + + Operation* def = value_to_visit.getDefiningOp(); + if (auto sharding = llvm::dyn_cast_or_null(def)) + return sharding._XlaSharding(); + + if (llvm::isa_and_nonnull(def)) { + value_to_visit = def->getOperand(0); + continue; + } + + if (auto call_op = llvm::dyn_cast_or_null(def)) { + FuncOp func = llvm::dyn_cast(call_op.resolveCallable()); + if (!func) continue; + value_to_visit = func.front().getTerminator()->getOperand( + value_to_visit.cast().getResultNumber()); + continue; + } + + break; + } + + return llvm::None; +} + // Parses XlaSharding op directly connected from the outputs of the // `cluster_func` and extract sharding configurations for outputs. void IdentifyXlaShardingForComputationOutputs( @@ -252,8 +190,8 @@ void IdentifyXlaShardingForComputationOutputs( // tf_device.ClusterFunc as an attribute and the function as a result // attribute. for (auto& ret : terminator->getOpOperands()) { + auto ret_sharding = GetXlaShardingFromRetval(ret.get()); const int index = ret.getOperandNumber(); - auto ret_sharding = ParseReturnValueSharding(func, index, ret); if (ret_sharding) { sharding_for_rets[index] = ret_sharding.getValue(); @@ -264,6 +202,7 @@ void IdentifyXlaShardingForComputationOutputs( builder->getStringAttr(logical_core_0_sharding)); } } + cluster_func.setAttr(tensorflow::kOutputShardingAttr, builder->getStrArrayAttr(sharding_for_rets)); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc index 2f1db0899f7..ed4c411aae8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc @@ -115,9 +115,8 @@ struct TPUSpaceToDepthPass // Updates func argument type to have the updated input shape. void UpdateFuncType(FuncOp func) { - auto arg_types = llvm::to_vector<8>(func.front().getArgumentTypes()); - auto result_types = - llvm::to_vector<4>(func.front().getTerminator()->getOperandTypes()); + auto arg_types = func.front().getArgumentTypes(); + auto result_types = func.front().getTerminator()->getOperandTypes(); func.setType(FunctionType::get(arg_types, result_types, func.getContext())); } @@ -432,9 +431,8 @@ TF::SpaceToDepthOp BuildSpaceToDepth(tf_device::ClusterFuncOp cluster_func, input_shape[3] * block_size * block_size}; auto transform_result_type = RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); - return builder.create(cluster_func.getLoc(), - transform_result_type, input, - APInt(64, block_size)); + return builder.create( + cluster_func.getLoc(), transform_result_type, input, block_size); } // Performs transformation for a non-replicated input. 
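GetXlaShardingFromRetval is the mirror image: a backward walk from a returned value through Identity/Cast defining ops and, at call sites, into the callee terminator's corresponding operand, with a visited set as a cycle guard. A backward-walk sketch over a toy def graph (again with placeholder names):

```cpp
#include <iostream>
#include <map>
#include <optional>
#include <set>
#include <string>

enum class DefKind { kSharding, kPassThrough /*Identity, Cast*/, kCall, kOther };

struct ToyDef {
  DefKind kind;
  std::string sharding;    // set when kind == kSharding
  std::string prev_value;  // value to keep walking from (producer side)
};

// value name -> its defining op
using DefGraph = std::map<std::string, ToyDef>;

std::optional<std::string> GetShardingFromRetval(const DefGraph& graph,
                                                 std::string value) {
  std::set<std::string> visited;
  while (!value.empty()) {
    if (!visited.insert(value).second) return std::nullopt;  // cycle guard
    auto it = graph.find(value);
    if (it == graph.end()) return std::nullopt;
    const ToyDef& def = it->second;
    switch (def.kind) {
      case DefKind::kSharding:
        return def.sharding;
      case DefKind::kPassThrough:
      case DefKind::kCall:
        value = def.prev_value;  // keep walking toward the producer
        break;
      default:
        return std::nullopt;
    }
  }
  return std::nullopt;
}

int main() {
  // retval <- Identity <- call result <- callee return operand (XlaSharding)
  DefGraph graph = {
      {"%ret", {DefKind::kPassThrough, "", "%call_result"}},
      {"%call_result", {DefKind::kCall, "", "%callee_ret"}},
      {"%callee_ret", {DefKind::kSharding, "sharding_proto_bytes", ""}},
  };
  auto sharding = GetShardingFromRetval(graph, "%ret");
  std::cout << (sharding ? *sharding : std::string("<none>")) << "\n";
  return 0;
}
```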
@@ -458,7 +456,7 @@ bool HandleHostReplicatedInputs(int64_t index, int64_t replicate_arg_index = block_arg.getArgNumber(); // We need to know the devices to copy to. if (!replicate.devices()) return false; - int64_t num_replicas = replicate.n().getZExtValue(); + int64_t num_replicas = replicate.n(); // Gets inputs at replicate_arg_index for each replica. auto inputs = replicate.getOperands() .drop_front(replicate_arg_index * num_replicas) @@ -669,7 +667,6 @@ void TPUSpaceToDepthPass::runOnOperation() { if (!device_func) return; TF::Conv2DOp first_conv; - Optional> input_shape; // A map maps block argument id to the convolutions consumes them. llvm::SmallDenseMap> argnum_and_convolutions; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 3262b83fc94..6e5b07526d1 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -138,14 +138,17 @@ Value SkipIdentity(Value v, bool allow_other_use, // Finds the formattable arguments of `execute` and annotates the metadata of // `compile` to record these arguments. In addition, it returns a mapping from -// the formattable arguments of `execute` to the corresponding arguments of -// `while_op` (which should be passed through to `execute` via `replicate`). The +// the formattable arguments of `execute` to the corresponding operand of +// `replicate`. The // entries in the mapping are sorted in the order of operands of `execute`. llvm::SmallVector>, 4> AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( - TF::WhileOp while_op, tf_device::ReplicateOp replicate, + TF::WhileRegionOp while_op, tf_device::ReplicateOp replicate, TF::TPUExecuteAndUpdateVariablesOp execute, - tf_device::LaunchOp compile_launch, FuncOp body, FuncOp cond) { + tf_device::LaunchOp compile_launch) { + Region& body = while_op.body(); + Region& cond = while_op.cond(); + llvm::SmallVector>, 4> mapping; auto mirrored_variable_indices_attr = replicate.getAttrOfType(kMirroredVariableIndicesAttr); @@ -174,7 +177,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( assert(metadata_str && "Missing compilation metadata"); tensorflow::tpu::TPUCompileMetadataProto metadata; metadata.ParseFromString(std::string(metadata_str.getValue())); - int64_t num_replicas = replicate.n().getLimitedValue(); + int64_t num_replicas = replicate.n(); // Find the formattable operands of `execute`, which must be mirrored // variables (arguments of `replicate`), and must be pass-throughs from while // operands. @@ -204,39 +207,43 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( // We have found a mirrored variable which is an input to the replicated // `execute`. Now find if this mirrored variable is a pass-through of while // arguments. 
- llvm::SmallVector while_args; + llvm::SmallVector replicate_args; for (int64_t i = 0; i < num_inputs; ++i) { llvm::SmallPtrSet skipped_identities; auto replicate_operand = SkipIdentity( replicate.GetReplicaOperandForBlockArgument(block_arg, i), /*allow_other_use=*/false, &skipped_identities); - auto block_arg = replicate_operand.dyn_cast(); - // To qualify for a valid pass-through mirrored variable, it must satisfy - // 1) it is the body's argument; - // 2) it has no other uses than `replicate`, the skipped identitiy ops, - // or the return; - // 3) the corresponding argument in the cond function has no uses. - if (!block_arg || block_arg.getOwner() != &body.front() || - llvm::any_of(replicate_operand.getUsers(), - [&](Operation* user) { - return user != body.front().getTerminator() && - skipped_identities.count(user) == 0 && - user != replicate; - }) || - !cond.getArgument(block_arg.getArgNumber()).use_empty()) { - while_args.clear(); + // For region based control flow, the resource operand for the replicate + // should be a region capture. If this has any use other than the + // replicate op (within the body of the while) or the skipped identities, + // then do not apply the transformation to this variable. + bool is_region_capture = + replicate_operand.getParentRegion()->isProperAncestor(&body); + bool has_other_use_in_body = + llvm::any_of(replicate_operand.getUsers(), [&](Operation* user) { + // Ignore uses that are not in the while body or condition. + if (!body.isAncestor(user->getParentRegion()) && + !cond.isAncestor(user->getParentRegion())) + return false; + // Within the body or cond, only uses in replicate and the skipped + // identities is allowed. + return user != replicate && skipped_identities.count(user) == 0; + }); + + if (!is_region_capture || has_other_use_in_body) { + replicate_args.clear(); break; } - while_args.push_back(while_op.getOperand(block_arg.getArgNumber())); + replicate_args.push_back(replicate_operand); } - if (while_args.empty()) continue; + if (replicate_args.empty()) continue; // Now set the enable_xla_sharding field in the metadata to inform the // compile op. auto metadata_arg = metadata.mutable_args(it->second); metadata_arg->set_enable_xla_sharding( ::tensorflow::tpu::TPUCompileMetadataProto_Arg::ALLOWED); - mapping.emplace_back(it->second, std::move(while_args)); + mapping.emplace_back(it->second, std::move(replicate_args)); } // Sort the mapping according to execute operand order. llvm::sort(mapping, llvm::less_first()); @@ -261,10 +268,11 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( // Adds a new replicated input to the replicate op. 
tf_device::ReplicateOp AddInputsToReplicateOp( - tf_device::ReplicateOp replicate, ArrayRef new_inputs, + tf_device::ReplicateOp replicate, + MutableArrayRef new_inputs, const llvm::SmallDenseMap>& devices) { - int64_t num_replicas = replicate.n().getLimitedValue(); + int64_t num_replicas = replicate.n(); assert(new_inputs.size() == num_replicas); // As model parallelism is not yet supported, we assume that all ops are @@ -275,8 +283,7 @@ tf_device::ReplicateOp AddInputsToReplicateOp( ->getSecond() .size() == num_replicas); - llvm::SmallVector, Type>, 8> - new_replicated_inputs; + llvm::SmallVector, 8> new_replicated_inputs; llvm::SmallVector new_packed_inputs; llvm::SmallVector, 8> replicated_inputs; replicated_inputs.reserve(replicate.GetNumReplicatedBlockArguments()); @@ -293,13 +300,16 @@ tf_device::ReplicateOp AddInputsToReplicateOp( new_packed_inputs.emplace_back( replicate.GetReplicaOperandForBlockArgument(arg, /*replica=*/0)); } - new_replicated_inputs.emplace_back(new_inputs, new_inputs.front().getType()); + SmallVector new_input_values; + new_input_values.reserve(new_inputs.size()); + for (auto var : new_inputs) new_input_values.push_back(var.resource()); + new_replicated_inputs.emplace_back(new_input_values, + new_input_values.front().getType()); OpBuilder builder(replicate); auto new_replicate = builder.create( replicate.getLoc(), num_replicas, devices, new_replicated_inputs, new_packed_inputs, - llvm::to_vector<8>( - replicate.GetBody().getTerminator()->getOperandTypes())); + replicate.GetBody().getTerminator()->getOperandTypes()); for (auto arg : replicate.GetBody().getArguments()) { if (replicate.IsReplicatedBlockArgument(arg)) { arg.replaceAllUsesWith( @@ -319,58 +329,6 @@ tf_device::ReplicateOp AddInputsToReplicateOp( return new_replicate; } -// Adds the per-device state variables to the while-loop's inputs/outputs. -TF::WhileOp AddStateVarsToWhileOp(TF::WhileOp while_op, FuncOp body, - FuncOp cond, - ArrayRef state_vars) { - auto body_return = llvm::cast(body.front().back()); - auto new_body_return_vals = llvm::to_vector<4>(body_return.getOperands()); - auto new_while_operands = llvm::to_vector<4>(while_op.getOperands()); - auto append_types = [&](ArrayRef types) { - auto new_types = llvm::to_vector<4>(types); - for (auto state_var : state_vars) { - new_types.push_back(state_var.resource().getType()); - } - return new_types; - }; - for (auto state_var : state_vars) { - body.front().addArgument(state_var.resource().getType()); - cond.front().addArgument(state_var.resource().getType()); - auto inner_arg = body.getArgument(body.front().getNumArguments() - 1); - new_body_return_vals.push_back(inner_arg); - new_while_operands.push_back(state_var.resource()); - } - OpBuilder builder = OpBuilder::atBlockEnd(&body.front()); - // Update return values. 
- builder.create(body_return.getLoc(), new_body_return_vals); - body_return.erase(); - - body.setType(FunctionType::get(append_types(body.getType().getInputs()), - append_types(body.getType().getResults()), - body.getContext())); - cond.setType(FunctionType::get(append_types(cond.getType().getInputs()), - cond.getType().getResults(), - cond.getContext())); - for (int64_t i = 0, end = state_vars.size(); i < end; ++i) { - int64_t arg_index = body.getNumArguments() - state_vars.size() + i; - TF::VarHandleOp state_var = state_vars[i]; - auto device_attr = state_var.getAttr(kDeviceAttr); - if (device_attr) { - body.setArgAttr(arg_index, kFuncDeviceAttr, device_attr); - cond.setArgAttr(arg_index, kFuncDeviceAttr, device_attr); - } - } - builder.setInsertionPoint(while_op); - auto new_while_op = builder.create( - while_op.getLoc(), - append_types(llvm::to_vector<4>(while_op.getResultTypes())), - new_while_operands, while_op.getAttrs()); - while_op.replaceAllUsesWith( - new_while_op.getResults().take_front(while_op.getNumResults())); - while_op.erase(); - return new_while_op; -} - // Creates the per-device variables that represent the formatting state of each // device. llvm::SmallVector CreateStateVars( @@ -421,9 +379,9 @@ void WrapOpInLaunch(OpBuilder* builder, Location loc, Operation* op, } // Performs the transformation for a replicate op inside a while loop. -void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, - MLIRContext* context) { - int64_t num_replicas = replicate.n().getLimitedValue(); +void HandleReplicateOp(TF::WhileRegionOp while_op, + tf_device::ReplicateOp replicate) { + int64_t num_replicas = replicate.n(); if (num_replicas == 1) return; tf_device::LaunchOp execute_launch; for (auto execute_launch_op : @@ -452,13 +410,10 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, !llvm::isa(compile_launch.GetBody().front())) return; - FuncOp body = while_op.body_func(); - FuncOp cond = while_op.cond_func(); - // Analyze the formattable inputs. auto execute_arg_to_outer_args = AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( - while_op, replicate, execute, compile_launch, body, cond); + while_op, replicate, execute, compile_launch); if (execute_arg_to_outer_args.empty()) return; // Extract the replicated devices. @@ -489,16 +444,7 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, RankedTensorType::get({2}, TF::StringType::get(builder.getContext())); auto state_vars = CreateStateVars(devices, while_op.getLoc(), key_type, &builder); - while_op = AddStateVarsToWhileOp(while_op, body, cond, state_vars); - // Add the new while loop inputs to the replicate op inside the body. - int64_t new_while_operand_count = while_op.getNumOperands(); - llvm::SmallVector inner_state_vars; - for (int64_t i = new_while_operand_count - num_replicas; - i < new_while_operand_count; ++i) { - inner_state_vars.push_back(body.front().getArgument(i)); - } - - replicate = AddInputsToReplicateOp(replicate, inner_state_vars, devices); + replicate = AddInputsToReplicateOp(replicate, state_vars, devices); // Build the reformat according to the compilation. Build it inside // `replicate`. llvm::SmallVector reformat_operands; @@ -516,8 +462,7 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, // Build the replicated unformat op after the loop. First prepare building the // replicate op. 
- llvm::SmallVector, Type>, 8> - unformat_replicate_operands; + llvm::SmallVector, 8> unformat_replicate_operands; llvm::SmallVector unformat_packed_operands; for (const auto& entry : execute_arg_to_outer_args) { if (entry.second.size() > 1) { @@ -537,16 +482,17 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, // Build a constant default key to specify that the unformatting should // transform the variables to the original format. builder.setInsertionPointAfter(while_op); - tensorflow::Tensor default_key_tensor(tensorflow::DT_STRING, {2}); + tensorflow::Tensor default_key_tensor(tensorflow::DT_STRING, {3}); default_key_tensor.vec()(0) = kDefaultShardingValue; default_key_tensor.vec()(1) = kDefaultShardingValue; + default_key_tensor.vec()(2) = kDefaultShardingValue; auto default_state_key = builder.create( while_op.getLoc(), tensorflow::ConvertTensor(default_key_tensor, &builder).ValueOrDie()); // With all replicated inputs, now build the replicate op. auto unformat_replicate = builder.create( while_op.getLoc(), num_replicas, devices, unformat_replicate_operands, - unformat_packed_operands, ArrayRef{}); + unformat_packed_operands, TypeRange{}); // Then build the unformat op in the replicate op. builder.setInsertionPointToEnd(&unformat_replicate.GetBody()); llvm::SmallVector unformat_operands; @@ -575,10 +521,9 @@ void HandleReplicateOp(TF::WhileOp while_op, tf_device::ReplicateOp replicate, void TPUVariableRuntimeReformattingPass::runOnOperation() { auto module = getOperation(); - module.walk([&](TF::WhileOp while_op) { - auto body = llvm::cast(module.lookupSymbol(while_op.body())); + module.walk([&](TF::WhileRegionOp while_op) { tf_device::ReplicateOp replicate; - body.walk([&](tf_device::ReplicateOp replicate_op) { + while_op.body().walk([&](tf_device::ReplicateOp replicate_op) { if (replicate == nullptr) { replicate = replicate_op; return WalkResult::advance(); @@ -591,7 +536,7 @@ void TPUVariableRuntimeReformattingPass::runOnOperation() { // `tf_device.parallel_execute` op in the `tf_device.replicate` is present. 
if (replicate && replicate.GetBody().getOps().empty()) - HandleReplicateOp(while_op, replicate, &getContext()); + HandleReplicateOp(while_op, replicate); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index 0a69987deb0..e9cea13f550 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -43,6 +43,10 @@ namespace { class BreakUpIslands : public TF::PerFunctionAggregateAnalysisConsumerPass< BreakUpIslands, TF::SideEffectAnalysis> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnFunction(FuncOp func, const TF::SideEffectAnalysis::Info& side_effect_analysis); @@ -126,18 +130,15 @@ void PopulateEmptyIsland(tf_executor::IslandOp island) { OpBuilder builder(&island.GetBody(), island.GetBody().begin()); tf_executor::YieldOp yield = island.GetYield(); if (yield.getNumOperands() == 0) { - builder.create(island.getLoc(), llvm::ArrayRef{}, - llvm::ArrayRef{}, - llvm::ArrayRef{}); + builder.create(island.getLoc(), TypeRange{}, ValueRange{}); } else if (yield.getNumOperands() == 1) { Value operand = yield.getOperand(0); auto identity = builder.create(island.getLoc(), operand.getType(), operand); yield.setOperand(0, identity.output()); } else { - auto types = llvm::to_vector<4>(yield.getOperandTypes()); - auto identity_n = builder.create(island.getLoc(), types, - yield.getOperands()); + auto identity_n = builder.create( + island.getLoc(), yield.getOperandTypes(), yield.getOperands()); for (auto it : llvm::enumerate(identity_n.getResults())) yield.setOperand(it.index(), it.value()); } @@ -145,8 +146,8 @@ void PopulateEmptyIsland(tf_executor::IslandOp island) { // Helper that creates an island. If `sub_op` is not nullptr, it will be moved // to the island. Otherwise a NoOp will be added to the island. -tf_executor::IslandOp CreateIsland(ArrayRef result_types, - ArrayRef control_inputs, +tf_executor::IslandOp CreateIsland(TypeRange result_types, + ValueRange control_inputs, const tf_executor::ControlType& control_type, const Location& loc, Operation* sub_op, tf_executor::IslandOp original_island) { @@ -162,10 +163,8 @@ tf_executor::IslandOp CreateIsland(ArrayRef result_types, sub_op->moveBefore(block, block->begin()); island_builder.create(loc, sub_op->getResults()); } else { - island_builder.create( - island.getLoc(), llvm::ArrayRef{}, - llvm::ArrayRef{}, llvm::ArrayRef{}); - island_builder.create(loc, ArrayRef{}); + island_builder.create(island.getLoc(), TypeRange{}, ValueRange{}); + island_builder.create(loc, ValueRange{}); } return island; } @@ -278,8 +277,8 @@ void BreakUpIslands::BreakUpIsland( ? 
island_control_inputs : predecessor_controls; auto new_island = - CreateIsland(llvm::to_vector<4>(sub_op.getResultTypes()), control, - control_type, sub_op.getLoc(), &sub_op, island_op); + CreateIsland(sub_op.getResultTypes(), control, control_type, + sub_op.getLoc(), &sub_op, island_op); new_control_for_sub_ops[&sub_op] = new_island.control(); if (sources_and_sinks.sinks.count(&sub_op)) { sink_island_controls.push_back(new_island.control()); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc b/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc deleted file mode 100644 index e4c965b6cb1..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator_gen.cc +++ /dev/null @@ -1,196 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/Signals.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/TableGen/Error.h" -#include "llvm/TableGen/Main.h" -#include "llvm/TableGen/Record.h" -#include "llvm/TableGen/TableGenBackend.h" -#include "mlir/TableGen/Operator.h" // from @llvm-project - -using llvm::LessRecord; -using llvm::raw_ostream; -using llvm::Record; -using llvm::RecordKeeper; -using mlir::tblgen::Operator; - -// Helper macro that returns indented os. -#define OUT(X) os.indent((X)) - -// Emits TensorFlow derived attribute populator functions for each of the ops. -static void EmitOpAttrPopulators(const std::vector &ops, - raw_ostream *ostream) { - raw_ostream &os = *ostream; - - for (const auto &op : ops) { - // TODO(hinsu): Introduce a derived attribute property for ops with no - // type attributes. That way an error can be generated if no derived type - // attribute or the property is set. This will make sure derived type - // attributes are not omitted by mistake. - - // Emit function signature. - auto op_name = op.getCppClassName(); - OUT(0) << "static Status Populate" << op_name - << "DerivedAttrs(mlir::TF::" << op_name - << "& op, AttrValueMap *values) {\n"; - - for (const auto &named_attr : op.getAttributes()) { - auto attr_name = named_attr.name; - const auto &attr = named_attr.attr; - if (!attr.isDerivedAttr()) continue; - auto retType = attr.getReturnType(); - if (retType == "ShapedType" || retType == "mlir::TF::OperandShapeRange" || - retType == "mlir::TF::ResultShapeRange") { - OUT(2) << "TF_RETURN_IF_ERROR(SetShapeAttribute(\"" << attr_name - << "\", op." << attr_name << "(), values));\n"; - } else if (retType == "Type" || - retType == "mlir::OperandElementTypeRange" || - retType == "mlir::ResultElementTypeRange") { - OUT(2) << "TF_RETURN_IF_ERROR(SetTypeAttribute(\"" << attr_name - << "\", op." 
<< attr_name << "(), values));\n"; - } else if (attr.isSubClassOf("TF_DerivedOperandSizeAttr") || - attr.isSubClassOf("TF_DerivedResultSizeAttr")) { - OUT(2) << "TF_RETURN_IF_ERROR(SetSizeAttribute(\"" << attr_name - << "\", op." << attr_name << "(), values));\n"; - } else { - PrintFatalError(op.getLoc(), - "NYI: attribute populator for derived attributes"); - } - } - - OUT(2) << "return Status::OK();\n"; - OUT(0) << "}\n\n"; - } -} - -// Emits TensorFlow derived attribute populator function taking an Operation -// as argument. -static void EmitInstAttrPopulator(const std::vector &ops, - raw_ostream *ostream) { - raw_ostream &os = *ostream; - - // Emit function signature. - OUT(0) << "static Status PopulateDerivedAttrs(mlir::Operation* op, " - "AttrValueMap* values) {\n"; - - for (const auto &op : ops) { - auto op_name = op.getCppClassName(); - - // Emit conditional for the op and then call populator for the op on match. - OUT(2) << "if (auto tfOp = llvm::dyn_cast(op)) {\n"; - OUT(4) << "TF_RETURN_IF_ERROR(Populate" << op_name - << "DerivedAttrs(tfOp, values));\n"; - OUT(2) << "}\n"; - } - OUT(2) << "return Status::OK();\n"; - OUT(0) << "}\n\n"; -} - -// Emits TensorFlow derived attribute name collector functions for each of the -// ops. -static void EmitOpAttrNameCollector(const std::vector &ops, - raw_ostream *ostream) { - raw_ostream &os = *ostream; - - for (const auto &op : ops) { - // Emit function signature. - auto op_name = op.getCppClassName(); - OUT(0) << "static void Collect" << op_name - << "DerivedAttrsName(mlir::TF::" << op_name - << "& op, llvm::SmallDenseSet* values) {\n"; - - // Insert the name for each derived attribute in the set. - for (const auto &named_attr : op.getAttributes()) { - auto attr_name = named_attr.name; - const auto &attr = named_attr.attr; - if (!attr.isDerivedAttr()) continue; - OUT(2) << "values->insert(\"" << attr_name << "\");\n"; - } - - OUT(2) << "return;\n"; - OUT(0) << "}\n\n"; - } -} - -// Emits TensorFlow derived attribute name collector function taking an -// Operation as argument. -static void EmitInstAttrNameCollector(const std::vector &ops, - raw_ostream *ostream) { - raw_ostream &os = *ostream; - - // Emit function signature. - OUT(0) << "static void CollectDerivedAttrsName(mlir::Operation* op, " - "llvm::SmallDenseSet* values) {\n"; - - for (const auto &op : ops) { - auto op_name = op.getCppClassName(); - - // Emit conditional for the op and then call collect for the op on match. - OUT(2) << "if (auto tf_op = llvm::dyn_cast(op)) {\n"; - OUT(4) << "Collect" << op_name << "DerivedAttrsName(tf_op, values);\n"; - OUT(2) << "}\n"; - } - OUT(2) << "return;\n"; - OUT(0) << "}\n\n"; -} - -// The function below has a non-constant reference as that is required by LLVM's -// TableGenMain. -// NOLINTNEXTLINE -static bool DerivedAttrWritersMain(raw_ostream &os, RecordKeeper &records) { - emitSourceFileHeader("MLIR Derived TensorFlow Attributes Populators", os); - - // Retrieve all the definitions derived from TF_Op and sort by record name. - std::vector defs = records.getAllDerivedDefinitions("TF_Op"); - llvm::sort(defs, LessRecord()); - - std::vector ops; - ops.reserve(defs.size()); - - // Wrap TensorFlow op definitions into tblgen Operator wrapper and verify - // them. 
- for (const auto *def : defs) { - ops.emplace_back(Operator(def)); - - const Operator &op = ops.back(); - if (op.getDialectName() != "tf") - PrintFatalError(op.getLoc(), - "unexpected op name format: 'TF_' prefix missing"); - if (!op.getCppClassName().endswith("Op")) - PrintFatalError(op.getLoc(), - "unexpected op name format: 'Op' suffix missing"); - } - - EmitOpAttrPopulators(ops, &os); - EmitInstAttrPopulator(ops, &os); - - EmitOpAttrNameCollector(ops, &os); - EmitInstAttrNameCollector(ops, &os); - - return false; -} - -int main(int argc, char **argv) { - llvm::InitLLVM y(argc, argv); - llvm::cl::ParseCommandLineOptions(argc, argv); - return TableGenMain(argv[0], &DerivedAttrWritersMain); -} diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 631553b381e..c69e802994d 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -49,6 +49,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/utils/name_utils.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_to_functiondef.h" @@ -80,46 +81,14 @@ constexpr char kInvalidExecutorGraphMsg[] = constexpr char kDeviceAttr[] = "tf.device"; constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; -bool IsLegalChar(char c, bool first_char) { - if (isalpha(c)) return true; - if (isdigit(c)) return true; - if (c == '.') return true; - if (c == '_') return true; - - // First character of a node name can only be a letter, digit, dot or - // underscore. - if (first_char) return false; - - if (c == '/') return true; - if (c == '-') return true; - - return false; -} - -// Convert characters in name that are considered illegal in TensorFlow Node -// name to '.'. -std::string LegalizeNodeName(llvm::StringRef name) { - assert(!name.empty() && "expected non-empty name"); - - std::string legalized_name; - bool first = true; - for (auto c : name) { - if (IsLegalChar(c, first)) { - legalized_name += c; - } else { - legalized_name += '.'; - } - first = false; - } - - return legalized_name; -} - // OpOrArgLocNameMapper that legalizes the returned name. class LegalizedOpOrValLocNameMapper : public OpOrArgLocNameMapper { private: std::string GetName(OpOrVal op_or_val) override { - return LegalizeNodeName(OpOrArgLocNameMapper::GetName(op_or_val)); + std::string name = OpOrArgLocNameMapper::GetName(op_or_val); + assert(!name.empty() && "expected non-empty name"); + mlir::LegalizeNodeName(name); + return name; } }; @@ -275,6 +244,7 @@ StatusOr> Exporter::GetArgumentNode( func.getArgAttrs(index); absl::flat_hash_set attrs_to_ignore = {kDeviceAttr}; TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + /*remove_ref_type=*/false, node_def->mutable_attr())); return node_def; @@ -523,13 +493,14 @@ StatusOr> Exporter::Convert( if (index >= num_data_results) break; // TODO(jpienaar): If there is a result index specified, ensure only one // and that it matches the result index of the op. 
- std::string orig_name(output_names[index]); - auto tensor_id = ParseTensorName(orig_name); - auto name = LegalizeNodeName( - llvm::StringRef(tensor_id.node().data(), tensor_id.node().size())); + std::string name(output_names[index]); + auto tensor_id = ParseTensorName(name); + std::string tensor_id_node(tensor_id.node()); + assert(!tensor_id_node.empty() && "expected non-empty name"); + mlir::LegalizeNodeName(tensor_id_node); // Ensure name does not get reused. - (void)exporter.op_to_name_.GetUniqueName(name); + (void)exporter.op_to_name_.GetUniqueName(tensor_id_node); } } @@ -537,8 +508,9 @@ StatusOr> Exporter::Convert( TF_RET_CHECK(input_names.size() == block.getNumArguments()); for (const auto& it : llvm::enumerate(function.getArguments())) { // TODO(lyandy): Update when changing feed/fetch import. - std::string orig_name(input_names[it.index()]); - std::string name = LegalizeNodeName(orig_name); + std::string name(input_names[it.index()]); + assert(!name.empty() && "expected non-empty name"); + mlir::LegalizeNodeName(name); auto tensor_id = ParseTensorName(name); TF_RET_CHECK(tensor_id.index() == 0) << "input port designation not supported"; @@ -690,8 +662,9 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, grad_string.data(), stateful_string.data()}; llvm::SmallVector funcAttrs( function.getDialectAttrs()); - TF_RETURN_IF_ERROR( - ConvertAttributes(funcAttrs, attrs_to_ignore, func_def.mutable_attr())); + TF_RETURN_IF_ERROR(ConvertAttributes(funcAttrs, attrs_to_ignore, + /*remove_ref_type=*/false, + func_def.mutable_attr())); for (int i = 0, e = function.getNumArguments(); i < e; ++i) { if (auto resource_arg_unique_id_attr = @@ -708,6 +681,7 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, kDeviceAttr, kResourceArgUniqueIdAttr}; FunctionDef::ArgAttrs func_def_arg_i_attrs; TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + /*remove_ref_type=*/false, func_def_arg_i_attrs.mutable_attr())); if (func_def_arg_i_attrs.attr().empty()) continue; (*func_def.mutable_arg_attr())[i] = std::move(func_def_arg_i_attrs); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc index 3ca06e5efa9..0057e498cea 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.cc @@ -21,12 +21,15 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/StringSet.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/mlir/utils/string_container_utils.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" @@ -34,7 +37,6 @@ limitations under the License. 
namespace tensorflow { namespace { -using stream_executor::port::StatusOr; // Sets type list attribute with the given `name` to the given `types`. If the // attribute already exists with a different value, returns an error. @@ -85,22 +87,12 @@ Status SetShapeAttribute(absl::string_view name, ContainerT shapes, return Status::OK(); } -// Include the auto generated derived attribute populator function taking -// TensorFlow dialect operation as an argument. This file contains the function -// definitions and isn't a header file. -#include "tensorflow/compiler/mlir/tensorflow/translate/derived_attr_populator.inc" - -// Collect all the unregistered attributes for an TF dialect operation. +// Collects all the unregistered attributes for an TF dialect operation. // Attributes "name" and "device" are not included because they are not part // of an TF op attributes. Status GetUnregisteredAttrs( - mlir::Operation* inst, + mlir::Operation* inst, const tensorflow::OpRegistrationData* op_reg_data, absl::flat_hash_set* attrs_to_ignore) { - TF_ASSIGN_OR_RETURN(auto op_name, - GetTensorFlowOpName(inst->getName().getStringRef())); - - const tensorflow::OpRegistrationData* op_reg_data = - tensorflow::OpRegistry::Global()->LookUp(std::string(op_name)); if (!op_reg_data) { // This is likely a function call node, so we should continue. return Status::OK(); @@ -123,29 +115,27 @@ Status GetUnregisteredAttrs( return Status::OK(); } -} // namespace - -StatusOr> ConvertTFDialectOpToNodeDef( - mlir::Operation* inst, llvm::StringRef name, +// Collects all attribute names to ignore in an MLIR operation when exporting to +// a TensorFlow NodeDef. +StatusOr> GetAttributesToIgnore( + mlir::Operation* inst, mlir::DictionaryAttr derived_attrs, + const tensorflow::OpRegistrationData* op_reg_data, bool ignore_unregistered_attrs) { - // Use auto generated function to populate derived attribute. - // - // Note: This only populates derived attributes for TensorFlow ops that are - // generated using the TableGen. Manually defined ops should have all the - // attributes present as native MLIR op attributes. - // The elements are owned by the MLIRContext. absl::flat_hash_set attrs_to_ignore; - if (inst->isRegistered()) { - // We ignore attributes attached to the operation when there is already a - // derived attribute defined in ODS. - llvm::SmallDenseSet derived_attrs; - CollectDerivedAttrsName(inst, &derived_attrs); - for (auto name : derived_attrs) attrs_to_ignore.insert(name.data()); + + // We ignore attributes attached to the operation when there is already a + // derived attribute defined in ODS. + if (derived_attrs) { + for (auto derived_attr : derived_attrs) { + attrs_to_ignore.insert( + mlir::StringRefToView(derived_attr.first.strref())); + } } if (ignore_unregistered_attrs) { - TF_RETURN_IF_ERROR(GetUnregisteredAttrs(inst, &attrs_to_ignore)); + TF_RETURN_IF_ERROR( + GetUnregisteredAttrs(inst, op_reg_data, &attrs_to_ignore)); } if (inst->hasTrait()) { @@ -162,15 +152,24 @@ StatusOr> ConvertTFDialectOpToNodeDef( attrs_to_ignore.insert(attr_name.data()); } - TF_ASSIGN_OR_RETURN(auto node_def, - GetOperationNodeDef(attrs_to_ignore, inst, name)); + if (llvm::isa(inst)) + attrs_to_ignore.insert("is_stateless"); - // If the operation is not registered, we won't be able to infer any attribute - if (inst->isRegistered()) { + return attrs_to_ignore; +} + +// Populates all derived attributes of a MLIR operation in a proto +// map. 
+Status PopulateDerivedAttributes(mlir::Operation* inst, llvm::StringRef name, + mlir::DictionaryAttr derived_attrs, + bool ignore_unregistered_attrs, + AttrValueMap* attributes) { + if (derived_attrs) { TF_RETURN_WITH_CONTEXT_IF_ERROR( - PopulateDerivedAttrs(inst, node_def->mutable_attr()), - "When populating derived attrs for ", - inst->getName().getStringRef().str()); + ConvertAttributes(derived_attrs.getValue(), /*attrs_to_ignore=*/{}, + /*remove_ref_type=*/true, attributes), + "while converting derived attributes for node: ", + mlir::StringRefToView(name)); } // Here we only add the shapes for the leading values with ShapedType, @@ -185,10 +184,46 @@ StatusOr> ConvertTFDialectOpToNodeDef( mlir::TF::ResultShapeRange output_shapes = { mlir::TF::ResultShapeIterator(begin), mlir::TF::ResultShapeIterator(end)}; - TF_RETURN_IF_ERROR(SetShapeAttribute("_output_shapes", output_shapes, - node_def->mutable_attr())); + TF_RETURN_IF_ERROR( + SetShapeAttribute("_output_shapes", output_shapes, attributes)); } } + + return Status::OK(); +} + +} // namespace + +Status GetAttrValuesFromOperation( + mlir::Operation* inst, llvm::StringRef name, + const tensorflow::OpRegistrationData* op_reg_data, + bool ignore_unregistered_attrs, AttrValueMap* attributes) { + mlir::DictionaryAttr derived_attrs = nullptr; + if (auto interface = llvm::dyn_cast(inst)) + derived_attrs = interface.materializeDerivedAttributes(); + TF_ASSIGN_OR_RETURN(auto attrs_to_ignore, + GetAttributesToIgnore(inst, derived_attrs, op_reg_data, + ignore_unregistered_attrs)); + TF_RETURN_WITH_CONTEXT_IF_ERROR( + ConvertAttributes(inst->getAttrs(), attrs_to_ignore, + /*remove_ref_type=*/false, attributes), + "while converting attributes for node: ", mlir::StringRefToView(name)); + TF_RETURN_IF_ERROR(PopulateDerivedAttributes( + inst, name, derived_attrs, ignore_unregistered_attrs, attributes)); + return Status::OK(); +} + +StatusOr> ConvertTFDialectOpToNodeDef( + mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs) { + TF_ASSIGN_OR_RETURN(auto node_def, GetOperationNodeDef(inst, name)); + TF_ASSIGN_OR_RETURN(auto op_name, + GetTensorFlowOpName(inst->getName().getStringRef())); + const tensorflow::OpRegistrationData* op_reg_data = + tensorflow::OpRegistry::Global()->LookUp(op_name.str()); + TF_RETURN_IF_ERROR(GetAttrValuesFromOperation(inst, name, op_reg_data, + ignore_unregistered_attrs, + node_def->mutable_attr())); return node_def; } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h index a19ad1f2940..6341b14fe7b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h @@ -18,12 +18,24 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { -// Converts an MLIR operation to TensorFlow NodeDef with given node name. This +// Extracts the attributes of a MLIR operation and populates the converted +// attributes in a proto map. 
+Status GetAttrValuesFromOperation( + mlir::Operation* inst, llvm::StringRef name, + const tensorflow::OpRegistrationData* op_reg_data, + bool ignore_unregistered_attrs, AttrValueMap* attributes); + +// Converts a MLIR operation to TensorFlow NodeDef with given node name. This // name should be unique to the graph it is being inserted to. If the // `ignore_unregistered_attrs` argument is set to true, the attributes which are // not in the op registry will be ignored. If the `ignore_unregistered_attrs` @@ -31,9 +43,9 @@ namespace tensorflow { // ShapedType for the leading values with ShapedType in the results of the // nodes. Set it to true if the returned NodeDef will be executed by the linked // TF Eager runtime. -stream_executor::port::StatusOr> -ConvertTFDialectOpToNodeDef(mlir::Operation* inst, llvm::StringRef name, - bool ignore_unregistered_attrs); +StatusOr> ConvertTFDialectOpToNodeDef( + mlir::Operation* inst, llvm::StringRef name, + bool ignore_unregistered_attrs); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 692d0eaf962..42ce5c533a2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -64,6 +64,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/loader_util.h" #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -72,8 +73,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" @@ -141,6 +145,13 @@ bool IsResourceOutputShapesAttribute(const AttrValue& attr_value, return false; } +void LoadImporterDialects(mlir::MLIRContext& context) { + // Load dialects involved in the conversion + mlir::DialectRegistry registry; + mlir::RegisterAllTensorFlowDialects(registry); + registry.loadAll(&context); +} + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. 
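Editor's sketch (not part of the patch): the comment above states the NameUniquifier contract, i.e. a generated function name must be unique both against the TF function library and against names the helper has already handed out. The toy class below illustrates that contract with a hypothetical suffix scheme; it is not the actual OpOrArgNameMapper implementation.

// Illustrative uniquifier: `reserved` stands in for the names already present
// in the function library; issued names are remembered so later requests
// cannot collide with them either.
#include <set>
#include <string>

class SimpleNameUniquifier {
 public:
  explicit SimpleNameUniquifier(std::set<std::string> reserved)
      : used_(std::move(reserved)) {}

  // Returns `candidate` if it is free, otherwise appends an increasing
  // numeric suffix until a free name is found, and records the result.
  std::string GetUniqueName(const std::string& candidate) {
    std::string name = candidate;
    int suffix = 0;
    while (!used_.insert(name).second)
      name = candidate + "_" + std::to_string(++suffix);
    return name;
  }

 private:
  std::set<std::string> used_;  // Reserved plus previously issued names.
};

Under this sketch's scheme, a uniquifier seeded with {"main"} returns "main_1" when asked for "main" a second time.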
@@ -171,6 +182,8 @@ class NameUniquifier : public OpOrArgNameMapper { Status UpgradeLegacyGraph(Graph* graph, FunctionLibraryDefinition* flib_def, bool restrict_functionalization_to_tpu_nodes) { + TF_RETURN_IF_ERROR(GenerateResourceSharedNameIfEmpty(*graph, *flib_def)); + // If `restrict_functionalization_to_tpu_nodes` is true let filter function // return true for `_tpu_replicate` nodes, otherwise don't set filter. NodeFilter node_filter = @@ -298,21 +311,6 @@ class ImporterBase { return ::tensorflow::ConvertTensorProto(value, &builder_); } - // Converts the tensor shape proto into an MLIR shape attribute. - StatusOr ConvertTensorShapeProto( - const TensorShapeProto& shape) { - if (shape.unknown_rank()) - return mlir::TF::ShapeAttr::get(builder_.getContext(), llvm::None); - - llvm::SmallVector dims; - dims.reserve(shape.dim().size()); - for (const auto& dim : shape.dim()) { - dims.push_back(dim.size()); - } - return mlir::TF::ShapeAttr::get(builder_.getContext(), - llvm::makeArrayRef(dims)); - } - // Converts func name in graphdef to mlir::SymbolRefAttribute. StatusOr ConvertFunctionCallName( const std::string& func_name); @@ -1130,74 +1128,36 @@ StatusOr ImporterBase::ConvertFunctionCallName( StatusOr ImporterBase::ConvertAttributeValue( const AttrValue& value) { switch (value.value_case()) { - case AttrValue::kI: - return builder_.getI64IntegerAttr(value.i()); - case AttrValue::kS: - return builder_.getStringAttr(value.s()); - case AttrValue::kF: - return builder_.getFloatAttr(builder_.getF32Type(), value.f()); - case AttrValue::kB: - return builder_.getBoolAttr(value.b()); - case AttrValue::kType: { - mlir::Type type; - TF_RETURN_IF_ERROR(ConvertDataType(value.type(), builder_, &type)); - return mlir::TypeAttr::get(type); - } - case AttrValue::kShape: - return ConvertTensorShapeProto(value.shape()); - case AttrValue::kTensor: - return ConvertTensorProto(value.tensor()); - case AttrValue::kList: { - absl::InlinedVector attrs; - for (const auto& item : value.list().i()) - attrs.push_back(builder_.getI64IntegerAttr(item)); - for (const auto& item : value.list().s()) - attrs.push_back(builder_.getStringAttr(item)); - for (const auto& item : value.list().f()) - attrs.push_back(builder_.getFloatAttr(builder_.getF32Type(), item)); - for (const auto& item : value.list().b()) - attrs.push_back(builder_.getBoolAttr(item)); - for (const auto& item : value.list().type()) { - mlir::Type type; - TF_RETURN_IF_ERROR(ConvertDataType(DataType(item), builder_, &type)); - attrs.push_back(mlir::TypeAttr::get(type)); - } - for (const auto& item : value.list().shape()) { - TF_ASSIGN_OR_RETURN(auto attr, ConvertTensorShapeProto(item)); - attrs.push_back(attr); - } - for (const auto& item : value.list().tensor()) { - TF_ASSIGN_OR_RETURN(auto attr, ConvertTensorProto(item)); - attrs.push_back(attr); - } - for (const auto& item : value.list().func()) { - TF_ASSIGN_OR_RETURN(auto attr, ConvertFunctionCallName(item.name())); - if (item.attr_size() != 0) - return errors::Unimplemented( - "func attributes with non-zero attr.size()"); - attrs.push_back(attr); - } - return builder_.getArrayAttr( - llvm::makeArrayRef(attrs.begin(), attrs.end())); - } case AttrValue::kFunc: { // TODO(b/156546237): Unify kFunc/NameAttrList attribute representation. // Currently kFunc/NameAttrList attributes in a kList/repeated AttrValue // will not use this representation. 
NamedAttrList attrs; for (const auto& func_attr : value.func().attr()) { - TF_ASSIGN_OR_RETURN(auto attr, ConvertAttributeValue(func_attr.second)); + TF_ASSIGN_OR_RETURN( + auto attr, ImporterBase::ConvertAttributeValue(func_attr.second)); attrs.push_back(builder_.getNamedAttr(func_attr.first, attr)); } auto func_attrs = builder_.getDictionaryAttr(attrs); return mlir::TF::FuncAttr::get(context_, value.func().name(), func_attrs); } - case AttrValue::VALUE_NOT_SET: - return builder_.getUnitAttr(); - // kPlaceholder is not implemented. + case AttrValue::kList: { + if (!value.list().func().empty()) { + absl::InlinedVector attrs; + for (const auto& item : value.list().func()) { + TF_ASSIGN_OR_RETURN(auto attr, ConvertFunctionCallName(item.name())); + if (item.attr_size() != 0) + return errors::Unimplemented( + "func attributes with non-zero attr.size()"); + attrs.push_back(attr); + } + return builder_.getArrayAttr( + llvm::makeArrayRef(attrs.begin(), attrs.end())); + } + return ConvertNonFuncAttributeValue(value, &builder_); + } default: - return errors::Unimplemented( - absl::StrCat("Attribute ", value.DebugString())); + return ConvertNonFuncAttributeValue(value, &builder_); } } @@ -2136,11 +2096,7 @@ StatusOr GraphDefImporter::Convert( mlir::MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info, const FunctionLibraryDefinition& flib_def, const GraphImportConfig& specs, llvm::StringRef func_name) { - // Load dialects involved in the conversion - context->loadDialect(); - context->loadDialect(); - context->loadDialect(); - + LoadImporterDialects(*context); mlir::OwningModuleRef module = mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); std::unordered_map tf_name_to_mlir_name; @@ -3197,6 +3153,7 @@ Status CreateSavedModelIR( StatusOr SavedModelObjectGraphImporter::Convert( SavedModelV2Bundle* saved_model, absl::Span exported_names, mlir::MLIRContext* context, bool add_default_attributes) { + LoadImporterDialects(*context); GraphDebugInfo dummy_debug_info; const GraphDebugInfo& debug_info = saved_model->debug_info() ? 
*saved_model->debug_info() : dummy_debug_info; @@ -3276,6 +3233,7 @@ class SavedModelSignatureDefImporter { static StatusOr Convert( const SavedModelBundle& bundle, absl::Span exported_names, mlir::MLIRContext* context, bool upgrade_legacy) { + LoadImporterDialects(*context); SavedModelSignatureDefImporter importer(bundle, exported_names, context); TF_RETURN_IF_ERROR(importer.InitializeGraph(upgrade_legacy)); return importer.ConvertSignatures(); @@ -3562,6 +3520,7 @@ Status SavedModelSignatureDefImporter::LiftVariables() { mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); mlir::PassManager pm(module_->getContext()); + SetCrashReproducer(pm); pm.addPass(mlir::tf_executor::CreateTFExecutorGraphPruningPass()); pm.addPass(mlir::CreateExecutorDialectToFunctionalConversionPass()); pm.addPass( @@ -3648,6 +3607,8 @@ stream_executor::port::StatusOr ConvertFunctionToMlir( tensorflow::GraphDebugInfo dummy_debug_info; tensorflow::GraphImportConfig specs; specs.graph_as_function = true; + for (const auto* control_ret_node : fbody->control_ret_nodes) + specs.control_outputs.push_back(control_ret_node->name()); return GraphDefImporter::Convert(context, *fbody->graph, dummy_debug_info, flib_def, specs, name); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index b646e14b71d..f63cb091a09 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/Support/MemoryBuffer.h" #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" @@ -86,6 +87,9 @@ static LogicalResult MlirToGraphdefTranslateFunction( } static TranslateFromMLIRRegistration mlir_to_graphdef_translate( - "mlir-to-graphdef", MlirToGraphdefTranslateFunction); + "mlir-to-graphdef", MlirToGraphdefTranslateFunction, + [](DialectRegistry& registry) { + mlir::RegisterAllTensorFlowDialects(registry); + }); } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc index 5236bdeffbf..22e6559a0f2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/translate_tf_dialect_op.cc @@ -20,6 +20,7 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" namespace mlir { @@ -67,6 +68,7 @@ static LogicalResult MlirToTfNodeDef(ModuleOp module, // Test only translation to convert a simple MLIR module with a single TF // dialect op to NodeDef. 
static TranslateFromMLIRRegistration translate_from_mlir_registration( - "test-only-mlir-to-tf-nodedef", MlirToTfNodeDef); + "test-only-mlir-to-tf-nodedef", MlirToTfNodeDef, + mlir::RegisterAllTensorFlowDialects); } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.cc b/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.cc new file mode 100644 index 00000000000..4792e220b17 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.cc @@ -0,0 +1,80 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h" + +namespace tensorflow { + +Status GenerateResourceSharedNameIfEmpty(Graph& graph, + FunctionLibraryDefinition& flib_def) { + auto is_resource_op_with_empty_shared_name = [](const NodeDef& node_def, + const OpDef& op_def) { + // Only upgrade when it is a resource handle op. + if (op_def.output_arg().size() != 1 || + op_def.output_arg(0).type() != tensorflow::DT_RESOURCE) + return false; + + // If the OpDef has "use_node_name_sharing" field, then it is valid to use + // node names as shared names. + if (!std::any_of(op_def.attr().begin(), op_def.attr().end(), + [](const auto& attr_def) { + return attr_def.name() == "use_node_name_sharing" && + attr_def.type() == "bool"; + })) + return false; + + if (!std::any_of(op_def.attr().begin(), op_def.attr().end(), + [](const auto& attr_def) { + return attr_def.name() == "shared_name" && + attr_def.type() == "string"; + })) + return false; + + auto iter = node_def.attr().find("shared_name"); + if (iter == node_def.attr().end()) return true; + return iter->second.s().empty(); + }; + + // Upgrade nodes in the graph. + for (auto* node : graph.nodes()) { + if (is_resource_op_with_empty_shared_name(node->def(), node->op_def())) { + node->AddAttr("shared_name", node->name()); + } + } + + // Upgrade nodes in the functions. + auto func_names = flib_def.ListFunctionNames(); + for (const auto& func_name : func_names) { + const FunctionDef* orig = flib_def.Find(func_name); + DCHECK(orig); + auto copy = *orig; + for (auto& node_def : *copy.mutable_node_def()) { + const OpDef* op_def = nullptr; + TF_RETURN_IF_ERROR(flib_def.LookUpOpDef(node_def.op(), &op_def)); + if (is_resource_op_with_empty_shared_name(node_def, *op_def)) { + // Use the concat of function name and node name for such ops in a + // function as the shared_name. "@" is used as the separator because it + // is not allowed in the function name or the node name. 
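// Editor's note (illustration, not part of the patch): with the concatenation
// below, a resource op named "table" inside a function "init_fn" receives
// shared_name "table@init_fn", while the graph-level upgrade earlier in this
// function simply reuses the node name (shared_name "table"). The names are
// hypothetical.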
+ (*node_def.mutable_attr())["shared_name"].set_s( + absl::StrCat(node_def.name(), "@", func_name)); + } + } + TF_RETURN_IF_ERROR(flib_def.ReplaceFunction(func_name, copy)); + } + + return tensorflow::Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h b/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h new file mode 100644 index 00000000000..3502572c410 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/translate/upgrade_graph.h @@ -0,0 +1,32 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_UPGRADE_GRAPH_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_UPGRADE_GRAPH_H_ + +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { + +// Generate the shared_name for resource handle ops in the graph and functions +// if their shared_names are empty. Resource handle ops with empty shared_name +// may have undesired semantics. +Status GenerateResourceSharedNameIfEmpty(Graph& graph, + FunctionLibraryDefinition& flib_def); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_UPGRADE_GRAPH_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 0dbda2e4f9c..b55a5aa5243 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "absl/types/optional.h" +#include "absl/types/variant.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -31,9 +32,6 @@ limitations under the License. #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" @@ -51,12 +49,14 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/shape.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/logging.h" @@ -64,34 +64,19 @@ limitations under the License. namespace tensorflow { namespace { -// Parses the MLIR module from the mlir_module_string. -Status ParseMlirModule(llvm::StringRef mlir_module_string, - mlir::MLIRContext* mlir_context, - mlir::OwningModuleRef* mlir_module) { - TF_RET_CHECK(!mlir_module_string.empty()) - << "unexpected empty serialized MLIR module string"; - TF_RET_CHECK(mlir_module) << "unexpected null MLIR module pointer"; - - // Make sure we catch any error reported by MLIR and forward it to the TF - // error reporting system. - mlir::StatusScopedDiagnosticHandler error_handler(mlir_context); - - // Parse the module. - *mlir_module = mlir::parseSourceString(mlir_module_string, mlir_context); - if (!*mlir_module) { - return error_handler.Combine( - errors::InvalidArgument("could not parse MLIR module")); +// Extracts shape from XlaArgument as TensorShape. If shape is a xla::Shape, +// that is converted to a TensorShape. +StatusOr GetTensorShapeFromXlaArgument(const XlaArgument& arg) { + if (absl::holds_alternative(arg.shape)) { + TensorShape arg_shape; + TF_RETURN_IF_ERROR( + XLAShapeToTensorShape(absl::get(arg.shape), &arg_shape)); + return arg_shape; + } else { + return absl::get(arg.shape); } - - return Status::OK(); } -// Arguments to a computation can be either a tensor or resource. -struct TensorOrResourceShape { - TensorShape shape; - bool is_resource = false; -}; - // Converts arg_shapes to xla::Shape's and store into xla_input_shapes. Status GetXlaInputShapes( mlir::ModuleOp module, llvm::ArrayRef arg_shapes, @@ -285,52 +270,67 @@ static void RegisterDialects(mlir::DialectRegistry& registry) { } // namespace +void CreateConvertMlirToXlaHloPipeline( + mlir::OpPassManager& pm, llvm::StringRef device_type, + llvm::MutableArrayRef> + custom_legalization_passes) { + pm.addPass(mlir::TF::CreateTFFunctionalControlFlowToRegions()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); + + // TODO(b/159127949): Stack and TensorArray decomposition passes do not handle + // region based control flow yet. So convert back to functional control flow. + pm.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); + pm.addPass(mlir::TF::CreateStackOpsDecompositionPass()); + pm.addPass(mlir::TF::CreateTensorArrayOpsDecompositionPass()); + pm.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); + pm.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); + pm.addPass(mlir::createSymbolDCEPass()); + // Guarantee all functions have one use, which enables shape inference. 
+ pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + // LegalizeTFControlFlow encapsulates arguments for control flow operations + // with a tuple argument which break the assumption of resource lifting + // inside PromoteResourcesToArgs. + pm.addPass(mlir::mhlo::createLegalizeTFControlFlowPass()); + + pm.addNestedPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/true, /*legalize_chlo=*/true, + /*tf2xla_fallback_device_type=*/device_type)); + for (auto& target_pass : custom_legalization_passes) { + pm.addNestedPass(std::move(target_pass)); + } + pm.addPass(mlir::mhlo::CreateLegalizeTFCommunicationPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + // Run shape inference pass to propagate shapes through tensor_cast operations + // from static to dynamic shapes. This could be generated if the shape + // inference was originally missing in a TF op but the corresponding HLO op + // had static shape after lowering. + pm.addPass(mlir::TF::CreateTFShapeInferencePass()); + // Run LegalizeTFPass again because the previous legalization passes can + // expose more graph pruning and canonicalization opportunities that are + // necessary for the second LegalizeTFPass(allow_partial_conversion=false) + // invocation. + pm.addNestedPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/false, /*legalize_chlo=*/true, + /*tf2xla_fallback_device_type=*/device_type)); + // In order to export to XLA, we must sink constants to control flow regions, + // since XLA uses functional control flow. + pm.addNestedPass( + mlir::mhlo::createSinkConstantsToControlFlowPass()); +} + Status ConvertMLIRToXlaComputation( mlir::ModuleOp module_op, llvm::StringRef device_type, xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, - std::vector> custom_legalization_passes) { + llvm::MutableArrayRef> + custom_legalization_passes) { mlir::PassManager tf2xla(module_op.getContext()); - tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - tf2xla.addPass(mlir::TF::CreateTensorListOpsDecompositionPass()); - tf2xla.addPass(mlir::TF::CreateStackOpsDecompositionPass()); - tf2xla.addPass(mlir::TF::CreateTensorArrayOpsDecompositionPass()); - tf2xla.addPass(mlir::TFDevice::CreateDecomposeResourceOpsPass()); - tf2xla.addPass(mlir::TF::CreatePromoteResourcesToArgsPass()); - tf2xla.addPass(mlir::createSymbolDCEPass()); - // Guarantee all functions have one use, which enables shape inference. - tf2xla.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); - tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - // LegalizeTFControlFlow encapsulates arguments for control flow operations - // with a tuple argument which break the assumption of resource lifting - // inside PromoteResourcesToArgs. - tf2xla.addPass(mlir::mhlo::createLegalizeTFControlFlowPass()); - - tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass( - /*allow_partial_conversion=*/true, /*legalize_chlo=*/true, - /*tf2xla_fallback_device_type=*/device_type)); - for (auto& target_pass : custom_legalization_passes) { - tf2xla.addNestedPass(std::move(target_pass)); - } - tf2xla.addPass(mlir::mhlo::CreateLegalizeTFCommunicationPass()); - tf2xla.addNestedPass(mlir::createCanonicalizerPass()); - // Run shape inference pass to propagate shapes through tensor_cast operations - // from static to dynamic shapes. 
This could be generated if the shape - // inference was originally missing in a TF op but the corresponding HLO op - // had static shape after lowering. - tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); - // Run LegalizeTFPass again because the previous legalization passes can - // expose more graph pruning and canonicalization opportunities that are - // necessary for the second LegalizeTFPass(allow_partial_conversion=false) - // invocation. - tf2xla.addNestedPass(mlir::mhlo::createLegalizeTFPass( - /*allow_partial_conversion=*/false, /*legalize_chlo=*/true, - /*tf2xla_fallback_device_type=*/device_type)); - // In order to export to XLA, we must sink constants to control flow regions, - // since XLA uses functional control flow. - tf2xla.addNestedPass( - mlir::mhlo::createSinkConstantsToControlFlowPass()); + applyTensorflowAndCLOptions(tf2xla); + CreateConvertMlirToXlaHloPipeline(tf2xla, device_type, + custom_legalization_passes); if (VLOG_IS_ON(1)) { // Print the whole module after each pass which requires disabling @@ -361,12 +361,13 @@ Status ConvertMLIRToXlaComputation( return Status::OK(); } -static Status CompileMlirToXlaHlo( +Status CompileMlirToXlaHlo( mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, - llvm::StringRef device_type, bool use_tuple_args, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes) { + llvm::MutableArrayRef> + custom_legalization_passes) { if (VLOG_IS_ON(1)) tensorflow::DumpMlirOpToFile("mlir_compile_before", module_op); @@ -383,9 +384,8 @@ static Status CompileMlirToXlaHlo( compilation_result->computation = std::make_shared(); TF_RETURN_IF_ERROR(ConvertMLIRToXlaComputation( module_op, device_type, compilation_result->computation.get(), - use_tuple_args, - /*return_tuple=*/true, shape_representation_fn, - std::move(custom_legalization_passes))); + use_tuple_args, use_return_tuple, shape_representation_fn, + custom_legalization_passes)); // Construct mapping from XlaComputation's arg to input edges of execute // node. @@ -412,21 +412,22 @@ Status CompileSerializedMlirToXlaHlo( llvm::StringRef device_type, bool use_tuple_args, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes) { + llvm::MutableArrayRef> + custom_legalization_passes) { mlir::MLIRContext mlir_context; RegisterDialects(mlir_context.getDialectRegistry()); mlir::OwningModuleRef mlir_module; TF_RETURN_IF_ERROR( - ParseMlirModule(mlir_module_string, &mlir_context, &mlir_module)); + DeserializeMlirModule(mlir_module_string, &mlir_context, &mlir_module)); llvm::SmallVector tensor_or_resource_shapes; tensor_or_resource_shapes.reserve(arg_shapes.size()); for (const auto& arg_shape : arg_shapes) tensor_or_resource_shapes.push_back({arg_shape}); return CompileMlirToXlaHlo(mlir_module.get(), tensor_or_resource_shapes, device_type, use_tuple_args, - shape_representation_fn, compilation_result, - std::move(custom_legalization_passes)); + /*use_return_tuple=*/true, shape_representation_fn, + compilation_result, custom_legalization_passes); } // Rewrites the given module with specified args. For each of the constant args, @@ -434,8 +435,8 @@ Status CompileSerializedMlirToXlaHlo( // removed from the signature. For resource args, their subtypes are populated. // Returns the original indices for the other arguments on success. 
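// Illustrative example (hypothetical argument list, for documentation only):
// given args {0: constant, 1: parameter, 2: resource}, the constant is
// materialized as a tf.Const in the module and its argument is erased from the
// signature, the resource argument's subtype is populated from its dtype and
// shape, and the returned original indices would be {1, 2}.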
static StatusOr> RewriteWithArgs( - mlir::ModuleOp module, llvm::ArrayRef args) { - mlir::FuncOp main_fn = module.lookupSymbol("main"); + mlir::ModuleOp module_op, llvm::ArrayRef args) { + mlir::FuncOp main_fn = module_op.lookupSymbol("main"); std::vector params; bool has_resource_args = false; @@ -447,7 +448,9 @@ static StatusOr> RewriteWithArgs( if (xla_arg.kind == XlaArgument::kResource) { mlir::Type element_type; TF_RETURN_IF_ERROR(ConvertDataType(xla_arg.type, builder, &element_type)); - auto resource_shape = absl::get(xla_arg.shape).dim_sizes(); + TF_ASSIGN_OR_RETURN(TensorShape arg_shape, + GetTensorShapeFromXlaArgument(xla_arg)); + auto resource_shape = arg_shape.dim_sizes(); llvm::SmallVector resource_subtype_shape( resource_shape.begin(), resource_shape.end()); auto resource_subtype = @@ -473,7 +476,7 @@ static StatusOr> RewriteWithArgs( ConvertTensor(xla_arg.constant_value, &builder)); // TODO(hinsu): Use the actual location of the constant. auto constant = builder.create( - mlir::UnknownLoc::get(module.getContext()), value_attr); + mlir::UnknownLoc::get(module_op.getContext()), value_attr); mlir_arg.replaceAllUsesWith(constant); args_to_erase.push_back(idx); } @@ -495,16 +498,54 @@ static StatusOr> RewriteWithArgs( } Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef args, - llvm::StringRef device_type, bool use_tuple_args, - const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, + mlir::ModuleOp module_op, llvm::ArrayRef args, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes) { + llvm::MutableArrayRef> + custom_legalization_passes) { + TF_ASSIGN_OR_RETURN(std::vector remaining_params, + RewriteWithArgs(module_op, args)); + llvm::SmallVector arg_shapes; + arg_shapes.reserve(remaining_params.size()); + for (unsigned idx : remaining_params) { + const auto& arg = args[idx]; + TF_ASSIGN_OR_RETURN(TensorShape arg_shape, + GetTensorShapeFromXlaArgument(arg)); + arg_shapes.push_back({arg_shape, + /*is_resource=*/arg.kind == XlaArgument::kResource}); + } + + mlir::PassManager pm(module_op.getContext()); + applyTensorflowAndCLOptions(pm); + mlir::TF::StandardPipelineOptions tf_options; + mlir::TF::CreateTFStandardPipeline(pm, tf_options); + { + mlir::StatusScopedDiagnosticHandler diag_handler(module_op.getContext()); + if (failed(pm.run(module_op))) return diag_handler.ConsumeStatus(); + } + + auto status = CompileMlirToXlaHlo( + module_op, arg_shapes, device_type, use_tuple_args, use_return_tuple, + shape_representation_fn, compilation_result, custom_legalization_passes); + compilation_result->input_mapping = remaining_params; + return status; +} + +Status CompileGraphToXlaHlo( + const Graph& graph, llvm::ArrayRef args, + llvm::ArrayRef control_rets, llvm::StringRef device_type, + bool use_tuple_args, const FunctionLibraryDefinition& flib_def, + const GraphDebugInfo& debug_info, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes) { mlir::MLIRContext context; RegisterDialects(context.getDialectRegistry()); GraphImportConfig config; config.graph_as_function = true; + config.control_outputs = control_rets; // Disable shape inference during import as some TensorFlow op fails during // shape inference with dynamic shaped operands. 
This in turn causes the // import to fail. Shape inference during import is going to be removed and @@ -515,30 +556,11 @@ Status CompileGraphToXlaHlo( ConvertGraphToMlir(graph, debug_info, flib_def, config, &context); if (!module_or.ok()) return module_or.status(); - mlir::ModuleOp module = module_or.ValueOrDie().get(); - TF_ASSIGN_OR_RETURN(std::vector remaining_params, - RewriteWithArgs(module, {args.data(), args.size()})); - llvm::SmallVector arg_shapes; - arg_shapes.reserve(remaining_params.size()); - for (unsigned idx : remaining_params) { - const auto& arg = args[idx]; - arg_shapes.push_back({absl::get(arg.shape), - /*is_resource=*/arg.kind == XlaArgument::kResource}); - } - - mlir::PassManager pm(&context); - mlir::TF::StandardPipelineOptions tf_options; - mlir::TF::CreateTFStandardPipeline(pm, tf_options); - { - mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); - if (failed(pm.run(module))) return diag_handler.ConsumeStatus(); - } - - auto status = CompileMlirToXlaHlo( - module, arg_shapes, device_type, use_tuple_args, shape_representation_fn, - compilation_result, std::move(custom_legalization_passes)); - compilation_result->input_mapping = remaining_params; - return status; + mlir::ModuleOp module_op = module_or.ValueOrDie().get(); + return CompileGraphToXlaHlo(module_op, args, device_type, use_tuple_args, + /*use_return_tuple=*/true, + shape_representation_fn, compilation_result, + custom_legalization_passes); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h index 5c64a65ecbd..40230de406b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -16,10 +16,13 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_ +#include + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/tf2xla/xla_argument.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -30,6 +33,14 @@ limitations under the License. namespace tensorflow { +// Populates the supplied passmanager with the passes required to run the +// TF MLIR to XLA HLO MLIR conversion/legalization. Custom legalization passes +// can be populated in `custom_legalization_passes`. +void CreateConvertMlirToXlaHloPipeline( + mlir::OpPassManager& pm, llvm::StringRef device_type, + llvm::MutableArrayRef> + custom_legalization_passes); + // Lowers MLIR module to XLA HLO inside an XlaComputation. The input module // should only contain operations in tf dialect. If the input module contains // operation in the tf_executor dialect, for example, returns an error. @@ -61,7 +72,24 @@ Status ConvertMLIRToXlaComputation( xla::XlaComputation* xla_computation, bool use_tuple_args, bool return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn = nullptr, - std::vector> custom_legalization_passes = {}); + llvm::MutableArrayRef> + custom_legalization_passes = {}); + +// Helper struct representing argument tensor or resource handle shapes. 
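+// For example (illustrative only): a DT_FLOAT resource argument with shape [2]
+// would be carried as {shape: [2], is_resource: true}, while a plain tensor
+// argument of the same shape would be {shape: [2], is_resource: false}.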
+struct TensorOrResourceShape { + TensorShape shape; + bool is_resource = false; +}; + +// Compiles a MLIR module into XLA HLO, generates all accompanying metadata and +// stores them in CompilationResult. +Status CompileMlirToXlaHlo( + mlir::ModuleOp module_op, llvm::ArrayRef arg_shapes, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, + XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes); // Compiles a serialized MLIR module into XLA HLO, generates all accompanying // metadata and stores them in CompilationResult. @@ -70,17 +98,33 @@ Status CompileSerializedMlirToXlaHlo( llvm::StringRef device_type, bool use_tuple_args, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes = {}); + llvm::MutableArrayRef> + custom_legalization_passes = {}); -// Same as the above but takes input as TensorFlow Graph. -// TODO(lyandy): Allow populating of targets/control outputs. +// Compiles a TensorFlow Graph (already converted to MLIR, imported with +// tf_executor dialect still present) into XLA HLO, generates all accompanying +// metadata and stores them in CompilationResult. This will rewrite arguments +// and run the TensorFlow standard pipeline prior to invoking +// `CompileMlirToXlaHlo`. Status CompileGraphToXlaHlo( - const Graph& graph, llvm::ArrayRef args, - llvm::StringRef device_type, bool use_tuple_args, - const FunctionLibraryDefinition& flib_def, const GraphDebugInfo& debug_info, + mlir::ModuleOp module_op, llvm::ArrayRef args, + llvm::StringRef device_type, bool use_tuple_args, bool use_return_tuple, const XlaHelpers::ShapeRepresentationFn shape_representation_fn, XlaCompilationResult* compilation_result, - std::vector> custom_legalization_passes = {}); + llvm::MutableArrayRef> + custom_legalization_passes); + +// Compiles a TensorFlow Graph into XLA HLO, generates all accompanying metadata +// and stores them in CompilationResult. +Status CompileGraphToXlaHlo( + const Graph& graph, llvm::ArrayRef args, + llvm::ArrayRef control_rets, llvm::StringRef device_type, + bool use_tuple_args, const FunctionLibraryDefinition& flib_def, + const GraphDebugInfo& debug_info, + const XlaHelpers::ShapeRepresentationFn shape_representation_fn, + XlaCompilationResult* compilation_result, + llvm::MutableArrayRef> + custom_legalization_passes = {}); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_pass.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_pass.cc new file mode 100644 index 00000000000..57267ff027f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_pass.cc @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" + +namespace { +void CreateConvertMlirToXlaHloPipelineWithDefaults(mlir::OpPassManager& pm) { + tensorflow::CreateConvertMlirToXlaHloPipeline( + pm, /*device_type=*/"XLA_CPU_JIT", + /*custom_legalization_passes=*/{}); +} + +mlir::PassPipelineRegistration<> pipeline( + "tf-to-hlo-pipeline", + "Convert TF dialect to HLO dialect (used for compilation in bridge).", + CreateConvertMlirToXlaHloPipelineWithDefaults); +} // anonymous namespace diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc deleted file mode 100644 index 80e2c1132fd..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc +++ /dev/null @@ -1,542 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" - -#include "tensorflow/cc/framework/scope.h" -#include "tensorflow/cc/ops/function_ops.h" -#include "tensorflow/cc/ops/resource_variable_ops.h" -#include "tensorflow/compiler/tf2xla/shape_util.h" -#include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/framework/tensor_testutil.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/graph/testlib.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/stream_executor/lib/statusor.h" - -namespace tensorflow { -namespace { - -// A dummy shape representation function that simply converts given shape into -// an xla::Shape without assigning any layouts. 
-xla::StatusOr TestShapeRepresentation(const TensorShape& shape, - DataType type, - bool use_fast_memory) { - xla::Shape xla_shape; - TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape)); - return xla_shape; -} - -TEST(CompileSerializedMlirToXlaHloTest, InvalidSerializedMlirModule) { - constexpr char invalid_mlir_module[] = - "totally @invalid MLIR module {here} <-"; - std::vector arg_shapes; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - invalid_mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - EXPECT_EQ(s.code(), tensorflow::errors::Code::INVALID_ARGUMENT); - EXPECT_EQ(s.ToString(), - "Invalid argument: could not parse MLIR module-:1:1: error: " - "custom op 'totally' is unknown\n"); -} - -constexpr llvm::StringRef kBinaryAddModule = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf.AddV2"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", name = "add"} : (tensor, tensor) -> tensor - return %0 : tensor - } - } -)"; - -TEST(CompileSerializedMlirToXlaHloTest, TupleArgs) { - std::vector arg_shapes(2, TensorShape()); - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - kBinaryAddModule, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.6 - -ENTRY %main.6 (arg_tuple.1: (f32[], f32[])) -> (f32[]) { - %arg_tuple.1 = (f32[], f32[]) parameter(0) - %get-tuple-element.2 = f32[] get-tuple-element((f32[], f32[]) %arg_tuple.1), index=0 - %get-tuple-element.3 = f32[] get-tuple-element((f32[], f32[]) %arg_tuple.1), index=1 - %add.4 = f32[] add(f32[] %get-tuple-element.2, f32[] %get-tuple-element.3) - ROOT %tuple.5 = (f32[]) tuple(f32[] %add.4) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); - - // Expect an in order input mapping. - EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); - - // Expect a single tuple-shape, containing two F32 scalars. - EXPECT_EQ(compilation_result.xla_input_shapes.size(), 1); - xla::Shape expected_input_shape = - xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}), - xla::ShapeUtil::MakeShape(xla::F32, {})}); - EXPECT_EQ(compilation_result.xla_input_shapes.front(), expected_input_shape); - - // Expect output shape is a tuple shape containing a single F32 Scalar type. - const xla::Shape output_shape = - xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {}); - const xla::Shape tuple_output_shape = - xla::ShapeUtil::MakeTupleShape({output_shape}); - EXPECT_EQ(compilation_result.xla_output_shape, tuple_output_shape); - - // Expect exactly 1 OutputDescription. - EXPECT_EQ(compilation_result.outputs.size(), 1); - const XlaCompiler::OutputDescription& output_desc = - compilation_result.outputs.front(); - EXPECT_EQ(output_desc.type, DataType::DT_FLOAT); - EXPECT_EQ(output_desc.shape, TensorShape()); - EXPECT_FALSE(output_desc.is_constant); - EXPECT_FALSE(output_desc.is_tensor_list); - - // Expect no resource updates from computation. 
- EXPECT_TRUE(compilation_result.resource_updates.empty()); -} - -TEST(CompileSerializedMlirToXlaHloTest, IndividualArgs) { - std::vector arg_shapes(2, TensorShape()); - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - kBinaryAddModule, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/false, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.5 - -ENTRY %main.5 (Arg_0.1: f32[], Arg_1.2: f32[]) -> (f32[]) { - %Arg_0.1 = f32[] parameter(0) - %Arg_1.2 = f32[] parameter(1) - %add.3 = f32[] add(f32[] %Arg_0.1, f32[] %Arg_1.2) - ROOT %tuple.4 = (f32[]) tuple(f32[] %add.3) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); - - // Expect an in order input mapping. - EXPECT_EQ(compilation_result.input_mapping, std::vector({0, 1})); - - // Expect two inputs, each containing a F32 scalar. - EXPECT_EQ(compilation_result.xla_input_shapes.size(), 2); - xla::Shape expected_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {}); - EXPECT_EQ(compilation_result.xla_input_shapes[0], expected_input_shape); - EXPECT_EQ(compilation_result.xla_input_shapes[1], expected_input_shape); - - // Expect output shape is a tuple shape containing a single F32 Scalar type. - const xla::Shape output_shape = - xla::ShapeUtil::MakeShape(xla::PrimitiveType::F32, {}); - const xla::Shape tuple_output_shape = - xla::ShapeUtil::MakeTupleShape({output_shape}); - EXPECT_EQ(compilation_result.xla_output_shape, tuple_output_shape); - - // Expect exactly 1 OutputDescription. - EXPECT_EQ(compilation_result.outputs.size(), 1); - const XlaCompiler::OutputDescription& output_desc = - compilation_result.outputs.front(); - EXPECT_EQ(output_desc.type, DataType::DT_FLOAT); - EXPECT_EQ(output_desc.shape, TensorShape()); - EXPECT_FALSE(output_desc.is_constant); - EXPECT_FALSE(output_desc.is_tensor_list); - - // Expect no resource updates from computation. - EXPECT_TRUE(compilation_result.resource_updates.empty()); -} - -// Tests that foldable ops are constant-folded to enable legalization of ops -// that require compile time constant operand. -TEST(CompileSerializedMlirToXlaHloTest, CompileTimeConstantFoldedSuccess) { - // "tf.Shape" can only be folded away after shape inference. tf.Reshape can - // only be lowered when tf.Shape is folded into a constant. 
- constexpr char mlir_module[] = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<10x19xf32>, %arg1: tensor<19x10xf32> {mhlo.is_same_data_across_replicas}) -> tensor<10x19xf32> { - %0 = "tf.Shape"(%arg0) : (tensor<10x19xf32>) -> tensor<2xi64> - %1 = "tf.Reshape"(%arg1, %0) : (tensor<19x10xf32>, tensor<2xi64>) -> tensor<10x19xf32> - return %1 : tensor<10x19xf32> - } - } - )"; - - std::vector arg_shapes{TensorShape({10, 19}), - TensorShape({19, 10})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.6 - -ENTRY %main.6 (arg_tuple.1: (f32[10,19], f32[19,10])) -> (f32[10,19]) { - %arg_tuple.1 = (f32[10,19]{1,0}, f32[19,10]{1,0}) parameter(0), parameter_replication={false,true} - %get-tuple-element.2 = f32[10,19]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %arg_tuple.1), index=0 - %get-tuple-element.3 = f32[19,10]{1,0} get-tuple-element((f32[10,19]{1,0}, f32[19,10]{1,0}) %arg_tuple.1), index=1 - %reshape.4 = f32[10,19]{1,0} reshape(f32[19,10]{1,0} %get-tuple-element.3) - ROOT %tuple.5 = (f32[10,19]{1,0}) tuple(f32[10,19]{1,0} %reshape.4) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -TEST(CompileSerializedMlirToXlaHloTest, ShapeInference) { - constexpr char mlir_module[] = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<*xf32>, %arg1: tensor) -> tensor { - %0 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false} : (tensor<*xf32>, tensor) -> tensor - return %0 : tensor - } - } - )"; - - std::vector arg_shapes{TensorShape({10, 17}), - TensorShape({17, 19})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - - constexpr char expected_signature[] = - R"((arg_tuple.1: (f32[10,17], f32[17,19])) -> (f32[10,19]))"; - EXPECT_THAT(status_or_hlo_module.ValueOrDie()->ToString(), - ::testing::HasSubstr(expected_signature)); -} - -TEST(CompileSerializedMlirToXlaHloTest, ShapeInferenceAfterLegalization) { - constexpr char mlir_module[] = R"( - module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<8x16x16x64xbf16>, %arg1: tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) { - %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg1, %arg1, %arg1) {data_format = "NHWC", device = "", epsilon = 9.99999974E-5 : f32, exponential_avg_factor = 1.000000e+00 : f32, is_training = false} : (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, 
tensor<64xf32>, tensor<64xf32>) -> (tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32>) - return %0#0, %0#1, %0#2, %0#3, %0#4, %0#5 : tensor<8x16x16x64xbf16>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<*xf32> - } - } - )"; - - std::vector arg_shapes{TensorShape({8, 16, 16, 64}), - TensorShape({64})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - - constexpr char expected_signature[] = - R"(-> (bf16[8,16,16,64], f32[64], f32[64], f32[64], f32[64], f32[0]))"; - EXPECT_THAT(status_or_hlo_module.ValueOrDie()->ToString(), - ::testing::HasSubstr(expected_signature)); -} - -TEST(CompileSerializedMlirToXlaHloTest, ConstantFoldHook) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {producer = 179 : i32}} { - func @main() -> (tensor<0xi32>, tensor<0xi32>) { - %0 = "tf.Const"() {value = dense<[]> : tensor<0xi32>} : () -> tensor<0xi32> - %r0, %r1 = "tf.BroadcastGradientArgs"(%0, %0) {T = i32} : (tensor<0xi32>, tensor<0xi32>) -> (tensor<0xi32>, tensor<0xi32>) - return %r0, %r1 : tensor<0xi32>, tensor<0xi32> - } -} -)"; - - std::vector arg_shapes(2, TensorShape()); - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.4 - -ENTRY %main.4 (arg_tuple.1: ()) -> (s32[0], s32[0]) { - %arg_tuple.1 = () parameter(0) - %constant.2 = s32[0]{0} constant({}) - ROOT %tuple.3 = (s32[0]{0}, s32[0]{0}) tuple(s32[0]{0} %constant.2, s32[0]{0} %constant.2) -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -// The following xla::OpSharding protos are used: -// Serialized string: -// "\08\03\1A\02\01\02\22\02\00\01" -// Proto debug string: -// type: OTHER -// tile_assignment_dimensions: 1 -// tile_assignment_dimensions: 2 -// tile_assignment_devices: 0 -// tile_assignment_devices: 1 -// -// Serialized string: -// "\08\01\1A\01\01\22\01\00" -// Proto debug string: -// type: MAXIMAL -// tile_assignment_dimensions: 1 -// tile_assignment_devices: 0 -// -// Serialized string: -// "" -// Proto debug string (empty but would equivalent to): -// type: REPLICATED -TEST(CompileSerializedMlirToXlaHloTest, ArgumentSharding) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, %arg1: tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor<128x1024xf32> {mhlo.sharding = ""}) { - return - } -} -)"; - - std::vector arg_shapes{TensorShape({128, 
10}), - TensorShape({10, 1024}), - TensorShape({128, 1024})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.6 - -ENTRY %main.6 (arg_tuple.1: (f32[128,10], f32[10,1024], f32[128,1024])) -> () { - %arg_tuple.1 = (f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) parameter(0), sharding={{devices=[1,2]0,1}, {maximal device=0}, {replicated}} - %get-tuple-element.2 = f32[128,10]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=0 - %get-tuple-element.3 = f32[10,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=1 - %get-tuple-element.4 = f32[128,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=2 - ROOT %tuple.5 = () tuple() -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -TEST(CompileSerializedMlirToXlaHloTest, BadArgumentSharding) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {producer = 179 : i32}} { - func @main(%arg0: tensor<128x10xf32> {mhlo.sharding = "bad_sharding"}) { - return - } -} -)"; - - std::vector arg_shapes{TensorShape({128, 10})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - ASSERT_FALSE(s.ok()); - EXPECT_EQ(s.error_message(), - "failed to parse argument sharding 0 'bad_sharding'"); -} - -TEST(CompileSerializedMlirToXlaHloTest, ResultSharding) { - constexpr char mlir_module[] = R"( -module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 351 : i32}} { - func @main(%arg0: tensor<128x10xf32>, %arg1: tensor<10x1024xf32>, %arg2: tensor<128x1024xf32>) -> (tensor<128x10xf32> {mhlo.sharding = "\08\03\1A\02\01\02\22\02\00\01"}, tensor<10x1024xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor<128x1024xf32> {mhlo.sharding = ""}) { - return %arg0, %arg1, %arg2 : tensor<128x10xf32>, tensor<10x1024xf32>, tensor<128x1024xf32> - } -} -)"; - - std::vector arg_shapes{TensorShape({128, 10}), - TensorShape({10, 1024}), - TensorShape({128, 1024})}; - XlaCompiler::CompilationResult compilation_result; - - Status s = CompileSerializedMlirToXlaHlo( - mlir_module, arg_shapes, "XLA_CPU_JIT", - /*use_tuple_args=*/true, TestShapeRepresentation, &compilation_result); - TF_ASSERT_OK(s); - - const xla::HloModuleConfig module_config( - compilation_result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - compilation_result.computation->proto(), module_config); - TF_ASSERT_OK(status_or_hlo_module.status()); - constexpr char expected_hlo_module_string[] = R"(HloModule main.9 - -ENTRY %main.9 (arg_tuple.1: (f32[128,10], f32[10,1024], f32[128,1024])) -> (f32[128,10], f32[10,1024], f32[128,1024]) { - %arg_tuple.1 = (f32[128,10]{1,0}, f32[10,1024]{1,0}, 
f32[128,1024]{1,0}) parameter(0) - %get-tuple-element.2 = f32[128,10]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=0 - %reshape.5 = f32[128,10]{1,0} reshape(f32[128,10]{1,0} %get-tuple-element.2) - %get-tuple-element.3 = f32[10,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=1 - %reshape.6 = f32[10,1024]{1,0} reshape(f32[10,1024]{1,0} %get-tuple-element.3) - %get-tuple-element.4 = f32[128,1024]{1,0} get-tuple-element((f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) %arg_tuple.1), index=2 - %reshape.7 = f32[128,1024]{1,0} reshape(f32[128,1024]{1,0} %get-tuple-element.4) - ROOT %tuple.8 = (f32[128,10]{1,0}, f32[10,1024]{1,0}, f32[128,1024]{1,0}) tuple(f32[128,10]{1,0} %reshape.5, f32[10,1024]{1,0} %reshape.6, f32[128,1024]{1,0} %reshape.7), sharding={{devices=[1,2]0,1}, {maximal device=0}, {replicated}} -} - -)"; - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -// Verify that conversion from Graph to MLIR and empty shape representation -// function is successful. -TEST(CompileGraphToXlaHlo, Basic) { - FunctionLibraryDefinition flib_def(OpRegistry::Global(), {}); - Graph graph(OpRegistry::Global()); - - Node* arg = test::graph::Arg(&graph, 0, DT_FLOAT); - test::graph::Retval(&graph, 0, arg); - - XlaCompiler::CompilationResult result; - XlaCompiler::Argument compiler_arg; - compiler_arg.kind = XlaCompiler::Argument::kParameter; - compiler_arg.shape = TensorShape(); - - TF_ASSERT_OK( - CompileGraphToXlaHlo(graph, /*args=*/{compiler_arg}, "XLA_CPU_JIT", - /*use_tuple_args=*/false, flib_def, GraphDebugInfo(), - /*shape_representation_fn=*/nullptr, &result)); - - const xla::HloModuleConfig module_config( - result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - result.computation->proto(), module_config); - ASSERT_TRUE(status_or_hlo_module.ok()); - - constexpr char expected_hlo_module_string[] = R"(HloModule main.3 - -ENTRY %main.3 (Arg_0.1: f32[]) -> (f32[]) { - %Arg_0.1 = f32[] parameter(0) - ROOT %tuple.2 = (f32[]) tuple(f32[] %Arg_0.1) -} - -)"; - - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -// Tests a conversion from Graph to MLIR with resource arguments. 
-TEST(CompileGraphToXlaHlo, Resources) { - FunctionLibraryDefinition flib_def(OpRegistry::Global(), {}); - Graph graph(OpRegistry::Global()); - - Scope scope = Scope::NewRootScope().ExitOnError(); - auto val = ops::_Arg(scope.WithOpName("arg0"), DT_FLOAT, 0); - auto var = ops::_Arg(scope.WithOpName("arg1"), DT_RESOURCE, 1); - auto assign = - ops::AssignVariableOp(scope.WithOpName("assign_variable"), var, val); - TF_ASSERT_OK(scope.ToGraph(&graph)); - - XlaCompiler::CompilationResult result; - XlaCompiler::Argument arg0; - arg0.kind = XlaCompiler::Argument::kParameter; - arg0.shape = TensorShape({2}); - XlaCompiler::Argument arg1; - arg1.kind = XlaCompiler::Argument::kResource; - arg1.shape = TensorShape({2}); - arg1.type = DT_FLOAT; - - TF_ASSERT_OK( - CompileGraphToXlaHlo(graph, /*args=*/{arg0, arg1}, "XLA_CPU_JIT", - /*use_tuple_args=*/false, flib_def, GraphDebugInfo(), - /*shape_representation_fn=*/nullptr, &result)); - - EXPECT_EQ(result.outputs.size(), 0); - ASSERT_EQ(result.resource_updates.size(), 1); - const auto& resource_update = result.resource_updates[0]; - EXPECT_EQ(resource_update.input_index, 1); - EXPECT_EQ(resource_update.modified, true); - EXPECT_EQ(resource_update.shape, TensorShape({2})); - EXPECT_EQ(resource_update.type, DT_FLOAT); - - const xla::HloModuleConfig module_config( - result.computation->GetProgramShape().ValueOrDie()); - auto status_or_hlo_module = xla::HloModule::CreateFromProto( - result.computation->proto(), module_config); - ASSERT_TRUE(status_or_hlo_module.ok()); - - constexpr char expected_hlo_module_string[] = - R"(HloModule main.4, input_output_alias={ {0}: (1, {}, may-alias) } - -ENTRY %main.4 (Arg_0.1: f32[2], Arg_1.2: f32[2]) -> (f32[2]) { - %Arg_1.2 = f32[2]{0} parameter(1) - %Arg_0.1 = f32[2]{0} parameter(0) - ROOT %tuple.3 = (f32[2]{0}) tuple(f32[2]{0} %Arg_0.1) -} - -)"; - - EXPECT_EQ(expected_hlo_module_string, - status_or_hlo_module.ValueOrDie()->ToString()); -} - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc new file mode 100644 index 00000000000..98bfbbe608a --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.cc @@ -0,0 +1,113 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h" + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +// Converts non func AttrValue proto into an MLIR attribute. 
Func attribute is +// excluded in this function because the function might be renamed when the +// function definition is imported. +StatusOr ConvertNonFuncAttributeValue(const AttrValue& value, + mlir::Builder* builder) { + switch (value.value_case()) { + case AttrValue::kI: + return builder->getI64IntegerAttr(value.i()); + case AttrValue::kS: + return builder->getStringAttr(value.s()); + case AttrValue::kF: + return builder->getFloatAttr(builder->getF32Type(), value.f()); + case AttrValue::kB: + return builder->getBoolAttr(value.b()); + case AttrValue::kType: { + mlir::Type type; + TF_RETURN_IF_ERROR(ConvertDataType(value.type(), *builder, &type)); + return mlir::TypeAttr::get(type); + } + case AttrValue::kShape: + return ConvertTensorShapeProto(value.shape(), builder->getContext()); + case AttrValue::kTensor: + return ConvertTensorProto(value.tensor(), builder); + case AttrValue::kList: { + absl::InlinedVector attrs; + for (const auto& item : value.list().i()) + attrs.push_back(builder->getI64IntegerAttr(item)); + for (const auto& item : value.list().s()) + attrs.push_back(builder->getStringAttr(item)); + for (const auto& item : value.list().f()) + attrs.push_back(builder->getFloatAttr(builder->getF32Type(), item)); + for (const auto& item : value.list().b()) + attrs.push_back(builder->getBoolAttr(item)); + for (const auto& item : value.list().type()) { + mlir::Type type; + TF_RETURN_IF_ERROR(ConvertDataType(DataType(item), *builder, &type)); + attrs.push_back(mlir::TypeAttr::get(type)); + } + for (const auto& item : value.list().shape()) { + TF_ASSIGN_OR_RETURN( + auto attr, ConvertTensorShapeProto(item, builder->getContext())); + attrs.push_back(attr); + } + for (const auto& item : value.list().tensor()) { + TF_ASSIGN_OR_RETURN(auto attr, ConvertTensorProto(item, builder)); + attrs.push_back(attr); + } + if (!value.list().func().empty()) { + return tensorflow::errors::Unimplemented( + absl::StrCat("Attribute ", value.DebugString())); + } + return builder->getArrayAttr( + llvm::makeArrayRef(attrs.begin(), attrs.end())); + } + case AttrValue::VALUE_NOT_SET: + return builder->getUnitAttr(); + // kPlaceholder is not implemented. + default: + return tensorflow::errors::Unimplemented( + absl::StrCat("Attribute ", value.DebugString())); + } +} + +StatusOr ConvertAttributeValue(const AttrValue& value, + mlir::Builder* builder) { + switch (value.value_case()) { + case AttrValue::kFunc: { + // TODO(b/156546237): Unify kFunc/NameAttrList attribute representation. + // Currently kFunc/NameAttrList attributes in a kList/repeated AttrValue + // will not use this representation. + mlir::NamedAttrList attrs; + for (const auto& func_attr : value.func().attr()) { + TF_ASSIGN_OR_RETURN(auto attr, + ConvertAttributeValue(func_attr.second, builder)); + attrs.push_back(builder->getNamedAttr(func_attr.first, attr)); + } + auto func_attrs = builder->getDictionaryAttr(attrs); + return mlir::TF::FuncAttr::get(builder->getContext(), value.func().name(), + func_attrs); + } + default: + return ConvertNonFuncAttributeValue(value, builder); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h new file mode 100644 index 00000000000..c95ed60273d --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_ATTR_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_ATTR_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +using stream_executor::port::StatusOr; + +// Converts non func AttrValue proto into an MLIR attribute. Func attribute is +// excluded in this function because the function might be renamed when the +// function definition is imported. +StatusOr ConvertNonFuncAttributeValue(const AttrValue& value, + mlir::Builder* builder); + +// Converts all kinds of AttrValue proto into an MLIR attribute. +StatusOr ConvertAttributeValue(const AttrValue& value, + mlir::Builder* builder); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CONVERT_ATTR_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 05e1f059029..98328212c88 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -214,6 +214,20 @@ mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { return mlir::TF::ShapeAttr::get(type.getContext(), ArrayRef()); } +// Converts the tensor shape proto into an MLIR shape attribute. +StatusOr ConvertTensorShapeProto(const TensorShapeProto& shape, + mlir::MLIRContext* context) { + if (shape.unknown_rank()) + return mlir::TF::ShapeAttr::get(context, llvm::None); + + llvm::SmallVector dims; + dims.reserve(shape.dim().size()); + for (const auto& dim : shape.dim()) { + dims.push_back(dim.size()); + } + return mlir::TF::ShapeAttr::get(context, llvm::makeArrayRef(dims)); +} + // Converts an MLIR dense string elements attribute to a TensorFlow tensor // proto. void ConvertStringElementsAttr( diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h index e7cde4db936..294453ebcfd 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h @@ -48,6 +48,10 @@ PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type); // Converts an MLIR shaped type to a TensorFlow shape attribute. mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type); +// Converts a TensorFlow shape proto to an MLIR shape attribute. +StatusOr ConvertTensorShapeProto(const TensorShapeProto& shape, + mlir::MLIRContext* context); + // Converts an MLIR elements attribute to a TensorFlow tensor proto.
Status ConvertToTensorProto(mlir::ElementsAttr attr, TensorProto* output_tensor); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index febf2bc096d..6c1cab435d3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -182,4 +182,55 @@ std::string DumpRawStringToFile(llvm::StringRef name, llvm::StringRef content, return filepath; } +void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path) { + std::string path = dir_path.str(); + if (path.empty()) { + if (getenv("MLIR_CRASH_REPRODUCER_DIRECTORY")) + path = getenv("MLIR_CRASH_REPRODUCER_DIRECTORY"); + else if (getenv("TEST_UNDECLARED_OUTPUTS_DIR")) + path = "sponge"; + } + if (path.empty()) { + LOG_FIRST_N(INFO, 1) << "disabling MLIR crash reproducer, set env var " + "`MLIR_CRASH_REPRODUCER_DIRECTORY` to enable."; + return; + } + + // Output dirs "sponge" (case-insensitive) have a special meaning: Dump into + // the directory specified by the environment variable + // TEST_UNDECLARED_OUTPUTS_DIR. + string lower_path = absl::AsciiStrToLower(path); + if (lower_path == "sponge") { + if (!tensorflow::io::GetTestUndeclaredOutputsDir(&path)) { + LOG(ERROR) << "MLIR crash reproducer is set to '" << dir_path.str() + << "', but environment variable TEST_UNDECLARED_OUTPUTS_DIR " + "is not set, so cannot dump anywhere."; + return; + } + } + + auto* env = tensorflow::Env::Default(); + auto status = env->RecursivelyCreateDir(path); + if (!status.ok()) { + LOG(WARNING) << "cannot create directory '" + path + + "': " + status.error_message(); + return; + } + + path += "/mlir_reproducer_"; + + if (!tensorflow::Env::Default()->CreateUniqueFileName(&path, ".mlir")) { + LOG(WARNING) + << "cannot create unique filename, won't enable MLIR crash reproducer."; + return; + } + pm.enableCrashReproducerGeneration(path, /*genLocalReproducer=*/false); +} + +void applyTensorflowAndCLOptions(mlir::PassManager& pm, + llvm::StringRef dir_path) { + mlir::applyPassManagerCLOptions(pm); + SetCrashReproducer(pm, dir_path); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h index 726eed8974e..133285864f6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/core/platform/status.h" namespace tensorflow { @@ -64,6 +65,22 @@ std::string GetDumpDirFromEnvVar(); std::string DumpRawStringToFile(llvm::StringRef name, llvm::StringRef content, llvm::StringRef dirname = ""); +// Enable the crash reproducer on the provided PassManager to the provided +// directory path. If the provided path is empty, it is retrieved from the +// environment variable `MLIR_CRASH_REPRODUCER_DIRECTORY`. If the provided path +// is the string "sponge", the file will be included in the sponge "Output +// Files" by looking up the environment to infer the directory path. +void SetCrashReproducer(mlir::PassManager& pm, llvm::StringRef dir_path = ""); + +// This applies both the PassManagerCLOptions provided by MLIR along with any +// tensorflow specific options. 
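+// A minimal usage sketch (hypothetical, assuming an already-loaded `module`):
+//   mlir::PassManager pm(module.getContext());
+//   tensorflow::applyTensorflowAndCLOptions(pm);  // also wires up the crash reproducer
+//   tensorflow::CreateConvertMlirToXlaHloPipeline(pm, "XLA_CPU_JIT",
+//                                                 /*custom_legalization_passes=*/{});
+//   if (mlir::failed(pm.run(module))) { /* handle the reported diagnostics */ }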
+// +// Note that this function should be in a more appropriate file, but it is +// unclear what a proper file would be as no other functions would currently be +// in the file also. +void applyTensorflowAndCLOptions(mlir::PassManager& pm, + llvm::StringRef dir_path = ""); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 67c2aebf121..cad5f2bae98 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -127,11 +127,12 @@ Status ConvertAttribute(const mlir::FlatSymbolRefAttr& attr, AttrValue* value) { return Status::OK(); } -Status ConvertAttribute(const mlir::TF::FuncAttr& attr, AttrValue* value) { +Status ConvertAttribute(const mlir::TF::FuncAttr& attr, bool remove_ref_type, + AttrValue* value) { TF_RETURN_IF_ERROR( ConvertAttribute(attr.GetName().cast(), value)); TF_RETURN_IF_ERROR(ConvertAttributes(attr.GetAttrs().getValue(), - /*attrs_to_ignore=*/{}, + /*attrs_to_ignore=*/{}, remove_ref_type, value->mutable_func()->mutable_attr())); return Status::OK(); } @@ -159,15 +160,18 @@ Status ConvertAttribute(const mlir::StringAttr& attr, AttrValue* value) { return Status::OK(); } -Status ConvertAttribute(mlir::Type type, AttrValue* value) { +Status ConvertAttribute(mlir::Type type, bool remove_ref_type, + AttrValue* value) { DataType dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &dtype)); + if (tensorflow::IsRefType(dtype)) dtype = tensorflow::RemoveRefType(dtype); value->set_type(dtype); return Status::OK(); } -Status ConvertAttribute(const mlir::TypeAttr& type, AttrValue* value) { - return ConvertAttribute(type.getValue(), value); +Status ConvertAttribute(const mlir::TypeAttr& type, bool remove_ref_type, + AttrValue* value) { + return ConvertAttribute(type.getValue(), remove_ref_type, value); } Status ConvertAttribute(const mlir::UnitAttr& attr, AttrValue* value) { @@ -175,7 +179,8 @@ Status ConvertAttribute(const mlir::UnitAttr& attr, AttrValue* value) { return Status::OK(); } -Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { +Status ConvertAttribute(const mlir::ArrayAttr& attr, bool remove_ref_type, + AttrValue* value) { auto* list = value->mutable_list(); for (mlir::Attribute a : attr.getValue()) { if (auto attr = a.dyn_cast()) { @@ -215,7 +220,8 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { if (auto shaped_type = elt_type.dyn_cast()) { elt_type = shaped_type.getElementType(); } - TF_RETURN_IF_ERROR(ConvertAttribute(elt_type, &attr_val)); + TF_RETURN_IF_ERROR( + ConvertAttribute(elt_type, remove_ref_type, &attr_val)); list->add_type(attr_val.type()); } else if (auto attr = a.dyn_cast()) { AttrValue attr_val; @@ -228,18 +234,6 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { return Status::OK(); } -// Updates NodeDef constructed out of an MLIR Case/IfW/While op to map it to -// either TensorFlow StatelessX or X op depending on the additional attribute. -void UpdateCompositeOp(NodeDef* node_def) { - auto it = node_def->mutable_attr()->find("is_stateless"); - if (it != node_def->attr().end()) { - if (it->second.b()) { - *node_def->mutable_op() = "Stateless" + node_def->op(); - } - node_def->mutable_attr()->erase(it); - } -} - // Returns true if the executor/control dialect op should map to Ref node in // TensorFlow Graph. 
For control dialect NextIteration it uses the 1st operand // type. For executor dialect NextIteration it uses the 2nd operand type. For @@ -291,7 +285,6 @@ StatusOr GetTensorFlowOpName(llvm::StringRef op_name) { } StatusOr> GetOperationNodeDef( - const absl::flat_hash_set& attrs_to_ignore, mlir::Operation* inst, llvm::StringRef name) { auto node_def = absl::make_unique(); // Note: we do not use NodeBuilder or NodeDefBuilder as that would require @@ -321,6 +314,14 @@ StatusOr> GetOperationNodeDef( node_def->set_name(name.str()); node_def->set_op(std::string(op_name.str())); + // Update NodeDef constructed out of an MLIR Case/If/While op to map it to + // either TensorFlow StatelessX or X op depending on the additional attribute. + if (llvm::isa(inst)) { + auto stateless = inst->getAttrOfType("is_stateless"); + if (stateless && stateless.getValue()) + *node_def->mutable_op() = "Stateless" + node_def->op(); + } + // Add inputs to the NodeDef based on the number of operands. This is required // as later when edges are added to the Node using Graph::AddEdge the // associated NodeDef is not updated. @@ -331,27 +332,17 @@ StatusOr> GetOperationNodeDef( node_def->set_device(std::string(attr.getValue())); } - // Add the node attributes. - TF_RETURN_WITH_CONTEXT_IF_ERROR( - ConvertAttributes(inst->getAttrs(), attrs_to_ignore, - node_def->mutable_attr()), - "while converting attributes for node: ", name.str()); - // Add the node debug info. TF_RETURN_IF_ERROR(ConvertLocation( inst->getLoc(), node_def->mutable_experimental_debug_info())); - if (node_def->op() == "Case") UpdateCompositeOp(node_def.get()); - if (node_def->op() == "If") UpdateCompositeOp(node_def.get()); - if (node_def->op() == "While") UpdateCompositeOp(node_def.get()); - return node_def; } Status ConvertAttributes( const llvm::ArrayRef attrs, const absl::flat_hash_set& attrs_to_ignore, - AttrValueMap* values) { + bool remove_ref_type, AttrValueMap* values) { AttrValueMap func_call_attrs; for (const mlir::NamedAttribute& named_attr : attrs) { auto name_strref = named_attr.first.str(); @@ -376,7 +367,7 @@ Status ConvertAttributes( continue; } if (auto func_attr = attr.dyn_cast()) { - TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, &value)); + TF_RETURN_IF_ERROR(ConvertAttribute(func_attr, remove_ref_type, &value)); func_call_attrs[string(name)] = value; continue; } @@ -388,11 +379,13 @@ Status ConvertAttributes( TF_RETURN_IF_ERROR( llvm::TypeSwitch(attr) .Case( - [&](auto derived_attr) { - return ConvertAttribute(derived_attr, &value); - }) + mlir::StringAttr, mlir::ElementsAttr, mlir::UnitAttr, + mlir::TF::ShapeAttr>([&](auto derived_attr) { + return ConvertAttribute(derived_attr, &value); + }) + .Case([&](auto derived_attr) { + return ConvertAttribute(derived_attr, remove_ref_type, &value); + }) .Default([&](mlir::Attribute) { return errors::Unimplemented( "Unhandled attribute kind for attribute '", name_strref, @@ -419,28 +412,6 @@ Status ConvertAttributes( return Status::OK(); } -// Sets type attribute with the given name. If the attribute already exists with -// a different value, returns an error. 
-Status SetTypeAttribute(absl::string_view name, mlir::Type type, - AttrValueMap* values) { - DataType dtype; - TF_RETURN_IF_ERROR(ConvertScalarTypeToDataType(type, &dtype)); - if (tensorflow::IsRefType(dtype)) dtype = tensorflow::RemoveRefType(dtype); - AttrValue value; - value.set_type(dtype); - - auto result = values->insert({string(name), value}); - if (!result.second) { - DataType actual_dtype = result.first->second.type(); - if (actual_dtype != dtype) { - return errors::InvalidArgument("Expected ", DataType_Name(dtype), " '", - name, "' attribute but found ", - DataType_Name(actual_dtype)); - } - } - return Status::OK(); -} - Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shaped_type, AttrValueMap* values) { tensorflow::TensorShapeProto tshape; @@ -469,26 +440,6 @@ Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shaped_type, return Status::OK(); } -Status SetSizeAttribute(absl::string_view name, size_t size, - AttrValueMap* values) { - AttrValue value; - value.set_i(size); - - auto result = values->insert({string(name), value}); - if (!result.second) { - // This should be extremely rare as it means we are adding the same - // attribute multiple times/have some redundancy in representing this - // attribute. - size_t actual_size = result.first->second.i(); - // Just check via string output as we shouldn't get here and if we do they - // should be trivially the same, else fail. - if (actual_size != size) - return errors::InvalidArgument("Expected '", name, "' attribute to be ", - size, " but found ", actual_size); - } - return Status::OK(); -} - bool IsLegacyCallInstruction(mlir::Operation* inst) { return llvm::dyn_cast(inst); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 58fe39fa4e8..d1e0fd12f26 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -50,12 +50,8 @@ Status AddTensorFlowOpPrefix(std::string); StatusOr GetTensorFlowOpName(llvm::StringRef); // Converts an MLIR operation to TensorFlow NodeDef with given node name. This -// name should be unique to the graph it is being inserted into. `op_name_func` -// is to map the op name of `inst` to its op name in TensorFlow. "name" and -// "device" attributes are ignored by default. Use attrs_to_ignore to specify -// any other attributes that should be ignored. +// name should be unique to the graph it is being inserted into. StatusOr> GetOperationNodeDef( - const absl::flat_hash_set& attrs_to_ignore, mlir::Operation* inst, llvm::StringRef name); // Converts MLIR attributes with values to their tensorflow equivalent. @@ -64,23 +60,13 @@ StatusOr> GetOperationNodeDef( Status ConvertAttributes( const llvm::ArrayRef attrs, const absl::flat_hash_set& attrs_to_ignore, - AttrValueMap* values); - -// Sets type attribute with the given name. If the attribute already exists with -// a different value, returns an error. -Status SetTypeAttribute(absl::string_view name, mlir::Type type, - AttrValueMap* values); + bool remove_ref_type, AttrValueMap* values); // Sets shape attribute with the given name. If the attribute already exists // with a different value, returns an error. Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shape, AttrValueMap* values); -// Sets the given size_t value as an integer attribute with the given name. -// If the attribute already exists with a different value, returns an error. 
-Status SetSizeAttribute(absl::string_view name, size_t size, - AttrValueMap* values); - // Returns true if the given instruction is an mlir::TF::LegacyCallOp or the // result of such an operation transformed by the // ExecutorToControlDialectConversion pass. diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc new file mode 100644 index 00000000000..8e9495c0454 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" + +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/core/platform/errors.h" + +namespace tensorflow { + +std::string SerializeMlirModule(mlir::ModuleOp module_op) { + std::string serialized_mlir_module; + llvm::raw_string_ostream os(serialized_mlir_module); + mlir::OpPrintingFlags print_flags; + print_flags.enableDebugInfo(); + module_op.print(os, print_flags); + return std::move(os.str()); +} + +Status DeserializeMlirModule(llvm::StringRef serialized_mlir_module, + mlir::MLIRContext* mlir_context, + mlir::OwningModuleRef* mlir_module) { + TF_RET_CHECK(!serialized_mlir_module.empty()) + << "unexpected empty serialized MLIR module string"; + TF_RET_CHECK(mlir_module) << "unexpected null MLIR module pointer"; + + // Make sure we catch any error reported by MLIR and forward it to the TF + // error reporting system. + mlir::StatusScopedDiagnosticHandler error_handler(mlir_context); + + // Parse the module. + *mlir_module = mlir::parseSourceString(serialized_mlir_module, mlir_context); + if (!*mlir_module) + return error_handler.Combine( + errors::InvalidArgument("could not parse MLIR module")); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h new file mode 100644 index 00000000000..12d1c39132e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { + +// Prints an MLIR module `module_op` and returns it as a string. +std::string SerializeMlirModule(mlir::ModuleOp module_op); + +// Parses an MLIR module from `serialized_mlir_module` into `mlir_module` with +// context `mlir_context`. +Status DeserializeMlirModule(llvm::StringRef serialized_mlir_module, + mlir::MLIRContext* mlir_context, + mlir::OwningModuleRef* mlir_module); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SERIALIZE_MLIR_MODULE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.cc new file mode 100644 index 00000000000..d82d61ecf9e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.cc @@ -0,0 +1,414 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h" + +#include +#include +#include +#include + +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/mlir/utils/array_container_utils.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/types.h" + +#define DEBUG_TYPE "tf-shape-inference-utils" + +using ::tensorflow::int64; +using tensorflow::shape_inference::DimensionHandle; +using tensorflow::shape_inference::InferenceContext; +using tensorflow::shape_inference::ShapeHandle; + +namespace mlir { +namespace TF { + +namespace { + +// Extracts attributes from a MLIR operation, including derived attributes, into +// one NamedAttrList. +NamedAttrList GetAllAttributesFromOperation(Operation* op) { + NamedAttrList attr_list; + attr_list.append(op->getAttrDictionary().getValue()); + + if (auto derived = dyn_cast(op)) { + auto materialized = derived.materializeDerivedAttributes(); + attr_list.append(materialized.getValue()); + } + + return attr_list; +} + +// Extracts a PartialTensorShape from the MLIR type. +Optional GetShapeFromMlirType(Type t) { + if (auto ranked_type = t.dyn_cast()) { + // Convert the MLIR shape indices (int64_t) to TensorFlow indices + // (int64). + ArrayRef shape = ranked_type.getShape(); + SmallVector tf_shape(shape.begin(), shape.end()); + return tensorflow::PartialTensorShape( + MutableArrayRefToSpan(tf_shape)); + } + return None; +} + +// Gets the subtype's shape and data type for `type`. Templated to support both +// ResourceType and VariantType. 
+template +std::unique_ptr>> +GetSubtypesHelper(Type type) { + auto type_with_subtypes = + type.cast().getElementType().dyn_cast(); + if (!type_with_subtypes || type_with_subtypes.getSubtypes().empty()) { + return nullptr; + } + auto shapes_and_types = std::make_unique>>(); + for (auto subtype : type_with_subtypes.getSubtypes()) { + auto shape = GetShapeFromMlirType(subtype); + // handle_shapes_and_types requires all shapes to be known. So if any + // subtype is unknown, clear the vector. + if (!shape) { + shapes_and_types = nullptr; + break; + } + tensorflow::DataType dtype; + auto status = + tensorflow::ConvertToDataType(subtype.getElementType(), &dtype); + assert(status.ok() && "Unknown element type"); + shapes_and_types->emplace_back(*shape, dtype); + } + return shapes_and_types; +} + +// Gets the subtype's shape and data type for `type`. +std::unique_ptr>> +GetSubtypes(Type type) { + auto subclasses = GetSubtypesHelper(type); + if (subclasses) return subclasses; + return GetSubtypesHelper(type); +} + +// Returns a shape inference function call failure at `location`. +LogicalResult EmitErrorFromShapeFunction(Optional location, + StringRef op_name, + StringRef error_message) { + LLVM_DEBUG(llvm::dbgs() << "Shape inference error for '" << op_name + << "': " << error_message << "\n"); + return emitOptionalError( + location, + llvm::formatv( + "TensorFlow shape inference function errored for op '{0}': {1}", + op_name, error_message) + .str()); +} + +// Extracts shape from a shape handle and inference context. +Optional> GetShapeFromHandle(InferenceContext& context, + const ShapeHandle& sh) { + if (!context.RankKnown(sh)) return None; + SmallVector shape; + for (int dim : llvm::seq(0, context.Rank(sh))) + shape.push_back(context.Value(context.Dim(sh, dim))); + return shape; +} + +// Creates a tensor type from a shape handle and element type. +TensorType CreateTensorType(InferenceContext& context, const ShapeHandle& sh, + Type element_type) { + auto shape = GetShapeFromHandle(context, sh); + if (shape.hasValue()) + return RankedTensorType::get(shape.getValue(), element_type); + return UnrankedTensorType::get(element_type); +} + +// Creates a ShapedTypeComponent from a shape handle and element type. +ShapedTypeComponents CreateShapedTypeComponents(InferenceContext& context, + const ShapeHandle& sh, + Type element_type) { + auto shape = GetShapeFromHandle(context, sh); + if (shape.hasValue()) + return ShapedTypeComponents(shape.getValue(), element_type); + return ShapedTypeComponents(element_type); +} + +} // namespace + +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes) { + assert(op->getName().getDialect() == + TensorFlowDialect::getDialectNamespace()); + + auto op_name_or = + tensorflow::GetTensorFlowOpName(op->getName().getStringRef()); + if (!op_name_or.ok()) { + LLVM_DEBUG(llvm::dbgs() << "Skipping inference for unregistered op '" + << op->getName().getStringRef() << "'.\n"); + return emitOptionalError(location, "op is unregistered"); + } + llvm::StringRef op_name = op_name_or.ConsumeValueOrDie(); + + // Get information from the registry and check if we have a shape function for + // this op. 
+ const tensorflow::OpRegistrationData* op_reg_data = + tensorflow::OpRegistry::Global()->LookUp(op_name.str()); + if (!op_reg_data) { + LLVM_DEBUG(llvm::dbgs() << "Skipping inference for unregistered op '" + << op_name << "'.\n"); + return emitOptionalError(location, "op is unregistered"); + } + if (!op_reg_data->shape_inference_fn) { + LLVM_DEBUG(llvm::dbgs() + << "Skipping inference for op without shape function '" + << op_name << "'.\n"); + return emitOptionalError(location, "missing shape function"); + } + + // Convert the operation attributes to be able to use the InferenceContext + // and the TensorFlow shape function. + tensorflow::AttrValueMap attributes; + auto attr_status = tensorflow::GetAttrValuesFromOperation( + op, op_name, op_reg_data, /*ignore_unregistered_attrs=*/true, + &attributes); + if (!attr_status.ok()) { + LLVM_DEBUG(llvm::dbgs() << "Error creating attribute map for '" << op_name + << "': " << attr_status.error_message() << "\n"); + return emitOptionalError(location, attr_status.error_message()); + } + + // Collect an array with input values for constant operands and input shapes + // for all the operands. + const int num_operands = op->getNumOperands(); + std::vector input_tensors(num_operands); + std::vector input_shapes(num_operands); + std::vector tensors(num_operands); + std::vector>>> + handle_shapes_and_types(num_operands); + for (auto it : llvm::enumerate(op->getOperands())) { + Value operand = it.value(); + size_t index = it.index(); + + // If the operand is constant, then convert it to Tensor. + if (auto attr = operand_as_constant_fn(operand)) { + tensorflow::Tensor* input_tensor = &tensors[index]; + auto status = + tensorflow::ConvertToTensor(attr.cast(), input_tensor); + if (status.ok()) { + input_tensors[index] = input_tensor; + } else { + LLVM_DEBUG(llvm::dbgs() << "Error converting input " << index + << " of op '" << op_name << "' to Tensor: " + << status.error_message() << "\n"); + } + } + + Type operand_type = operand.getType(); + if (auto shape = GetShapeFromMlirType(operand_type)) { + input_shapes[index] = *shape; + } + // Collect the handle shapes and types for a resource/variant. + handle_shapes_and_types[index] = GetSubtypes(operand_type); + } + + // Perform the shape inference using an InferenceContext with the input + // shapes. This object is abstracting the information that the ShapeInference + // function operates on. + InferenceContext c(graph_version, tensorflow::AttrSlice(&attributes), + op_reg_data->op_def, input_shapes, input_tensors, + /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); + auto status = c.Run(op_reg_data->shape_inference_fn); + if (!status.ok()) + return EmitErrorFromShapeFunction(location, op_name, + status.error_message()); + + // Determine if, during shape computation, the shape functions attempted to + // query an input operand as shape where the input was not known/constant. + bool requires_inputs = + any_of(llvm::seq(0, c.num_inputs()), [&](int input) { + return c.requested_input_tensor_as_partial_shape(input) && + !input_tensors[input]; + }); + if (requires_inputs) { + LLVM_DEBUG(llvm::dbgs() << "\trequired input\n"); + std::vector input_tensors_as_shapes; + for (int input : llvm::seq(0, c.num_inputs())) { + if (c.requested_input_tensor_as_partial_shape(input) && + !input_tensors[input]) { + LLVM_DEBUG(llvm::dbgs() << "Requesting " << input << " as shape\n"); + auto op_result = op->getOperand(input).dyn_cast(); + if (!op_result) continue; + // Resize on first valid shape computed. 
+ input_tensors_as_shapes.resize(c.num_inputs()); + auto handle = op_result_as_shape_fn(c, op_result); + LLVM_DEBUG(llvm::dbgs() << "Requested " << input << " as shape " + << (handle.Handle() ? "found" : "not found")); + if (handle.Handle()) input_tensors_as_shapes[input] = handle; + } + } + + // Attempt to compute the unknown operands as shapes. + // Note: in the case where no partial outputs could be computed, this + // would be empty. + if (!input_tensors_as_shapes.empty()) { + c.set_input_tensors_as_shapes(input_tensors_as_shapes); + auto status = c.Run(op_reg_data->shape_inference_fn); + if (!status.ok()) + return EmitErrorFromShapeFunction(location, op_name, + status.error_message()); + } + } + + // Update the shape for each of the operation result if the InferenceContext + // has more precise shapes recorded. + for (int output : llvm::seq(0, c.num_outputs())) { + ShapeHandle shape_handle = c.output(output); + LLVM_DEBUG(llvm::dbgs() << "Inferred output " << output << " : " + << c.DebugString(shape_handle) << "\n"); + + Type new_element_type = result_element_type_fn(output); + // Populate the handle shapes for a resource/variant. + if (new_element_type && + new_element_type.isa()) { + auto handle_shapes_types = c.output_handle_shapes_and_types(output); + if (handle_shapes_types) { + SmallVector subtypes; + Builder b(op->getContext()); + for (const auto& shape_n_type : *handle_shapes_types) { + Type element_type; + auto status = + tensorflow::ConvertDataType(shape_n_type.dtype, b, &element_type); + assert(status.ok() && "Unknown element type"); + subtypes.push_back( + CreateTensorType(c, shape_n_type.shape, element_type)); + } + if (new_element_type.isa()) { + new_element_type = TF::ResourceType::get(subtypes, op->getContext()); + } else { + new_element_type = TF::VariantType::get(subtypes, op->getContext()); + } + } + } + inferred_return_shapes.push_back( + CreateShapedTypeComponents(c, shape_handle, new_element_type)); + } + + return success(); +} + +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + SmallVectorImpl& inferred_return_shapes) { + if (auto type_op = dyn_cast(op)) { + auto attributes = GetAllAttributesFromOperation(op); + SmallVector inferred_return_types; + auto result = type_op.inferReturnTypes( + op->getContext(), location, op->getOperands(), + DictionaryAttr::get(attributes, op->getContext()), op->getRegions(), + inferred_return_types); + if (failed(result)) return failure(); + + inferred_return_shapes.resize(inferred_return_types.size()); + for (auto inferred_return_type : llvm::enumerate(inferred_return_types)) { + if (auto shaped_type = + inferred_return_type.value().dyn_cast()) { + if (shaped_type.hasRank()) { + inferred_return_shapes[inferred_return_type.index()] = + ShapedTypeComponents(shaped_type.getShape(), + shaped_type.getElementType()); + } else { + inferred_return_shapes[inferred_return_type.index()] = + ShapedTypeComponents(shaped_type.getElementType()); + } + } + } + + return success(); + } + + if (auto shape_type_op = dyn_cast(op)) { + auto attributes = GetAllAttributesFromOperation(op); + return shape_type_op.inferReturnTypeComponents( + op->getContext(), location, op->getOperands(), + DictionaryAttr::get(attributes, op->getContext()), op->getRegions(), + inferred_return_shapes); + } + + auto operand_as_constant_fn = [](Value operand) -> Attribute { + Attribute attr; + if (matchPattern(operand, m_Constant(&attr))) return attr; + return nullptr; + }; + + auto op_result_as_shape_fn = 
[](InferenceContext& ic, + OpResult op_result) -> ShapeHandle { + auto rt = op_result.getType().dyn_cast(); + if (!rt || rt.getRank() != 1 || !rt.hasStaticShape()) return {}; + + std::vector dims(rt.getDimSize(0), ic.UnknownDim()); + Attribute attr; + if (matchPattern(op_result, m_Constant(&attr))) { + auto elements = attr.dyn_cast(); + if (elements) + for (auto element : llvm::enumerate(elements.getIntValues())) + dims[element.index()] = ic.MakeDim(element.value().getSExtValue()); + } + return ic.MakeShape(dims); + }; + + auto result_element_type_fn = [](int) -> Type { return nullptr; }; + + return InferReturnTypeComponentsForTFOp( + location, op, graph_version, operand_as_constant_fn, + op_result_as_shape_fn, result_element_type_fn, inferred_return_shapes); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h new file mode 100644 index 00000000000..eda2bc49514 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ + +#include + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/core/framework/shape_inference.h" + +namespace mlir { +namespace TF { + +// Function that takes in a value and extracts a constant from it, if available. +// If the value cannot be resolved as a constant, a nullptr will be returned. +// Certain shape functions require constant values as arguments. +using OperandAsConstantFn = llvm::function_ref; + +// Function that takes in an operation result and computes a shape (can be +// partial) value. Certain shape functions require shape values as arguments. +using OpResultAsShapeFn = + llvm::function_ref; + +// Function that takes a result index and returns the element type. Element +// types are necessary for handle types (resource, variant). +using ResultElementTypeFn = llvm::function_ref; + +// Runs TensorFlow shape inference associated to the op type registered in the +// TensorFlow op registry based on the Graph version, operands, and attributes. +// Invoking this shape function will create conversions of parameters to the +// TensorFlow Graph equivalent data structures and back to MLIR equivalent data +// structures. 
This does not use shape inference natively implemented in MLIR; +// it is a temporary measure until shape functions are reimplemented/migrated +// from the TensorFlow op registry to MLIR. +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + OperandAsConstantFn operand_as_constant_fn, + OpResultAsShapeFn op_result_as_shape_fn, + ResultElementTypeFn result_element_type_fn, + SmallVectorImpl& inferred_return_shapes); + +// Runs TensorFlow shape inference for an operation for a given Graph version. +// If an operation implements the `InferTypeOpInterface` or +// `InferShapedTypeOpInterface` interfaces, those are used instead but with +// derived attributes populated. Otherwise the above function is used, but with +// default `operand_as_constant_fn` and `op_result_as_shape_fn` that only +// extract a value if the operands are constant (no partial evaluation), and an +// empty `result_element_type_fn`. Element types with subtypes (DT_RESOURCE, +// DT_VARIANT) are not supported. +LogicalResult InferReturnTypeComponentsForTFOp( + Optional location, Operation* op, int64_t graph_version, + SmallVectorImpl& inferred_return_shapes); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc new file mode 100644 index 00000000000..bcc3fe62f99 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc @@ -0,0 +1,334 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include +#include +#include + +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/compiler/mlir/utils/string_container_utils.h" +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" +#include "tensorflow/compiler/tf2xla/xla_argument.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" + +// NOLINTNEXTLINE +llvm::cl::opt input_types( + "tf-xla-input-types", + llvm::cl::desc("XLA input argument types (kinds), separated by ','. " + "Supported types include ['parameter', 'resource']. 
If " + "empty, all arguments are assumed to be parameters."), + llvm::cl::init("")); + +namespace tensorflow { + +namespace { + +mlir::LogicalResult PrintHloModuleText( + const XlaCompilationResult& compilation_result, llvm::raw_ostream& output) { + const xla::HloModuleConfig module_config( + compilation_result.computation->GetProgramShape().ValueOrDie()); + auto status_or_hlo_module = xla::HloModule::CreateFromProto( + compilation_result.computation->proto(), module_config); + if (!status_or_hlo_module.ok()) { + LOG(ERROR) << "Conversion to HLO module failed: " + << status_or_hlo_module.status().ToString(); + return mlir::failure(); + } + + xla::HloModule* hlo_module = status_or_hlo_module.ValueOrDie().get(); + + output << hlo_module->ToString(); + + if (!compilation_result.input_mapping.empty()) + output << "// InputMapping {" + << absl::StrJoin(compilation_result.input_mapping, ", ") << "}\n"; + + for (const auto& xla_input_shape : compilation_result.xla_input_shapes) + output << "// XlaInputShape " << xla_input_shape.ToString() << '\n'; + + output << "// XlaOutputShape " + << compilation_result.xla_output_shape.ToString() << '\n'; + + for (const auto& xla_output_description : compilation_result.outputs) { + output << "// XlaOutputDescription type=" + << DataTypeString(xla_output_description.type) << " shape=(" + << absl::StrJoin(xla_output_description.shape.dim_sizes(), ", ") + << ')'; + if (xla_output_description.input_index >= 0) + output << " input_index=" << xla_output_description.input_index; + if (xla_output_description.is_constant) output << " constant"; + if (xla_output_description.is_tensor_list) output << " tensor_list"; + output << '\n'; + } + + for (const auto& resource_update : compilation_result.resource_updates) { + output << "// ResourceUpdate input_index=" << resource_update.input_index + << " type=" << DataTypeString(resource_update.type) << " shape=(" + << absl::StrJoin(resource_update.shape.dim_sizes(), " ") << ')'; + if (resource_update.modified) output << " modified"; + output << '\n'; + } + + return mlir::success(); +} + +Status ParseArgumentShapes( + absl::string_view input_shapes_str, + llvm::SmallVectorImpl& arg_shapes) { + arg_shapes.clear(); + std::vector> input_shapes_vector; + TF_RETURN_IF_ERROR(ParseNodeShapes(input_shapes_str, input_shapes_vector)); + arg_shapes.resize(input_shapes_vector.size()); + for (const auto& shape : llvm::enumerate(input_shapes_vector)) + TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape( + shape.value(), &arg_shapes[shape.index()].shape)); + + return Status::OK(); +} + +Status ParseDataTypes(absl::string_view data_types_str, + llvm::SmallVectorImpl& data_types) { + data_types.clear(); + std::vector input_dtypes_vector; + TF_RETURN_IF_ERROR(ParseNodeDataTypes(data_types_str, input_dtypes_vector)); + data_types.resize(input_dtypes_vector.size(), DT_INVALID); + for (auto data_type : llvm::enumerate(input_dtypes_vector)) { + if (!DataType_Parse(data_type.value(), &data_types[data_type.index()])) + return errors::InvalidArgument("Invalid dtype at index ", + data_type.index(), ": ", + data_type.value()); + const auto& resolved_dtype = data_types[data_type.index()]; + if (resolved_dtype == DT_INVALID || resolved_dtype == DT_STRING || + resolved_dtype == DT_RESOURCE || resolved_dtype == DT_VARIANT || + IsRefType(resolved_dtype)) + return errors::InvalidArgument("Unsupported dtype at index ", + data_type.index(), ": ", + data_type.value()); + } + + return Status::OK(); +} + +Status ParseArgumentKinds( + absl::string_view input_types_str, + 
llvm::SmallVectorImpl& argument_kinds) { + argument_kinds.clear(); + if (input_types_str.empty()) return Status::OK(); + + std::vector argument_kind_strs = + absl::StrSplit(input_types_str, ','); + argument_kinds.reserve(argument_kind_strs.size()); + for (const auto& argument_kind_str : llvm::enumerate(argument_kind_strs)) { + const auto& value = argument_kind_str.value(); + if (value == "parameter") { + argument_kinds.push_back(XlaArgument::Kind::kParameter); + } else if (value == "resource") { + argument_kinds.push_back(XlaArgument::Kind::kResource); + } else { + return errors::InvalidArgument( + "Unsupported TF/XLA argument kind at index ", + argument_kind_str.index(), ": ", value); + } + } + + return Status::OK(); +} + +Status ParseXlaArguments(absl::string_view input_shapes_str, + absl::string_view input_dtypes_str, + absl::string_view arg_kinds_str, + llvm::SmallVectorImpl& xla_arguments) { + xla_arguments.clear(); + std::vector> input_shapes_vector; + TF_RETURN_IF_ERROR( + tensorflow::ParseNodeShapes(input_shapes_str, input_shapes_vector)); + llvm::SmallVector dtypes_vector; + TF_RETURN_IF_ERROR(ParseDataTypes(input_dtypes_str, dtypes_vector)); + llvm::SmallVector arg_kinds_vector; + TF_RETURN_IF_ERROR(ParseArgumentKinds(arg_kinds_str, arg_kinds_vector)); + + if (input_shapes_vector.empty()) + input_shapes_vector.resize(dtypes_vector.size()); + + if (arg_kinds_vector.empty()) + arg_kinds_vector.resize(input_shapes_vector.size(), + XlaArgument::Kind::kParameter); + + if (input_shapes_vector.size() != dtypes_vector.size() || + input_shapes_vector.size() != arg_kinds_vector.size()) + return errors::InvalidArgument( + "Input shapes, dtypes, and types/kinds must be of the same " + "length, but got ", + input_shapes_vector.size(), ", ", dtypes_vector.size(), ", and ", + arg_kinds_vector.size(), " respectively"); + + xla_arguments.resize(input_shapes_vector.size()); + for (const auto& arg_components : + llvm::zip(xla_arguments, input_shapes_vector, dtypes_vector, + arg_kinds_vector)) { + XlaArgument& arg = std::get<0>(arg_components); + TensorShape shape; + TF_RETURN_IF_ERROR( + TensorShapeUtils::MakeShape(std::get<1>(arg_components), &shape)); + arg.shape = std::move(shape); + arg.type = std::get<2>(arg_components); + arg.kind = std::get<3>(arg_components); + } + + return Status::OK(); +} + +} // anonymous namespace + +static mlir::LogicalResult MlirTfToHloTextTranslateFunction( + mlir::ModuleOp module_op, llvm::raw_ostream& output) { + if (!module_op) return mlir::failure(); + + llvm::SmallVector arg_shapes; + auto args_status = + ParseArgumentShapes(mlir::StringRefToView(input_shapes), arg_shapes); + if (!args_status.ok()) { + LOG(ERROR) << args_status.ToString(); + return mlir::failure(); + } + + XlaCompilationResult compilation_result; + auto compilation_status = CompileMlirToXlaHlo( + module_op, arg_shapes, /*device_type=*/"XLA_CPU_JIT", emit_use_tuple_arg, + emit_return_tuple, IdentityShapeRepresentationFn(), &compilation_result, + /*custom_legalization_passes=*/{}); + if (!compilation_status.ok()) { + LOG(ERROR) << "TF/XLA compilation failed: " + << compilation_status.ToString(); + return mlir::failure(); + } + + return PrintHloModuleText(compilation_result, output); +} + +static mlir::LogicalResult MlirTfGraphToHloTextTranslateFunction( + mlir::ModuleOp module_op, llvm::raw_ostream& output) { + if (!module_op) return mlir::failure(); + + llvm::SmallVector xla_arguments; + auto args_status = ParseXlaArguments( + mlir::StringRefToView(input_shapes), 
mlir::StringRefToView(input_dtypes), + mlir::StringRefToView(input_types), xla_arguments); + if (!args_status.ok()) { + LOG(ERROR) << args_status.ToString(); + return mlir::failure(); + } + + XlaCompilationResult compilation_result; + auto compilation_status = CompileGraphToXlaHlo( + module_op, xla_arguments, /*device_type=*/"XLA_CPU_JIT", + emit_use_tuple_arg, emit_return_tuple, IdentityShapeRepresentationFn(), + &compilation_result, /*custom_legalization_passes=*/{}); + if (!compilation_status.ok()) { + LOG(ERROR) << "TF/XLA compilation failed: " + << compilation_status.ToString(); + return mlir::failure(); + } + + return PrintHloModuleText(compilation_result, output); +} + +static void RegisterMlirInputDialects(mlir::DialectRegistry& registry) { + registry.insert(); +} + +static void RegisterGraphInputDialects(mlir::DialectRegistry& registry) { + RegisterMlirInputDialects(registry); + registry.insert(); +} + +static mlir::OwningModuleRef SerializedMlirStringAttrToMlirModuleTranslate( + llvm::StringRef input, mlir::MLIRContext* context) { + mlir::Attribute attr = mlir::parseAttribute(input, context); + if (!attr || !attr.isa()) { + LOG(ERROR) << "Input is not parsable as a MLIR StringAttr."; + return nullptr; + } + auto str_attr = attr.cast(); + + RegisterMlirInputDialects(context->getDialectRegistry()); + mlir::OwningModuleRef module_ref; + auto status = + DeserializeMlirModule(str_attr.getValue().str(), context, &module_ref); + if (!status.ok()) { + LOG(ERROR) << status.ToString(); + return nullptr; + } + + return module_ref; +} + +static mlir::LogicalResult MlirModuleToSerializedMlirStringAttrTranslate( + mlir::ModuleOp module_op, llvm::raw_ostream& output) { + output << "\""; + std::string serialized_module = SerializeMlirModule(module_op); + llvm::printEscapedString(serialized_module, output); + output << "\""; + return mlir::success(); +} + +} // namespace tensorflow + +static mlir::TranslateFromMLIRRegistration MlirTfToHloTextTranslate( + "mlir-tf-to-hlo-text", tensorflow::MlirTfToHloTextTranslateFunction, + tensorflow::RegisterMlirInputDialects); + +static mlir::TranslateFromMLIRRegistration MlirTfGraphToHloTextTranslate( + "mlir-tf-graph-to-hlo-text", + tensorflow::MlirTfGraphToHloTextTranslateFunction, + tensorflow::RegisterGraphInputDialects); + +static mlir::TranslateToMLIRRegistration SerializedMlirStringAttrToMlirModule( + "mlir-tf-str-attr-to-mlir", + tensorflow::SerializedMlirStringAttrToMlirModuleTranslate); + +static mlir::TranslateFromMLIRRegistration MlirModuleToSerializedMlirStringAttr( + "mlir-tf-mlir-to-str-attr", + tensorflow::MlirModuleToSerializedMlirStringAttrTranslate, + tensorflow::RegisterMlirInputDialects); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 843d491c330..3516e3a65d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -374,9 +374,8 @@ GetGeneralTPUExecutionDeviceAssignment( return (x + bound_x * (y + bound_y * z)) * bound_core + core; }; - std::vector used_device_ids( - location_to_id(bound_x - 1, bound_y - 1, bound_z - 1, bound_core - 1), - false); + std::vector used_device_ids(bound_x * bound_y * bound_z * bound_core, + false); TPUDevicesAndHosts devices_and_hosts( num_replicas, llvm::SmallVector( num_cores_per_replica, TPUDeviceAndHost())); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc 
b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 19eb5b2c476..8cf06259142 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -760,8 +760,8 @@ TEST(TPURewriteDeviceUtilTest, TestGetHostDeviceTPUReplicate) { devices; auto replicate = builder.create( mlir::UnknownLoc::get(&context), /*num_replicas=*/2, devices, - llvm::ArrayRef, mlir::Type>>{}, - llvm::ArrayRef{}, llvm::ArrayRef{}); + llvm::ArrayRef>{}, + mlir::ValueRange{}, mlir::TypeRange{}); builder.setInsertionPoint(&replicate.body().front(), replicate.body().front().begin()); diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 144e22750ca..e5408cef828 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -13,40 +13,33 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project -#include "mlir/IR/AsmState.h" // from @llvm-project #include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Support/FileUtilities.h" // from @llvm-project #include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h" #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/core/platform/init_main.h" -#include "tensorflow/core/platform/logging.h" int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); mlir::registerAllPasses(); + mlir::mhlo::registerAllMhloPasses(); + mlir::lmhlo::registerAllLmhloPasses(); mlir::DialectRegistry registry; mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); registry.insert(); - registry.insert(); registry.insert(); - registry.insert(); - registry.insert(); - registry.insert(); + registry.insert(); return failed( mlir::MlirOptMain(argc, argv, "TensorFlow pass driver\n", registry)); } diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 9b0b3aaa82b..3ea92a70ec7 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -111,8 +111,6 @@ int main(int argc, char** argv) { if (import_saved_model_object_graph) { mlir::MLIRContext context; -
context.loadAllGloballyRegisteredDialects(); - auto module_or = tensorflow::SavedModelObjectGraphToMlirImport( input_filename, tags, exported_names, &context); if (!module_or.status().ok()) return 1; @@ -120,8 +118,6 @@ int main(int argc, char** argv) { module_or.ConsumeValueOrDie()->print(output->os()); } else if (import_saved_model_signature_defs) { mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); - auto module_or = tensorflow::SavedModelSignatureDefsToMlirImport( input_filename, tags, exported_names, &context, upgrade_legacy); if (!module_or.status().ok()) return 1; @@ -141,7 +137,6 @@ int main(int argc, char** argv) { llvm::SourceMgr sourceMgr; sourceMgr.AddNewSourceBuffer(std::move(ownedBuffer), llvm::SMLoc()); mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); mlir::SourceMgrDiagnosticHandler diagnostic_handler(sourceMgr, &context); return (*requested_translation)(sourceMgr, os, &context); }; diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 4db960085ec..34686cc0f68 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -1,3 +1,9 @@ +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//third_party/mlir:tblgen.bzl", "gentbl") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") @@ -16,6 +22,7 @@ filegroup( gentbl( name = "tfjs_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-op-decls", @@ -70,6 +77,7 @@ cc_library( gentbl( name = "tfjs_optimize_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ( "-gen-rewriters", @@ -117,7 +125,6 @@ cc_library( ":tfjs_optimize", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_passes", @@ -141,7 +148,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:export_utils", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/xla:statusor", "//tensorflow/core:framework", "//tensorflow/core:graph", @@ -179,7 +185,7 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc index 331bed09dce..5ea3f51b475 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc @@ -15,12 +15,12 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h" -namespace mlir { -namespace tfjs { - #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.cc.inc" +namespace mlir { +namespace tfjs { + //===----------------------------------------------------------------------===// // TFJSDialect //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 9c98c9b0e19..bc52e3a0c7a 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -29,15 +29,9 @@ limitations under the License. #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -namespace mlir { -namespace tfjs { - #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_dialect.h.inc" #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h.inc" -} // namespace tfjs -} // namespace mlir - #endif // TENSORFLOW_COMPILER_MLIR_TFJS_IR_TFJS_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td index 134aa010d8c..e2539c2f6d8 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td @@ -39,7 +39,7 @@ def TFJSDialect : Dialect { TF graphs to be deployed on TFJS. }]; - let cppNamespace = "tfjs"; + let cppNamespace = "::mlir::tfjs"; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfjs/tests/BUILD b/tensorflow/compiler/mlir/tfjs/tests/BUILD index 5789480c3ba..979a9b773f2 100644 --- a/tensorflow/compiler/mlir/tfjs/tests/BUILD +++ b/tensorflow/compiler/mlir/tfjs/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD index 5c8d37da2f0..1fc3d51cb24 100644 --- a/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) diff --git a/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc b/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc index a3678f7d154..5d3ee121577 100644 --- a/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/tfjs/transforms/optimize.cc @@ -50,7 +50,7 @@ void Optimize::runOnFunction() { auto *ctx = &getContext(); auto func = getFunction(); - populateWithGenerated(ctx, &patterns); + populateWithGenerated(ctx, patterns); applyPatternsAndFoldGreedily(func, patterns); } } // namespace diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD new file mode 100644 index 00000000000..2861dd92d5d --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -0,0 +1,362 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") +load("//tensorflow/compiler/mlir/tfr:build_defs.bzl", "gen_op_libraries") +load( + "//third_party/mlir:tblgen.bzl", + "gentbl", +) +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package( + default_visibility = [ + ":friends", + ], + licenses = ["notice"], # Apache 
2.0 +) + +package_group( + name = "friends", + includes = ["//third_party/mlir:subpackages"], + packages = [ + "//learning/brain/experimental/mlir/tfr/...", + "//tensorflow/c/...", + "//tensorflow/compiler/...", + ], +) + +filegroup( + name = "tfr_ops_td_files", + srcs = [ + "ir/tfr_ops.td", + "//tensorflow/compiler/mlir/tensorflow:ir/tf_op_base.td", + "//tensorflow/compiler/mlir/tensorflow:ir/tf_op_interfaces.td", + "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:include/mlir/Dialect/Shape/IR/ShapeBase.td", + "@llvm-project//mlir:include/mlir/Dialect/Shape/IR/ShapeOps.td", + "@llvm-project//mlir:include/mlir/IR/SymbolInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/ControlFlowInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", + ], +) + +gentbl( + name = "tfr_ops_inc_gen", + tbl_outs = [ + ( + "-gen-op-decls", + "ir/tfr_ops.h.inc", + ), + ( + "-gen-op-defs", + "ir/tfr_ops.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "ir/tfr_ops.td", + td_srcs = [ + ":tfr_ops_td_files", + ], +) + +cc_library( + name = "tfr", + srcs = [ + "ir/tfr_ops.cc", + "ir/tfr_ops.cc.inc", + ], + hdrs = [ + "ir/tfr_ops.h", + "ir/tfr_ops.h.inc", + "ir/tfr_types.h", + ], + deps = [ + ":tfr_ops_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:ControlFlowInterfaces", + "@llvm-project//mlir:Dialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "utils", + srcs = [ + "utils/utils.cc", + ], + hdrs = [ + "utils/utils.h", + ], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "passes", + srcs = [ + "passes/canonicalize.cc", + "passes/decompose.cc", + "passes/raise_to_tf.cc", + ], + hdrs = [ + "passes/passes.h", + ], + deps = [ + ":tfr", + ":utils", + "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/memory", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToStandard", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], + alwayslink = 1, +) + +tf_cc_binary( + name = "tfr-opt", + srcs = ["passes/tfr_opt.cc"], + deps = [ + ":passes", + ":tfr", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir:passes", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", + ], +) + +glob_lit_tests( + data = [ + ":test_utilities", + "@llvm-project//mlir:run_lit.sh", + ], + driver = "//tensorflow/compiler/mlir:run_lit.sh", + test_file_exts = ["mlir"], +) + +# Bundle together all of the test utilities that are used by tests. 
+filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir/tfr:tfr-opt", + "@llvm-project//llvm:FileCheck", + "@llvm-project//llvm:not", + ], +) + +cc_library( + name = "tfr_decompose_ctx", + srcs = ["integration/tfr_decompose_ctx.cc"], + hdrs = ["integration/tfr_decompose_ctx.h"], + deps = [ + ":passes", + ":tfr", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_attr", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:TransformUtils", + ], +) + +tf_cc_test( + name = "tfr_decompose_ctx_test", + srcs = ["integration/tfr_decompose_ctx_test.cc"], + deps = [ + ":tfr_decompose_ctx", + "//tensorflow/compiler/xla:test", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/types:span", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "graph_decompose_pass", + srcs = ["integration/graph_decompose_pass.cc"], + hdrs = ["integration/graph_decompose_pass.h"], + deps = [ + ":tfr_decompose_ctx", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", + "//tensorflow/stream_executor/lib", + "@llvm-project//mlir:IR", + ], + alwayslink = 1, +) + +tf_py_test( + name = "graph_decompose_test", + size = "small", + srcs = ["integration/graph_decompose_test.py"], + data = ["//tensorflow/compiler/mlir/tfr/resources:decomposition_lib"], + python_version = "PY3", + tags = [ + "no_pip", + "no_windows", # TODO(b/170752141) + "nomac", # TODO(b/170752141) + ], + deps = [ + "//tensorflow/compiler/mlir/tfr/resources:composite_ops", + "//tensorflow/python/eager:def_function", + ], +) + +cc_library( + name = "node_expansion_pass", + srcs = ["integration/node_expansion_pass.cc"], + hdrs = ["integration/node_expansion_pass.h"], + deps = [ + ":tfr_decompose_ctx", + "//tensorflow/core/common_runtime/eager:core", + "//tensorflow/core/common_runtime/eager:eager_op_rewrite_registry", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + ], + alwayslink = 1, +) + +tf_py_test( + name = "node_expansion_test", + size = "small", + srcs = ["integration/node_expansion_test.py"], + data = ["//tensorflow/compiler/mlir/tfr/resources:decomposition_lib"], + python_version = "PY3", + tags = [ + "no_pip", + "no_windows", # TODO(b/170752141) + "nomac", # TODO(b/170752141) + ], + deps = [ + "//tensorflow/compiler/mlir/tfr/resources:composite_ops", + ], +) + +tf_python_pybind_extension( + name = "tfr_wrapper", + srcs = ["python/tfr_wrapper.cc"], + module_name = "tfr_wrapper", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tfr", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:Support", + 
"@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) + +py_library( + name = "composite", + srcs = ["python/composite.py"], + srcs_version = "PY2AND3", +) + +py_library( + name = "tfr_gen", + srcs = ["python/tfr_gen.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/compiler/mlir/tfr:tfr_wrapper", + ], +) + +tf_py_test( + name = "tfr_gen_test", + size = "small", + srcs = ["python/tfr_gen_test.py"], + python_version = "PY3", + tags = ["no_pip"], + deps = [ + ":composite", + ":tfr_gen", + "//tensorflow/compiler/mlir/python/mlir_wrapper:filecheck_wrapper", + "//tensorflow/compiler/mlir/tfr/resources:test_ops", + "//tensorflow/python:array_ops", + "//tensorflow/python:math_ops", + ], +) + +py_library( + name = "op_reg_gen", + srcs = ["python/op_reg_gen.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow:tensorflow_py", + ], +) + +py_test( + name = "op_reg_gen_test", + size = "small", + srcs = ["python/op_reg_gen_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":composite", + ":op_reg_gen", + "//tensorflow/compiler/mlir/python/mlir_wrapper:filecheck_wrapper", + ], +) + +py_library( + name = "test_utils", + srcs = ["python/test_utils.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow:tensorflow_py", + ], +) + +gen_op_libraries( + name = "one_op", + src = "define_op_template.py", +) diff --git a/tensorflow/compiler/mlir/tfr/build_defs.bzl b/tensorflow/compiler/mlir/tfr/build_defs.bzl new file mode 100644 index 00000000000..2b92d8a652a --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/build_defs.bzl @@ -0,0 +1,116 @@ +"""BUILD extension for TF composition project.""" + +load("//tensorflow:tensorflow.bzl", "py_binary", "tf_custom_op_library", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") + +def gen_op_libraries( + name, + src, + deps = [], + tags = [], + test = False): + """gen_op_libraries() generates all cc and py libraries for composite op source. + + Args: + name: used as the name component of all the generated libraries. + src: File contains the composite ops. + deps: Libraries the 'src' depends on. + tags: + test: + """ + if not src.endswith(".py") or name == src[:-3]: + fail("'src' %s conflicts with op Python wrapper. Rename it to be different from 'name'." 
% src) + + gen_op_lib_exec = src[:-3] # Strip off the .py + py_binary( + name = gen_op_lib_exec, + srcs = [src], + srcs_version = "PY2AND3", + python_version = "PY3", + deps = [ + "//tensorflow/compiler/mlir/tfr:op_reg_gen", + "//tensorflow/compiler/mlir/tfr:tfr_gen", + "//tensorflow/compiler/mlir/tfr:composite", + ] + deps, + ) + + registed_op = "registed_" + name + native.genrule( + name = registed_op, + srcs = [], + outs = [name + ".inc.cc"], + cmd = "$(location %s) --output=$@ --gen_register_op=true" % gen_op_lib_exec, + exec_tools = [":" + gen_op_lib_exec], + tags = tags, + ) + + native.cc_library( + name = name + "_cc", + testonly = test, + srcs = [":" + registed_op], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, + ) + + tf_custom_op_library( + name = name + ".so", + srcs = [":" + registed_op], + ) + + tf_gen_op_wrapper_py( + name = "gen_" + name, + out = "gen_" + name + ".py", + deps = [ + ":%s_cc" % name, + ], + ) + + tf_custom_op_py_library( + name = name, + dso = [":%s.so" % name], + kernels = [":%s_cc" % name], + srcs_version = "PY2AND3", + deps = [ + ":gen_%s" % name, + ], + ) + + # Link the register op and rebuild the binary + gen_tfr_lib_exec = gen_op_lib_exec + "_with_op_library" + py_binary( + name = gen_tfr_lib_exec, + main = src, + srcs = [src], + srcs_version = "PY2AND3", + python_version = "PY3", + deps = [ + "//tensorflow/compiler/mlir/tfr:op_reg_gen", + "//tensorflow/compiler/mlir/tfr:tfr_gen", + "//tensorflow/compiler/mlir/tfr:composite", + ":%s" % name, + ] + deps, + ) + + native.genrule( + name = name + "_mlir", + srcs = [], + outs = [name + ".mlir"], + cmd = "$(location %s) --output=$@ --gen_register_op=false" % gen_tfr_lib_exec, + exec_tools = [":" + gen_tfr_lib_exec], + tags = tags, + ) + + native.py_library( + name = name + "_py", + srcs = [src], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/compiler/mlir/tfr:op_reg_gen", + "//tensorflow/compiler/mlir/tfr:tfr_gen", + "//tensorflow/compiler/mlir/tfr:composite", + ] + deps, + ) diff --git a/tensorflow/compiler/mlir/tfr/define_op_template.py b/tensorflow/compiler/mlir/tfr/define_op_template.py new file mode 100644 index 00000000000..c0db2981d2d --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/define_op_template.py @@ -0,0 +1,64 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
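The gen_op_libraries macro above fans a single ops-definition file out into several targets: a genrule that emits the <name>.inc.cc op registration, a <name>_cc kernel library plus <name>.so custom-op library, a gen_<name> Python wrapper, and a <name>.mlir genrule that holds the TFR definitions. The sketch below shows how such a target is typically consumed from Python; it assumes a hypothetical gen_op_libraries(name = "my_ops", src = "my_ops_defs.py") target, and the module and path names are illustrative only (the same loading pattern appears in the tests later in this change).

    # Consuming a hypothetical gen_op_libraries(name = "my_ops", ...) target.
    import os

    from tensorflow.python.framework import load_library

    import gen_my_ops  # generated op wrappers (gen_<name>.py), illustrative name

    # The custom-op kernel library sits next to the generated wrapper module.
    _lib_dir = os.path.dirname(gen_my_ops.__file__)
    _lib_name = os.path.basename(gen_my_ops.__file__)[4:].replace('.py', '.so')
    load_library.load_op_library(os.path.join(_lib_dir, _lib_name))

    # Point the TFR passes at the directory containing my_ops.mlir so the
    # composite ops can be decomposed when no kernel handles them directly.
    os.environ['TF_MLIR_TFR_LIB_DIR'] = 'path/to/my_ops_mlir'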
+"""A template to define composite ops.""" + +# pylint: disable=g-direct-tensorflow-import + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +from tensorflow.compiler.mlir.tfr.python.composite import Composite +from tensorflow.compiler.mlir.tfr.python.op_reg_gen import gen_register_op +from tensorflow.compiler.mlir.tfr.python.tfr_gen import tfr_gen_from_module +from tensorflow.python.platform import app +from tensorflow.python.platform import flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + 'output', None, + 'Path to write the genereated register op file and MLIR file.') + +flags.DEFINE_bool('gen_register_op', True, + 'Generate register op cc file or tfr mlir file.') + +flags.mark_flag_as_required('output') + + +@Composite('TestRandom', derived_attrs=['T: numbertype'], outputs=['o: T']) +def _composite_random_op(): + pass + + +def main(_): + if FLAGS.gen_register_op: + assert FLAGS.output.endswith('.cc') + generated_code = gen_register_op(sys.modules[__name__], '_composite_') + else: + assert FLAGS.output.endswith('.mlir') + generated_code = tfr_gen_from_module(sys.modules[__name__], '_composite_') + + dirname = os.path.dirname(FLAGS.output) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(FLAGS.output, 'w') as f: + f.write(generated_code) + + +if __name__ == '__main__': + app.run(main=main) diff --git a/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD b/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD new file mode 100644 index 00000000000..eeaee926c87 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD @@ -0,0 +1,60 @@ +load("//tensorflow:tensorflow.bzl", "py_binary") +load("//tensorflow:tensorflow.bzl", "tf_py_test") +load("//tensorflow/compiler/mlir/tfr:build_defs.bzl", "gen_op_libraries") + +package( + default_visibility = [ + ":friends", + ], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = ["//third_party/mlir:subpackages"], + packages = [ + "//tensorflow/compiler/mlir/tfr/...", + ], +) + +gen_op_libraries( + name = "mnist_ops", + src = "ops_defs.py", + deps = [ + "//tensorflow:tensorflow_py", + ], +) + +tf_py_test( + name = "mnist_ops_test", + size = "small", + srcs = ["mnist_ops_test.py"], + data = [":mnist_ops_mlir"], + python_version = "PY3", + srcs_version = "PY2AND3", + tags = [ + "no_pip", + "no_windows", # TODO(b/170752141) + "nomac", # TODO(b/170752141) + ], + deps = [ + ":mnist_ops", + ":mnist_ops_py", + "//tensorflow:tensorflow_py", + "//tensorflow/compiler/mlir/tfr:test_utils", + ], +) + +py_binary( + name = "mnist_train", + srcs = ["mnist_train.py"], + data = [":mnist_ops_mlir"], + python_version = "PY3", + deps = [ + ":mnist_ops", + ":mnist_ops_py", + "//tensorflow:tensorflow_py", + "@absl_py//absl:app", + "@absl_py//absl/flags", + ], +) diff --git a/tensorflow/compiler/mlir/tfr/examples/mnist/mnist_ops_test.py b/tensorflow/compiler/mlir/tfr/examples/mnist/mnist_ops_test.py new file mode 100644 index 00000000000..d25b424279f --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/mnist/mnist_ops_test.py @@ -0,0 +1,126 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for tensorflow.compiler.mlir.tfr.examples.mnist.ops_defs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf + +from tensorflow.compiler.mlir.tfr.examples.mnist import gen_mnist_ops +from tensorflow.compiler.mlir.tfr.examples.mnist import ops_defs +from tensorflow.compiler.mlir.tfr.python import test_utils +from tensorflow.python.framework import load_library +from tensorflow.python.platform import test + +_lib_dir = os.path.dirname(gen_mnist_ops.__file__) +_lib_name = os.path.basename(gen_mnist_ops.__file__)[4:].replace('.py', '.so') +load_library.load_op_library(os.path.join(_lib_dir, _lib_name)) + + +class MnistOpsDefsTest(test_utils.OpsDefsTest): + + def test_new_conv2d_relu(self): + input_ = tf.random.uniform([1, 4, 4, 1]) + filter_ = tf.random.uniform([2, 2, 1, 8]) + bias = tf.zeros([8]) + kwargs = { + 'input_': input_, + 'filter_': filter_, + 'bias': bias, + 'stride_w': 2, + 'stride_h': 2, + 'dilation_w': 1, + 'dilation_h': 1, + 'padding': 'SAME', + 'act': 'RELU' + } + + self._assertOpAndComposite([input_, filter_, bias], + tf.function(gen_mnist_ops.new_conv2d), + ops_defs._composite_conv_add_relu, kwargs) + + def test_new_conv2d_relu6(self): + input_ = tf.random.uniform([1, 4, 4, 1]) + filter_ = tf.random.uniform([2, 2, 1, 8]) + bias = tf.zeros([8]) + kwargs = { + 'input_': input_, + 'filter_': filter_, + 'bias': bias, + 'stride_w': 2, + 'stride_h': 2, + 'dilation_w': 1, + 'dilation_h': 1, + 'padding': 'SAME', + 'act': 'RELU6' + } + + self._assertOpAndComposite([input_, filter_, bias], + tf.function(gen_mnist_ops.new_conv2d), + ops_defs._composite_conv_add_relu, kwargs) + + def test_new_conv2d_tanh(self): + self.skipTest('Fix tanh gradients') + input_ = tf.random.uniform([1, 4, 4, 1]) + filter_ = tf.random.uniform([2, 2, 1, 8]) + bias = tf.zeros([8]) + kwargs = { + 'input_': input_, + 'filter_': filter_, + 'bias': bias, + 'stride_w': 2, + 'stride_h': 2, + 'dilation_w': 1, + 'dilation_h': 1, + 'padding': 'SAME', + 'act': 'TANH' + } + + self._assertOpAndComposite([input_, filter_, bias], + tf.function(gen_mnist_ops.new_conv2d), + ops_defs._composite_conv_add_relu, kwargs) + + def test_new_fully_connected(self): + input_ = tf.random.uniform([2, 4]) + filter_ = tf.random.uniform([3, 4]) + bias = tf.zeros([3]) + kwargs = {'input_': input_, 'filter_': filter_, 'bias': bias, 'act': 'RELU'} + + self._assertOpAndComposite([input_, filter_, bias], + tf.function(gen_mnist_ops.new_fully_connected), + ops_defs._composite_fully_connected, kwargs) + + def test_new_max_pool(self): + input_ = tf.random.uniform([8, 4, 4, 1]) + kwargs = { + 'input_': input_, + 'stride_w': 2, + 'stride_h': 2, + 'filter_width': 1, + 'filter_height': 1, + 'padding': 'SAME', + } + + self._assertOpAndComposite([input_], + tf.function(gen_mnist_ops.new_max_pool), + ops_defs._composite_max_pool, kwargs) + + +if __name__ == '__main__': + os.environ[ + 'TF_MLIR_TFR_LIB_DIR'] = 'tensorflow/compiler/mlir/tfr/examples/mnist' + test.main() diff --git a/tensorflow/compiler/mlir/tfr/examples/mnist/mnist_train.py 
b/tensorflow/compiler/mlir/tfr/examples/mnist/mnist_train.py new file mode 100644 index 00000000000..a4adcf86d5b --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/mnist/mnist_train.py @@ -0,0 +1,179 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MNIST model float training script with TensorFlow graph execution.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from absl import app +from absl import flags + +import tensorflow as tf +import tensorflow_datasets as tfds +from tensorflow.compiler.mlir.tfr.examples.mnist import gen_mnist_ops +from tensorflow.compiler.mlir.tfr.examples.mnist import ops_defs # pylint: disable=unused-import +from tensorflow.python.framework import load_library + +flags.DEFINE_integer('train_steps', 200, 'Number of steps in training.') + +_lib_dir = os.path.dirname(gen_mnist_ops.__file__) +_lib_name = os.path.basename(gen_mnist_ops.__file__)[4:].replace('.py', '.so') +load_library.load_op_library(os.path.join(_lib_dir, _lib_name)) + +# MNIST dataset parameters. +num_classes = 10 # total classes (0-9 digits). +num_features = 784 # data features (img shape: 28*28). +num_channels = 1 + +# Training parameters. +learning_rate = 0.01 +display_step = 10 +batch_size = 128 + +# Network parameters. +n_hidden_1 = 32 # 1st conv layer number of neurons. +n_hidden_2 = 64 # 2nd conv layer number of neurons. +n_hidden_3 = 1024 # 1st fully connected layer of neurons. +flatten_size = num_features // 16 * n_hidden_2 + +seed = 66478 + +weights = { + 'f1': + tf.Variable( + tf.random.truncated_normal([5, 5, num_channels, n_hidden_1], + stddev=0.1, + seed=seed)), + 'f2': + tf.Variable( + tf.random.truncated_normal([5, 5, n_hidden_1, n_hidden_2], + stddev=0.1, + seed=seed)), + 'f3': + tf.Variable( + tf.random.truncated_normal([n_hidden_3, flatten_size], + stddev=0.1, + seed=seed)), + 'f4': + tf.Variable( + tf.random.truncated_normal([num_classes, n_hidden_3], + stddev=0.1, + seed=seed)), +} + +biases = { + 'b1': tf.Variable(tf.zeros([n_hidden_1])), + 'b2': tf.Variable(tf.zeros([n_hidden_2])), + 'b3': tf.Variable(tf.zeros([n_hidden_3])), + 'b4': tf.Variable(tf.zeros([num_classes])), +} + + +class FloatModel(tf.Module): + """Float inference for mnist model.""" + + @tf.function + def __call__(self, data): + """The Model definition.""" + x = tf.reshape(data, [-1, 28, 28, 1]) + + # 2D convolution, with 'SAME' padding (i.e. the output feature map has + # the same size as the input). + + # NOTE: The data/x/input is always specified in floating point precision. + # output shape: [-1, 28, 28, 32] + conv1 = gen_mnist_ops.new_conv2d(x, weights['f1'], biases['b1'], 1, 1, 1, 1, + 'SAME', 'RELU') + + # Max pooling. The kernel size spec {ksize} also follows the layout of + # the data. Here we have a pooling window of 2, and a stride of 2. 
+ # output shape: [-1, 14, 14, 32] + max_pool1 = gen_mnist_ops.new_max_pool(conv1, 2, 2, 2, 2, 'SAME') + + # output shape: [-1, 14, 14, 64] + conv2 = gen_mnist_ops.new_conv2d(max_pool1, weights['f2'], biases['b2'], 1, + 1, 1, 1, 'SAME', 'RELU') + + # output shape: [-1, 7, 7, 64] + max_pool2 = gen_mnist_ops.new_max_pool(conv2, 2, 2, 2, 2, 'SAME') + + # Reshape the feature map cuboid into a 2D matrix to feed it to the + # fully connected layers. + # output shape: [-1, 7*7*64] + reshape = tf.reshape(max_pool2, [-1, flatten_size]) + + # output shape: [-1, 1024] + fc1 = gen_mnist_ops.new_fully_connected(reshape, weights['f3'], + biases['b3'], 'RELU') + # output shape: [-1, 10] + return gen_mnist_ops.new_fully_connected(fc1, weights['f4'], biases['b4']) + + +def grad(model, inputs, labels, trainable_variables): + with tf.GradientTape() as tape: + logits = model(inputs) + loss_value = tf.reduce_mean( + tf.nn.softmax_cross_entropy_with_logits(labels, logits)) + grads = tape.gradient(loss_value, trainable_variables) + correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1)) + accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) + return accuracy, loss_value, grads + + +def training_step(model, inputs, labels, optimizer, step): + trainable_variables = list(weights.values()) + list(biases.values()) + accuracy, loss_value, grads = grad(model, inputs, labels, trainable_variables) + if step % display_step == 0: + print('Step %d:' % step) + print(' Loss = %f' % loss_value) + print(' Batch accuracy: %f' % accuracy) + optimizer.apply_gradients(zip(grads, trainable_variables)) + + +def get_next_batch(iter_): + features = next(iter_) + images, labels = features['image'], features['label'] + return (mnist_preprocess(images), tf.one_hot(labels, num_classes)) + + +def mnist_preprocess(x): + x_float = tf.cast(x, tf.float32) + return x_float / 255.0 + + +def train(model, dataset, optimizer): + iter_ = iter(dataset) + for step in range(flags.FLAGS.train_steps): + inputs, labels = get_next_batch(iter_) + training_step(model, inputs, labels, optimizer, step) + + +def main(_): + # TODO(fengliuai): put this in some automatically generated code. + os.environ[ + 'TF_MLIR_TFR_LIB_DIR'] = 'tensorflow/compiler/mlir/tfr/examples/mnist' + # Create an mnist float model with the specified float state. + model = FloatModel() + optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate) + + ds_train = tfds.load('mnist', split='train', shuffle_files=True) + ds_train = ds_train.shuffle(1024).batch(batch_size).prefetch(64) + + train(model, ds_train, optimizer) + + +if __name__ == '__main__': + app.run(main) diff --git a/tensorflow/compiler/mlir/tfr/examples/mnist/ops_defs.py b/tensorflow/compiler/mlir/tfr/examples/mnist/ops_defs.py new file mode 100644 index 00000000000..0cf4678892e --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/mnist/ops_defs.py @@ -0,0 +1,217 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
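The layer-size constants in mnist_train.py are worth a quick check: two stride-2 max pools shrink the 28x28 input to 7x7, so the flattened feature map has 7 * 7 * 64 = 3136 elements, which is exactly what num_features // 16 * n_hidden_2 computes. A small sanity check of that arithmetic, using the constants defined above:

    # Sanity check of the shape arithmetic used by FloatModel in mnist_train.py.
    num_features = 784  # 28 * 28 input pixels
    n_hidden_2 = 64     # channels after the second conv layer

    pooled_side = 28 // 2 // 2                      # two stride-2 max pools: 28 -> 14 -> 7
    flatten_size = num_features // 16 * n_hidden_2  # as defined in mnist_train.py

    assert pooled_side == 7
    assert flatten_size == pooled_side * pooled_side * n_hidden_2 == 3136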
+"""Defines all the new composite ops used in the mnist example.""" + +# pylint: disable=g-direct-tensorflow-import +# pylint: disable=missing-function-docstring + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow as tf + +from tensorflow.compiler.mlir.tfr.python import composite +from tensorflow.compiler.mlir.tfr.python.op_reg_gen import gen_register_op +from tensorflow.compiler.mlir.tfr.python.tfr_gen import tfr_gen_from_module +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import gen_nn_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import app +from tensorflow.python.platform import flags + +Composite = composite.Composite +FLAGS = flags.FLAGS + +flags.DEFINE_string( + 'output', None, + 'Path to write the genereated register op file and MLIR file.') + +flags.DEFINE_bool('gen_register_op', True, + 'Generate register op cc file or tfr mlir file.') + + +@Composite( + 'NewConv2D', + inputs=['input_: T', 'filter_: T', 'bias: T'], + attrs=[ + 'stride_w: int', 'stride_h: int', 'dilation_w: int', 'dilation_h: int', + 'padding: {"SAME", "VALID"}', 'act: {"", "RELU", "RELU6", "TANH"} = ""' + ], + derived_attrs=['T: {float, int8}'], + outputs=['o: T']) +def _composite_conv_add_relu(input_, filter_, bias, stride_w, stride_h, + dilation_w, dilation_h, padding, act): + res = tf.raw_ops.Conv2D( + input=input_, + filter=filter_, + strides=[1, stride_w, stride_h, 1], + dilations=[1, dilation_w, dilation_h, 1], + padding=padding) + res = tf.raw_ops.Add(x=res, y=bias) + if act == 'RELU': + return tf.raw_ops.Relu(features=res) + elif act == 'RELU6': + return tf.raw_ops.Relu6(features=res) + elif act == 'TANH': + return tf.raw_ops.Tanh(x=res) + else: + return res + + +@tf.RegisterGradient('NewConv2D') +def _conv_add_relu_grad(op, grad): + act = op.get_attr('act') + y = op.outputs[0] + if act == 'RELU': + grad = gen_nn_ops.relu_grad(grad, y) + elif act == 'RELU6': + grad = gen_nn_ops.relu6_grad(grad, y) + elif act == 'TANH': + y = math_ops.conj(y) + grad = gen_math_ops.tanh_grad(y, grad) + + broadcast_shape = tf.shape(y) + input_value_shape = tf.shape(op.inputs[2]) + _, reduction_axes = tf.raw_ops.BroadcastGradientArgs( + s0=broadcast_shape, s1=input_value_shape) + updates_grad_reshaped = tf.reduce_sum( + grad, axis=reduction_axes, keepdims=True) + bias_grad = tf.reshape(updates_grad_reshaped, input_value_shape) + + dilations = [1, op.get_attr('dilation_w'), op.get_attr('dilation_h'), 1] + strides = [1, op.get_attr('stride_w'), op.get_attr('stride_h'), 1] + padding = op.get_attr('padding') + shape_0, shape_1 = tf.shape_n([op.inputs[0], op.inputs[1]]) + return [ + tf.compat.v1.nn.conv2d_backprop_input( + shape_0, + op.inputs[1], + grad, + strides=strides, + padding=padding, + dilations=dilations, + data_format='NHWC'), + tf.compat.v1.nn.conv2d_backprop_filter( + op.inputs[0], + shape_1, + grad, + strides=strides, + padding=padding, + dilations=dilations, + data_format='NHWC'), bias_grad + ] + + +@Composite( + 'NewFullyConnected', + inputs=['input_: T', 'filter_: T', 'bias: T'], + attrs=['act: {"", "RELU", "RELU6", "TANH"} = ""'], + derived_attrs=['T: {float, int8}'], + outputs=['o: T']) +def _composite_fully_connected(input_, filter_, bias, act): + res = tf.raw_ops.MatMul( + a=input_, b=filter_, transpose_a=False, transpose_b=True) + res = tf.raw_ops.Add(x=res, y=bias) + if act == 'RELU': + return tf.raw_ops.Relu(features=res) + elif act == 
'RELU6': + return tf.raw_ops.Relu6(features=res) + elif act == 'TANH': + return tf.raw_ops.Tanh(x=res) + else: + return res + + +@tf.RegisterGradient('NewFullyConnected') +def _fully_connected_grad(op, grad): + act = op.get_attr('act') + y = op.outputs[0] + if act == 'RELU': + grad = gen_nn_ops.relu_grad(grad, y) + elif act == 'RELU6': + grad = gen_nn_ops.relu6_grad(grad, y) + elif act == 'TANH': + y = math_ops.conj(y) + grad = gen_math_ops.tanh_grad(y, grad) + + broadcast_shape = tf.shape(y) + input_value_shape = tf.shape(op.inputs[2]) + _, reduction_axes = tf.raw_ops.BroadcastGradientArgs( + s0=broadcast_shape, s1=input_value_shape) + updates_grad_reshaped = tf.reduce_sum( + grad, axis=reduction_axes, keepdims=True) + bias_grad = tf.reshape(updates_grad_reshaped, input_value_shape) + + a = math_ops.conj(op.inputs[0]) + b = math_ops.conj(op.inputs[1]) + grad_a = gen_math_ops.mat_mul(grad, b) + grad_b = gen_math_ops.mat_mul(grad, a, transpose_a=True) + return [grad_a, grad_b, bias_grad] + + +@Composite( + 'NewMaxPool', + inputs=['input_: T'], + attrs=[ + 'stride_w: int', 'stride_h: int', 'filter_width: int', + 'filter_height: int', 'padding: {"SAME", "VALID"}' + ], + derived_attrs=['T: {float, int8}'], + outputs=['o: T']) +def _composite_max_pool(input_, stride_w, stride_h, filter_width, filter_height, + padding): + ksize = [1, filter_width, filter_height, 1] + strides = [1, stride_w, stride_h, 1] + return tf.raw_ops.MaxPool( + input=input_, ksize=ksize, strides=strides, padding=padding) + + +@tf.RegisterGradient('NewMaxPool') +def _max_pool_grad(op, grad): + filter_width = op.get_attr('filter_width') + filter_height = op.get_attr('filter_height') + stride_w = op.get_attr('stride_w') + stride_h = op.get_attr('stride_h') + padding = op.get_attr('padding') + return tf.raw_ops.MaxPoolGrad( + orig_input=op.inputs[0], + orig_output=op.outputs[0], + grad=grad, + ksize=[1, filter_width, filter_height, 1], + strides=[1, stride_w, stride_h, 1], + padding=padding, + data_format='NHWC') + + +def main(_): + if FLAGS.gen_register_op: + assert FLAGS.output.endswith('.cc') + generated_code = gen_register_op(sys.modules[__name__], '_composite_') + else: + assert FLAGS.output.endswith('.mlir') + generated_code = tfr_gen_from_module(sys.modules[__name__], '_composite_',) + + dirname = os.path.dirname(FLAGS.output) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(FLAGS.output, 'w') as f: + f.write(generated_code) + + +if __name__ == '__main__': + app.run(main=main) diff --git a/tensorflow/compiler/mlir/tfr/examples/pad/BUILD b/tensorflow/compiler/mlir/tfr/examples/pad/BUILD new file mode 100644 index 00000000000..ef08caff939 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/pad/BUILD @@ -0,0 +1,45 @@ +load("//tensorflow:tensorflow.bzl", "tf_py_test") +load("//tensorflow/compiler/mlir/tfr:build_defs.bzl", "gen_op_libraries") + +package( + default_visibility = [ + ":friends", + ], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = ["//third_party/mlir:subpackages"], + packages = [ + "//tensorflow/compiler/mlir/tfr/...", + ], +) + +gen_op_libraries( + name = "pad_ops", + src = "ops_defs.py", + deps = [ + "//tensorflow:tensorflow_py", + ], +) + +tf_py_test( + name = "pad_ops_test", + size = "small", + srcs = ["pad_ops_test.py"], + data = [":pad_ops_mlir"], + python_version = "PY3", + srcs_version = "PY2AND3", + tags = [ + "no_pip", + "no_windows", # TODO(b/170752141) + "nomac", # TODO(b/170752141) + ], + deps = [ + ":pad_ops", + ":pad_ops_py", 
+ "//tensorflow:tensorflow_py", + "//tensorflow/compiler/mlir/tfr:test_utils", + ], +) diff --git a/tensorflow/compiler/mlir/tfr/examples/pad/ops_defs.py b/tensorflow/compiler/mlir/tfr/examples/pad/ops_defs.py new file mode 100644 index 00000000000..4b072a58f08 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/pad/ops_defs.py @@ -0,0 +1,168 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Defines the mirror pad and mirror pad grad.""" + +# pylint: disable=g-direct-tensorflow-import +# pylint: disable=missing-function-docstring + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys + +import tensorflow as tf + +from tensorflow.compiler.mlir.tfr.python import composite +from tensorflow.compiler.mlir.tfr.python.op_reg_gen import gen_register_op +from tensorflow.compiler.mlir.tfr.python.tfr_gen import tfr_gen_from_module +from tensorflow.python.ops import gen_array_ops +from tensorflow.python.platform import app +from tensorflow.python.platform import flags + +Composite = composite.Composite +FLAGS = flags.FLAGS + +flags.DEFINE_string( + 'output', None, + 'Path to write the genereated register op file and MLIR file.') + +flags.DEFINE_bool('gen_register_op', True, + 'Generate register op cc file or tfr mlir file.') + + +@Composite( + 'NewMirrorPad', + inputs=['input_: T', 'paddings: Tpaddings'], + attrs=['mode: {"REFLECT", "SYMMETRIC"}'], + derived_attrs=['T: type', 'Tpaddings: {int32, int64} = DT_INT32'], + outputs=['output: T']) +def _composite_mirror_pad(input_, paddings, mode): + shape = input_.shape.as_list() + for i in range(len(shape)): + rdims = tf.raw_ops.OneHot( + indices=i, depth=len(shape), on_value=True, off_value=False, axis=-1) + rarray = tf.raw_ops.Reverse(tensor=input_, dims=rdims) + + left_padding_size = tf.raw_ops.GatherNd(params=paddings, indices=[i, 0]) + right_padding_size = tf.raw_ops.GatherNd(params=paddings, indices=[i, 1]) + + if mode == 'REFLECT': + left_padding, _ = tf.raw_ops.SplitV( + value=rarray, + size_splits=[left_padding_size, -1], + axis=i, + num_split=2) + _, right_padding = tf.raw_ops.SplitV( + value=rarray, + size_splits=[-1, right_padding_size], + axis=i, + num_split=2) + else: + _, left_padding = tf.raw_ops.SplitV( + value=rarray, + size_splits=[-1, left_padding_size], + axis=i, + num_split=2) + right_padding, _ = tf.raw_ops.SplitV( + value=rarray, + size_splits=[right_padding_size, -1], + axis=i, + num_split=2) + + input_ = tf.raw_ops.Concat( + concat_dim=i, values=[left_padding, input_, right_padding]) + return input_ + + +@tf.RegisterGradient('NewMirrorPad') +def _mirror_pad_grad(op, grad): + mode = op.get_attr('mode') + return [gen_array_ops.mirror_pad_grad(grad, op.inputs[1], mode=mode), None] + + +@Composite( + 'NewMirrorPadGrad', + inputs=['input_: T', 'paddings: Tpaddings'], + attrs=['mode: {"REFLECT", "SYMMETRIC"}'], + derived_attrs=['T: type', 'Tpaddings: {int32, int64} = DT_INT32'], + 
outputs=['output: T']) +def _composite_mirror_pad_grad(input_, paddings, mode): + shape = input_.shape.as_list() + for i in range(len(shape)): + rdims = tf.raw_ops.OneHot( + indices=i, depth=len(shape), on_value=True, off_value=False, axis=-1) + left_padding_size = tf.raw_ops.GatherNd(params=paddings, indices=[i, 0]) + right_padding_size = tf.raw_ops.GatherNd(params=paddings, indices=[i, 1]) + + left_padding, core, right_padding = tf.raw_ops.SplitV( + value=input_, + size_splits=[left_padding_size, -1, right_padding_size], + axis=i, + num_split=3) + reversed_left_padding = tf.raw_ops.Reverse(tensor=left_padding, dims=rdims) + reversed_right_padding = tf.raw_ops.Reverse( + tensor=right_padding, dims=rdims) + zero_like = tf.raw_ops.ZerosLike(x=core) + left_offset, _ = tf.raw_ops.SplitV( + value=zero_like, + size_splits=[-1, left_padding_size], + axis=i, + num_split=2) + right_offset, _ = tf.raw_ops.SplitV( + value=zero_like, + size_splits=[-1, right_padding_size], + axis=i, + num_split=2) + + if mode == 'REFLECT': + from_left_padding = tf.raw_ops.Concat( + concat_dim=i, values=[left_offset, reversed_left_padding]) + from_right_padding = tf.raw_ops.Concat( + concat_dim=i, values=[reversed_right_padding, right_offset]) + else: + from_left_padding = tf.raw_ops.Concat( + concat_dim=i, values=[reversed_left_padding, left_offset]) + from_right_padding = tf.raw_ops.Concat( + concat_dim=i, values=[right_offset, reversed_right_padding]) + input_ = tf.raw_ops.AddN( + inputs=[from_left_padding, core, from_right_padding]) + + return input_ + + +@tf.RegisterGradient('NewMirrorPadGrad') +def _mirror_pad_grad_grad(op, grad): + mode = op.get_attr('mode') + return [gen_array_ops.mirror_pad(grad, op.inputs[1], mode=mode), None] + + +def main(_): + if FLAGS.gen_register_op: + assert FLAGS.output.endswith('.cc') + generated_code = gen_register_op(sys.modules[__name__], '_composite_') + else: + assert FLAGS.output.endswith('.mlir') + generated_code = tfr_gen_from_module(sys.modules[__name__], '_composite_') + + dirname = os.path.dirname(FLAGS.output) + if not os.path.exists(dirname): + os.makedirs(dirname) + with open(FLAGS.output, 'w') as f: + f.write(generated_code) + + +if __name__ == '__main__': + app.run(main=main) diff --git a/tensorflow/compiler/mlir/tfr/examples/pad/pad_ops_test.py b/tensorflow/compiler/mlir/tfr/examples/pad/pad_ops_test.py new file mode 100644 index 00000000000..11f6e0acbf2 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/examples/pad/pad_ops_test.py @@ -0,0 +1,96 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
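The NewMirrorPad composition above rebuilds, axis by axis, the behaviour of the builtin MirrorPad kernel: in REFLECT mode the edge value is excluded from the padding, in SYMMETRIC mode it is repeated. A small worked example against the builtin op (expected values written out by hand) shows the two modes the composition has to reproduce:

    import tensorflow as tf

    # One unit of padding on each side of a 1-D tensor.
    x = tf.constant([1., 2., 3.])
    paddings = tf.constant([[1, 1]])

    reflect = tf.raw_ops.MirrorPad(input=x, paddings=paddings, mode='REFLECT')
    symmetric = tf.raw_ops.MirrorPad(input=x, paddings=paddings, mode='SYMMETRIC')

    print(reflect.numpy())    # [2. 1. 2. 3. 2.] -- edge values are not repeated
    print(symmetric.numpy())  # [1. 1. 2. 3. 3.] -- edge values are repeated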
+"""Tests for tensorflow.compiler.mlir.tfr.examples.pad.ops_defs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +from absl.testing import parameterized +import tensorflow as tf + +from tensorflow.compiler.mlir.tfr.examples.pad import gen_pad_ops +from tensorflow.compiler.mlir.tfr.examples.pad import ops_defs +from tensorflow.compiler.mlir.tfr.python import test_utils +from tensorflow.python.framework import load_library +from tensorflow.python.platform import test + +_lib_dir = os.path.dirname(gen_pad_ops.__file__) +_lib_name = os.path.basename(gen_pad_ops.__file__)[4:].replace('.py', '.so') +load_library.load_op_library(os.path.join(_lib_dir, _lib_name)) + + +class PadOpsDefsTest(test_utils.OpsDefsTest, parameterized.TestCase): + + @parameterized.named_parameters(('ReflectMode', 'REFLECT'), + ('SymmetricMode', 'SYMMETRIC')) + def test_mirror_pad(self, mode): + input_ = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.float32) + paddings = tf.constant([[ + 1, + 1, + ], [2, 2]]) + kwargs = { + 'input': input_, + 'paddings': paddings, + 'mode': mode, + } + kwargs_ = { + 'input_': input_, + 'paddings': paddings, + 'mode': mode, + } + # Make sure the composition python function is correct + self._assertOpAndComposite([input_], tf.raw_ops.MirrorPad, + ops_defs._composite_mirror_pad, kwargs_, kwargs) + # Make sure the translation and decomposition is correct + self._assertOpAndComposite([input_], + tf.function(gen_pad_ops.new_mirror_pad), + ops_defs._composite_mirror_pad, kwargs_) + + @parameterized.named_parameters(('ReflectMode', 'REFLECT'), + ('SymmetricMode', 'SYMMETRIC')) + def test_mirror_pad_grad(self, mode): + input_ = tf.constant([[2, 1, 1, 2, 3, 3, 2], [2, 1, 1, 2, 3, 3, 2], + [5, 4, 4, 5, 6, 6, 5], [5, 4, 4, 5, 6, 6, 5]], + dtype=tf.float32) + paddings = tf.constant([[ + 1, + 1, + ], [2, 2]]) + kwargs = { + 'input': input_, + 'paddings': paddings, + 'mode': mode, + } + kwargs_ = { + 'input_': input_, + 'paddings': paddings, + 'mode': mode, + } + # Make sure the composition python function is correct + self._assertOpAndComposite([input_], tf.raw_ops.MirrorPadGrad, + ops_defs._composite_mirror_pad_grad, kwargs_, + kwargs) + # Make sure the translation and decomposition is correct + self._assertOpAndComposite([input_], + tf.function(gen_pad_ops.new_mirror_pad_grad), + ops_defs._composite_mirror_pad_grad, kwargs_) + + +if __name__ == '__main__': + os.environ[ + 'TF_MLIR_TFR_LIB_DIR'] = 'tensorflow/compiler/mlir/tfr/examples/pad' + test.main() diff --git a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc new file mode 100644 index 00000000000..99890e9f621 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.cc @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h" + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace tfr { + +bool GraphDecomposePass::IsEnabled(const ConfigProto& config_proto) const { + const char* tfr_lib_env_val = getenv(std::string(kTFRLibEnv).c_str()); + return tfr_lib_env_val != nullptr; +} + +Status GraphDecomposePass::Run(const ConfigProto& config_proto, + mlir::ModuleOp module) { + if (!IsEnabled(config_proto)) { + LOG_FIRST_N(INFO, 1) << "Skipping Graph Decomposition Pass, decomposition " + "library was not found"; + return Status::OK(); + } + + LOG_FIRST_N(INFO, 1) << "Run Graph Decomposition Passes"; + + TF_RETURN_IF_ERROR(DecomposeGraph(module)); + + LOG_FIRST_N(INFO, 1) << "Finish Graph Decomposition Passes"; + + return Status::OK(); +} + +namespace { +constexpr int kMlirGraphDecomposePassPriority = -1; + +static mlir_pass_registration::MlirOptimizationPassRegistration + register_mlir_graph_decompose_pass(kMlirGraphDecomposePassPriority, + std::make_unique<GraphDecomposePass>()); +} // namespace + +} // namespace tfr +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h new file mode 100644 index 00000000000..dd93e99f04b --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_pass.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_GRAPH_DECOMPOSE_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_GRAPH_DECOMPOSE_PASS_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h" +#include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace tfr { + +// An optimization pass that decomposes the composite ops in a module according +// to the decomposition library. Currently the decomposition library is loaded +// each time the pass runs. A special environment variable is set to locate the +// decomposition library. +class GraphDecomposePass : public MlirOptimizationPass { + public: + llvm::StringRef name() const override { return "tfr"; } + + // Whether to run this pass. If this is enabled, the GraphDef will be imported + // to MLIR even if no TF composition file is found. + bool IsEnabled(const ConfigProto& config_proto) const override; + + // This should be used as a thin mapper around mlir::ModulePass::runOnModule + // API integrated with the Tensorflow runtime.
+ Status Run(const ConfigProto& config_proto, mlir::ModuleOp module) override; +}; + +} // namespace tfr +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_GRAPH_DECOMPOSE_PASS_H_ diff --git a/tensorflow/compiler/mlir/tfr/integration/graph_decompose_test.py b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_test.py new file mode 100644 index 00000000000..d573b8e7195 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/graph_decompose_test.py @@ -0,0 +1,83 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for tensorflow.compiler.mlir.tfr.integrattion.graph_decompose.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.compiler.mlir.tfr.resources import gen_composite_ops +from tensorflow.python.eager import def_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import load_library +from tensorflow.python.framework import ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import test + +_lib_dir = os.path.dirname(gen_composite_ops.__file__) +_lib_name = os.path.basename(gen_composite_ops.__file__)[4:].replace( + '.py', '.so') +load_library.load_op_library(os.path.join(_lib_dir, _lib_name)) + + +class GraphDecomposeTest(test.TestCase): + + def testAddN(self): + add = def_function.function(gen_composite_ops.my_add_n) + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + sq1 = add([t1]) + sq2 = add([t1, t2]) + sq3 = add([t1, t2, t3]) + self.assertAllEqual(sq1.numpy().reshape(-1), [1, 2, 3, 4]) + self.assertAllEqual(sq2.numpy().reshape(-1), [2, 4, 6, 8]) + self.assertAllEqual(sq3.numpy().reshape(-1), [3, 6, 9, 12]) + + def testBiasedDense(self): + biased_dense = def_function.function(gen_composite_ops.my_biased_dense) + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[-10.0, -10.0], [-10.0, -10.0]]) + sq = biased_dense(t1, t2, t3) + self.assertAllEqual(sq.numpy().reshape(-1), [-3, 0, 5, 12]) + + def testBiasedDenseRelu(self): + biased_dense = def_function.function(gen_composite_ops.my_biased_dense) + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[-10.0, -10.0], [-10.0, -10.0]]) + sq = biased_dense(t1, t2, t3, act='relu') + self.assertAllEqual(sq.numpy().reshape(-1), [0, 0, 5, 12]) + + def testWithKnownKernel(self): + + @def_function.function + def biasd_dense_elu(x, y, z): + dot = gen_composite_ops.my_biased_dense(x, y, z) + return nn_ops.elu(dot) # with known kernel, should not expand. 
+ + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[-10.0, -10.0], [-10.0, -10.0]]) + sq = biasd_dense_elu(t1, t2, t3) + self.assertAllClose(sq.numpy().reshape(-1), [-0.950213, 0, 5, 12]) + + +if __name__ == '__main__': + os.environ['TF_MLIR_TFR_LIB_DIR'] = 'tensorflow/compiler/mlir/tfr/resources' + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.cc b/tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.cc new file mode 100644 index 00000000000..61c4d1c8953 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.cc @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.h" + +#include <string> + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace tfr { + +Status CompositeOpExpansion::Run(EagerOperation* orig_op, + std::unique_ptr<EagerOperation>* out_op) { + if (!IsEnabled()) return Status::OK(); + if (orig_op->Device() != kVariantDeviceNull) return Status::OK(); + + LOG_FIRST_N(INFO, 1) << "Run Node Expansion Passes"; + + // Get the FunctionDef and insert that into the context + const NodeDef& ndef = orig_op->MutableAttrs()->BuildNodeDef(); + auto& ctx = orig_op->EagerContext(); + Fprint128 cache_key = + orig_op->MutableAttrs()->CacheKey(orig_op->DeviceName()); + // Include soft placement policy in cache key since the placement strategy + // can change and thus affect which kernel is picked. + auto x = FingerprintCat64(cache_key.high64, cache_key.low64); + std::string fname = + absl::StrCat("_expanded_", ndef.name(), "_", std::to_string(x)); + if (!ctx.FindFunctionByName(fname)) { + TF_ASSIGN_OR_RETURN(auto func, ExpandNode(ndef, fname)); + TF_RETURN_IF_ERROR(ctx.AddFunctionDef(func)); + } + + // Rewrite the out_op to be the call op. This is essentially a deep copy of + // the orig_op, except for the op name. + auto* new_op = new EagerOperation(&ctx); + TF_RETURN_IF_ERROR( + new_op->Reset(fname.c_str(), orig_op->DeviceName().c_str())); + for (auto input : orig_op->GetInputs()) { + TF_RETURN_IF_ERROR(new_op->AddInput(input)); + } + new_op->MutableAttrs()->CopyAttributes(orig_op->Attrs()); + out_op->reset(new_op); + + LOG_FIRST_N(INFO, 1) + << "Finish Node Expansion Passes. 
Rewrite the op to call function: " + << fname; + + return Status::OK(); +} + +REGISTER_REWRITE(EagerOpRewriteRegistry::POST_PLACEMENT, CompositeOpExpansion); + +} // namespace tfr +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.h b/tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.h new file mode 100644 index 00000000000..b1e4911b541 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/node_expansion_pass.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_NODE_EXPANSION_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_NODE_EXPANSION_PASS_H_ + +#include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" +#include "tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace tfr { + +// An eager op rewrite that decomposes composite ops according to the +// decomposition library. Currently the decomposition library is loaded +// each time the pass runs. A special environment variable is set to locate the +// decomposition library. +class CompositeOpExpansion : public EagerOpRewrite { + public: + CompositeOpExpansion(string name, string file, string line) + : EagerOpRewrite(name, file, line) {} + + Status Run(EagerOperation* orig_op, + std::unique_ptr<EagerOperation>* out_op) override; + + private: + // Whether to run this pass. If this is enabled, the NodeDef will be imported + // to MLIR even if no TF composition file is found. + bool IsEnabled() { + const char* tfr_lib_env_val = getenv(string(kTFRLibEnv).c_str()); + return tfr_lib_env_val != nullptr; + } +}; + +} // namespace tfr +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_NODE_EXPANSION_PASS_H_ diff --git a/tensorflow/compiler/mlir/tfr/integration/node_expansion_test.py b/tensorflow/compiler/mlir/tfr/integration/node_expansion_test.py new file mode 100644 index 00000000000..f99b52fe65a --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/node_expansion_test.py @@ -0,0 +1,78 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
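CompositeOpExpansion memoizes its work: the replacement function name embeds a fingerprint of the eager op's cache key, so a composite op is expanded at most once per distinct attribute and device configuration, and later calls reuse the FunctionDef already registered on the context. The Python sketch below illustrates that naming-and-caching idea only; the hashing helper and the cache dictionary are stand-ins, not part of this change.

    import hashlib

    _expanded = {}  # function name -> expanded definition (stands in for the
                    # FunctionDefs registered on the EagerContext).


    def _expansion_name(op_name, attrs):
      # Stand-in for FingerprintCat64 over the op's cache key.
      digest = hashlib.sha256(repr(sorted(attrs.items())).encode()).hexdigest()
      return '_expanded_%s_%s' % (op_name, digest[:16])


    def expand_once(op_name, attrs, expand_fn):
      """Expands a composite op once per distinct attribute combination."""
      fname = _expansion_name(op_name, attrs)
      if fname not in _expanded:
        _expanded[fname] = expand_fn(op_name, attrs, fname)
      return fname, _expanded[fname]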
+"""Tests for tensorflow.compiler.mlir.tfr.integrattion.node_expansion.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.compiler.mlir.tfr.resources import gen_composite_ops +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import load_library +from tensorflow.python.framework import ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import test + +_lib_dir = os.path.dirname(gen_composite_ops.__file__) +_lib_name = os.path.basename(gen_composite_ops.__file__)[4:].replace( + '.py', '.so') +load_library.load_op_library(os.path.join(_lib_dir, _lib_name)) + + +class NodeExpansionTest(test.TestCase): + + def testAddN(self): + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + sq1 = gen_composite_ops.my_add_n([t1]) + sq2 = gen_composite_ops.my_add_n([t1, t2]) + sq3 = gen_composite_ops.my_add_n([t1, t2, t3]) + self.assertAllEqual(sq1.numpy().reshape(-1), [1, 2, 3, 4]) + self.assertAllEqual(sq2.numpy().reshape(-1), [2, 4, 6, 8]) + self.assertAllEqual(sq3.numpy().reshape(-1), [3, 6, 9, 12]) + + def testBiasedDense(self): + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[-10.0, -10.0], [-10.0, -10.0]]) + sq = gen_composite_ops.my_biased_dense(t1, t2, t3) + self.assertAllEqual(sq.numpy().reshape(-1), [-3, 0, 5, 12]) + + def testBiasedDenseRelu(self): + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[-10.0, -10.0], [-10.0, -10.0]]) + sq = gen_composite_ops.my_biased_dense(t1, t2, t3, act='relu') + self.assertAllEqual(sq.numpy().reshape(-1), [0, 0, 5, 12]) + + def testWithKnownKernel(self): + + def biasd_dense_elu(x, y, z): + dot = gen_composite_ops.my_biased_dense(x, y, z) + return nn_ops.elu(dot) # with known kernel, should not expand. + + t1 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t2 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + t3 = constant_op.constant([[-10.0, -10.0], [-10.0, -10.0]]) + sq = biasd_dense_elu(t1, t2, t3) + self.assertAllClose(sq.numpy().reshape(-1), [-0.950213, 0, 5, 12]) + + +if __name__ == '__main__': + os.environ['TF_MLIR_TFR_LIB_DIR'] = 'tensorflow/compiler/mlir/tfr/resources' + ops.enable_eager_execution() + test.main() diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc new file mode 100644 index 00000000000..61e96548579 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.cc @@ -0,0 +1,222 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" + +#include +#include + +#include "absl/strings/str_cat.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_attr.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" +#include "tensorflow/compiler/mlir/tfr/passes/passes.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace tfr { + +const char* const kTFRLibEnv = "TF_MLIR_TFR_LIB_DIR"; + +StatusOr> TFRDecomposeContext::Get( + mlir::MLIRContext* mlir_ctx) { + Env* env = Env::Default(); + std::string tfr_lib_dir; + TF_RETURN_IF_ERROR(ReadStringFromEnvVar( + kTFRLibEnv, "tensorflow/compiler/mlir/tfr/resources", &tfr_lib_dir)); + string composite_mlir_dir = io::JoinPath(env->GetRunfilesDir(), tfr_lib_dir); + std::vector files; + TF_RETURN_IF_ERROR(env->GetChildren(composite_mlir_dir, &files)); + if (files.empty()) { + return errors::Internal(absl::StrCat( + "Failed to find the decomposition lib from path ", composite_mlir_dir)); + } + std::string tfr_raw_text; + for (const auto& file : files) { + string fullpath = io::JoinPath(composite_mlir_dir, file); + if (env->MatchPath(fullpath, io::JoinPath(composite_mlir_dir, "*.mlir"))) { + std::string text; + TF_RETURN_IF_ERROR(ReadFileToString(env, fullpath, &text)); + tfr_raw_text.append(text); + } + } + + auto ctx = TFRDecomposeContext::GetFromText(tfr_raw_text, mlir_ctx); + if (!ctx) { + return errors::Internal(absl::StrCat( + "Failed to load the imported decomposition lib: ", tfr_raw_text)); + } + return ctx; +} + +std::unique_ptr TFRDecomposeContext::GetFromText( + StringPiece tfr_raw_text, mlir::MLIRContext* mlir_ctx) { + mlir_ctx->allowUnregisteredDialects(/*allow=*/true); + // Load dialects involved in the conversion + mlir::DialectRegistry& registry = 
mlir_ctx->getDialectRegistry(); + // clang-format off + registry.insert(); + // clang-format on + + // Load the TFR functions in a mlir::ModuleOp + auto memory_buffer = llvm::MemoryBuffer::getMemBuffer( + llvm::StringRef(tfr_raw_text.data(), tfr_raw_text.size())); + llvm::SourceMgr source_mgr; + source_mgr.AddNewSourceBuffer(std::move(memory_buffer), llvm::SMLoc()); + mlir::OwningModuleRef module = mlir::parseSourceFile(source_mgr, mlir_ctx); + // The MLIRContext owns the module + auto module_op = module.release(); + + // Create the context + return absl::make_unique(module_op); +} + +StatusOr TFRDecomposeContext::ExpandNode(const NodeDef& node_def, + StringPiece func_name) { + const OpDef* op_def; + TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node_def.op(), &op_def)); + DataTypeVector input_dtys, output_dtys; + TF_RETURN_IF_ERROR(InputTypesForNode(node_def, *op_def, &input_dtys)); + TF_RETURN_IF_ERROR(OutputTypesForNode(node_def, *op_def, &output_dtys)); + + mlir::MLIRContext* context = tfr_module_.getContext(); + llvm::SmallVector input_tys, output_tys; + mlir::Builder builder(context); + for (auto ty : input_dtys) { + mlir::Type elt_ty; + TF_RETURN_IF_ERROR(ConvertDataType(ty, builder, &elt_ty)); + mlir::TensorType mlir_ty = mlir::UnrankedTensorType::get(elt_ty); + input_tys.push_back(mlir_ty); + } + for (auto ty : output_dtys) { + mlir::Type elt_ty; + TF_RETURN_IF_ERROR(ConvertDataType(ty, builder, &elt_ty)); + mlir::TensorType mlir_ty = mlir::UnrankedTensorType::get(elt_ty); + output_tys.push_back(mlir_ty); + } + llvm::SmallVector attrs; + for (const auto& attr : node_def.attr()) { + TF_ASSIGN_OR_RETURN(auto mlir_attr, + ConvertAttributeValue(attr.second, &builder)); + attrs.push_back({mlir::Identifier::get(attr.first, context), mlir_attr}); + } + + mlir::Location loc = mlir::UnknownLoc::get(context); + mlir::ModuleOp module = mlir::ModuleOp::create(loc); + mlir::FunctionType func_type = + mlir::FunctionType::get(input_tys, output_tys, context); + llvm::StringRef func_name_str(func_name.data(), func_name.size()); + auto func = mlir::FuncOp::create(loc, func_name_str, func_type, {}); + module.push_back(func); + func.addEntryBlock(); + mlir::OpBuilder op_builder(func.getBody()); + + // Create the TF op + const std::string tf_op_full_name = absl::StrCat("tf.", node_def.op()); + mlir::OperationState op_state(loc, tf_op_full_name); + op_state.addOperands(func.getArguments()); + op_state.addTypes(output_tys); + op_state.addAttributes(attrs); + mlir::Operation* tf_op = op_builder.createOperation(op_state); + op_builder.create(loc, tf_op->getResults()); + + // Run the decompose passes on the module + TF_RETURN_IF_ERROR(DecomposeGraph(module)); + + // Export the result as a FunctionDef. + FunctionDef func_def; + TF_RETURN_IF_ERROR( + ConvertMlirFunctionToFunctionLibraryDef(func, export_confs_, &func_def)); + module.erase(); + return func_def; +} + +Status TFRDecomposeContext::DecomposeGraph(mlir::ModuleOp user_module) { + // Call the decompose passes by using the external symbol table. + if (failed(pm_.run(user_module))) { + return errors::Internal("Failed to run the decompose passes."); + } + return Status::OK(); +} + +// Constructor of the decompose context. +TFRDecomposeContext::TFRDecomposeContext(mlir::ModuleOp tfr_module) + : tfr_module_(tfr_module), pm_(tfr_module_.getContext()) { + mlir::OpPassManager& func_pm = pm_.nest(); + + // Prepare the imported graph. 
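+  // The passes below strip the tf_executor wrapper so the decompose and raise
+  // passes can operate on plain TF ops; the last two passes restore the
+  // executor form so the decomposed module can be exported again.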
+ func_pm.addPass(mlir::CreateExecutorDialectToFunctionalConversionPass()); + + // Run TFR lowering, inlining and raising to tf. + func_pm.addPass(mlir::TFR::CreateDecomposeTFOpsPass(tfr_module_)); + func_pm.addPass(mlir::TFR::CreateRaiseToTFOpsPass( + tfr_module_, /*materialize_derived_attrs=*/true)); + + // Prepare to be exported. + func_pm.addPass(mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm_.addPass(mlir::CreateBreakUpIslandsPass()); +} + +void TFRDecomposeContext::Destroy() { tfr_module_.erase(); } + +StatusOr ExpandNode(const NodeDef& node_def, + StringPiece func_name) { + mlir::MLIRContext mlir_ctx; + TF_ASSIGN_OR_RETURN(auto ctx, TFRDecomposeContext::Get(&mlir_ctx)); + return ctx->ExpandNode(node_def, func_name); +} + +Status DecomposeGraph(mlir::ModuleOp user_module) { + mlir::MLIRContext* mlir_ctx = user_module.getContext(); + TF_ASSIGN_OR_RETURN(auto ctx, TFRDecomposeContext::Get(mlir_ctx)); + return ctx->DecomposeGraph(user_module); +} + +} // namespace tfr +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h new file mode 100644 index 00000000000..6e33bbf0b0c --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ + +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace tfr { + +extern const char* const kTFRLibEnv; + +using stream_executor::port::StatusOr; + +// An wrapper for all the objects used to decompose a module (graph mode) and +// node_def (eager mode). Note that this class owns the decomposition library. +class TFRDecomposeContext { + public: + // The entry function to get a decompose context. All the required passes have + // been initialized. + static StatusOr> Get( + mlir::MLIRContext* mlir_ctx); + + // Constructor of the decompose context. To share the decompose library, the + // whole decompose TFR function library is loaded. + explicit TFRDecomposeContext(mlir::ModuleOp tfr_module); + + // Constructs the decompose context from the tfr text module and the mlir + // context. The tfr text module is added to the mlir context. + static std::unique_ptr GetFromText( + StringPiece tfr_raw_text, mlir::MLIRContext* mlir_ctx); + + // Decomposes the op in the NodeDef to a set of primitive ops according to the + // decompose library in the context. 
Wrap the decomposed result in a + // FunctionDef. + StatusOr ExpandNode(const NodeDef& node_def, + StringPiece func_name); + + // Runs the decompose passes on the user_module. + Status DecomposeGraph(mlir::ModuleOp user_module); + + // Erases the tfr_module created. + void Destroy(); + + private: + mlir::ModuleOp tfr_module_; + mlir::PassManager pm_; + + GraphExportConfig export_confs_; +}; + +// Decomposes the NodeDef to a set of primitive ops according to the decompose +// library loaded. Wrap the decomposed result in a FunctionDef. +StatusOr ExpandNode(const NodeDef& node_def, + StringPiece func_name); + +// Decomposes the ops in the ModuleOp to a set of primitive ops according to +// decompose library in the context. +Status DecomposeGraph(mlir::ModuleOp user_module); + +} // namespace tfr +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_INTEGRATION_TFR_DECOMPOSE_CTX_H_ diff --git a/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc new file mode 100644 index 00000000000..3d83b8d5535 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx_test.cc @@ -0,0 +1,162 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfr/integration/tfr_decompose_ctx.h" + +#include +#include + +#include "absl/types/span.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using testing::ElementsAreArray; +using testing::Test; +using NodeAndType = std::pair; + +namespace tensorflow { + +REGISTER_OP("MyAddN") + .Input("inputs: N * T") + .Output("sum: T") + .Attr("N: int >= 1") + .Attr("T: {numbertype, variant}") + .SetIsCommutative() + .SetIsAggregate() + .SetShapeFn(shape_inference::UnchangedShape); + +REGISTER_OP("RiscAdd") + .Input("x: T") + .Input("y: T") + .Output("z: T") + .Attr( + "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, " + "complex64, complex128, string}") + .SetShapeFn(shape_inference::UnchangedShape); + +namespace { + +constexpr char tfr_raw_text[] = R"( + +tfr.func @tf__my_add_n(%values: !tfr.tensor_list, + %n: i64 {tfr.name="N"}) -> !tfr.tensor { + %index = constant 0 : index + %cst = constant 1 : i64 + %eq = cmpi "eq", %n, %cst : i64 + %v1 = tfr.get_element %values[%index] : (!tfr.tensor_list, index) -> !tfr.tensor + %res = scf.if %eq -> !tfr.tensor { + scf.yield %v1 : !tfr.tensor + } else { + %step = index_cast %cst : i64 to index + %end = index_cast %n : i64 to index + %reduce = scf.for %i = %step to %end step %step iter_args(%reduce_iter=%v1) -> !tfr.tensor { + %v = tfr.get_element %values[%i] : (!tfr.tensor_list, index) -> !tfr.tensor + %reduce_next = tfr.call @tf__risc_add(%reduce_iter, %v) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor + scf.yield %reduce_next : !tfr.tensor + } + scf.yield %reduce : !tfr.tensor + } + tfr.return %res : !tfr.tensor +} + +tfr.func @tf__risc_add_(!tfr.tensor, !tfr.tensor) -> !tfr.tensor attributes{T} +)"; + +class TFRDecomposeContextTest : public Test { + protected: + void SetUp() override { + test_ctx_ = tfr::TFRDecomposeContext::GetFromText(tfr_raw_text, &ctx_); + } + + void TearDown() override { test_ctx_->Destroy(); } + + mlir::MLIRContext ctx_; + std::unique_ptr test_ctx_; +}; + +std::vector NodesSequenceOf(const FunctionDef& graph) { + std::vector nodes; + for (auto& node : graph.node_def()) { + nodes.push_back({node.op(), node.attr().at("T").type()}); + } + return nodes; +} + +TEST_F(TFRDecomposeContextTest, FLOAT_1_ins) { + std::vector src_list; + src_list.emplace_back("input", 0, DT_FLOAT); + NodeDef test_node; + auto status = NodeDefBuilder("float_add", "MyAddN") + .Input(src_list) + .Finalize(&test_node); + EXPECT_TRUE(status.ok()); + auto decomposed = test_ctx_->ExpandNode(test_node, "test"); + EXPECT_TRUE(decomposed.ok()); + std::vector expected_results{{"Identity", DT_FLOAT}}; + EXPECT_THAT(NodesSequenceOf(decomposed.ValueOrDie()), + ElementsAreArray(expected_results)); +} + 
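+// Illustrative sketch, not part of the original change: with two inputs the
+// scf.for loop in @tf__my_add_n runs exactly once, so a single RiscAdd is
+// expected. The test name and the expected node list are assumptions that
+// follow the pattern of the surrounding tests.
+TEST_F(TFRDecomposeContextTest, FLOAT_2_ins_Sketch) {
+  std::vector<NodeDefBuilder::NodeOut> src_list;
+  src_list.emplace_back("in0", 0, DT_FLOAT);
+  src_list.emplace_back("in1", 0, DT_FLOAT);
+  NodeDef test_node;
+  auto status = NodeDefBuilder("float_add_2", "MyAddN")
+                    .Input(src_list)
+                    .Finalize(&test_node);
+  EXPECT_TRUE(status.ok());
+  auto decomposed = test_ctx_->ExpandNode(test_node, "test");
+  EXPECT_TRUE(decomposed.ok());
+  // One RiscAdd combines the two inputs; no Identity is needed.
+  std::vector<NodeAndType> expected_results{{"RiscAdd", DT_FLOAT}};
+  EXPECT_THAT(NodesSequenceOf(decomposed.ValueOrDie()),
+              ElementsAreArray(expected_results));
+}
+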
+TEST_F(TFRDecomposeContextTest, FLOAT_3_ins) { + std::vector src_list; + src_list.emplace_back("in0", 0, DT_FLOAT); + src_list.emplace_back("in1", 0, DT_FLOAT); + src_list.emplace_back("in2", 0, DT_FLOAT); + NodeDef test_node; + auto status = NodeDefBuilder("float_add_3", "MyAddN") + .Input(src_list) + .Finalize(&test_node); + EXPECT_TRUE(status.ok()); + auto decomposed = test_ctx_->ExpandNode(test_node, "test"); + EXPECT_TRUE(decomposed.ok()); + + std::vector expected_results{{"RiscAdd", DT_FLOAT}, + {"RiscAdd", DT_FLOAT}}; + EXPECT_THAT(NodesSequenceOf(decomposed.ValueOrDie()), + ElementsAreArray(expected_results)); +} + +TEST_F(TFRDecomposeContextTest, INT32_3_ins) { + std::vector src_list; + src_list.emplace_back("in0", 0, DT_INT32); + src_list.emplace_back("in1", 0, DT_INT32); + src_list.emplace_back("in2", 0, DT_INT32); + NodeDef test_node; + auto status = + NodeDefBuilder("int_add", "MyAddN").Input(src_list).Finalize(&test_node); + EXPECT_TRUE(status.ok()); + auto decomposed = test_ctx_->ExpandNode(test_node, "test"); + EXPECT_TRUE(decomposed.ok()); + + std::vector expected_results{{"RiscAdd", DT_INT32}, + {"RiscAdd", DT_INT32}}; + EXPECT_THAT(NodesSequenceOf(decomposed.ValueOrDie()), + ElementsAreArray(expected_results)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc new file mode 100644 index 00000000000..c0ef5c3b387 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc @@ -0,0 +1,590 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" + +#include +#include + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/FunctionImplementation.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_types.h" + +namespace mlir { + +namespace TFR { + +//===----------------------------------------------------------------------===// +// InlinerInterface +//===----------------------------------------------------------------------===// + +namespace { +/// This class defines the interface for inlining within the TFR dialect. +struct TFRInlinerInterface : public DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + + // Returns true if the given region 'src' can be inlined into the region + // 'dest' that is attached to an operation registered to the current dialect. + bool isLegalToInline(Region *dest, Region *src, + BlockAndValueMapping &) const final { + return true; + } + + // Returns true if the given operation 'op', that is registered to this + // dialect, can be inlined into the region 'dest' that is attached to an + // operation registered to the current dialect. + bool isLegalToInline(Operation *op, Region *dest, + BlockAndValueMapping &) const final { + return true; + } + + // Handle the given inlined terminator by replacing it with a new operation + // as necessary. Required when the region has only one block. + void handleTerminator(Operation *op, + ArrayRef valuesToRepl) const final { + auto retValOp = dyn_cast(op); + if (!retValOp) return; + + for (auto ret_value : llvm::zip(valuesToRepl, retValOp.operands())) { + std::get<0>(ret_value).replaceAllUsesWith(std::get<1>(ret_value)); + } + } + + // Attempts to materialize a conversion for a type mismatch between a call + // from this dialect, and a callable region. This method should generate an + // operation that takes 'input' as the only operand, and produces a single + // result of 'resultType'. If a conversion can not be generated, nullptr + // should be returned. 
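+  // For the TFR dialect this materializes a tfr.cast: when the callee expects
+  // a TFR type but the call site passes a builtin tensor value, the cast
+  // created below bridges the two types so inlining can proceed.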
+ Operation *materializeCallConversion(OpBuilder &builder, Value input, + Type result_type, + Location conversion_loc) const final { + if (!result_type.isa()) return nullptr; + return builder.create(conversion_loc, result_type, input); + } +}; +} // namespace + +//===----------------------------------------------------------------------===// +// TFR Dialect +//===----------------------------------------------------------------------===// + +TFRDialect::TFRDialect(MLIRContext *context) + : Dialect(/*name=*/"tfr", context, TypeID::get()) { + addTypes(); + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc.inc" + >(); + + addInterfaces(); +} + +bool TFRType::classof(Type type) { + return llvm::isa(type.getDialect()); +} + +//===----------------------------------------------------------------------===// +// Custom op methods +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(ConstantTensorOp op) { + auto input_type = op.arg().getType(); + auto output_type = op.out().getType(); + + if (auto output_tensor_type = output_type.dyn_cast()) { + return success(); + } + + auto output_tensor_type = output_type.dyn_cast(); + if (!output_tensor_type || !output_tensor_type.hasStaticShape()) { + op.emitError("output type should be static and ranked."); + return failure(); + } + + if (output_tensor_type.getRank() == 0) { + bool same_scalar = output_tensor_type.getElementType() == input_type; + if (!same_scalar) { + op.emitError("input and output should have the same scalar types."); + } + return success(same_scalar); + } + + if (auto input_vector_type = input_type.dyn_cast()) { + bool same_element_type = output_tensor_type.getElementType() == + input_vector_type.getElementType(); + bool same_shape = + output_tensor_type.getShape() == input_vector_type.getShape(); + if (!same_element_type || !same_shape) { + op.emitError("input and output should have same shape and element type."); + } + return success(same_element_type && same_shape); + } + + op.emitError("input can not be converted to an output tensor."); + return failure(); +} + +static LogicalResult Verify(TFRFuncOp func) { + // Collect all attribute names used by the tensor and tensor list arguments + // and returns. Also, collect the names of all the attribute arguments as the + // defined list. Later on, the used attribute names will be verified to be in + // the defined list. + llvm::SmallVector used_attrs; + + // While scanning the arguments, record the start/end indices of each argument + // type, so the order can be verified as well. + // TODO(fengliuai): the attribute arguments with default values need to be + // at the end? 
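+  // A valid signature therefore orders its arguments as, for example,
+  //   (!tfr.tensor, !tfr.tensor, !tfr.tensor_list, i32 {tfr.name="N"})
+  // i.e. tensors first, then at most one tensor list, then attributes.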
+ int first_tensor = -1, last_tensor = -1, first_tensor_list = -1, + last_tensor_list = -1, first_attr = -1; + + for (auto arg : llvm::enumerate(func.getType().getInputs())) { + Type arg_type = arg.value(); + + if (auto tensor = arg_type.dyn_cast()) { + if (first_tensor == -1) { + first_tensor = arg.index(); + } + last_tensor = arg.index(); + auto used = tensor.getAttrKeys(); + used_attrs.append(used.begin(), used.end()); + continue; + } + + if (auto tensor_list = arg_type.dyn_cast()) { + if (first_tensor_list == -1) { + first_tensor_list = arg.index(); + } + last_tensor_list = arg.index(); + auto used = tensor_list.getAttrKeys(); + used_attrs.append(used.begin(), used.end()); + continue; + } + + if (!arg_type.isa()) { + if (first_attr == -1) { + first_attr = arg.index(); + } + auto name = + func.getArgAttrOfType(arg.index(), kAttrArgumentNameAttr); + if (!name) { + func.emitError( + llvm::Twine(arg.index()) + + " attribute argument doesn't have a tfr.name attribute."); + return failure(); + } + continue; + } + + func.emitError("Builtin TensorType isn't allowed as the argument."); + return failure(); + } + + // Verify the argument order: tensors, tensor list, attributes; and also + // verify there is at most one tensor list argument. + if (first_tensor_list != -1 && first_tensor_list < last_tensor) { + func.emitError( + "tfr.tensor argument should be before tfr.tensor_list argument."); + return failure(); + } + if (first_attr != -1 && first_attr < last_tensor_list) { + func.emitError( + "tfr.tensor_list argument should be before non tensor arguments."); + return failure(); + } + if (first_tensor_list != last_tensor_list) { + func.emitError("More than one tfr.tensor_list argument isn't allowed."); + return failure(); + } + + // Verify the result order: tensor, tensor list, and also verify at most one + // tensor list result. + bool seen_tensor_list = false; + for (auto result_type : func.getType().getResults()) { + if (auto tensor = result_type.dyn_cast()) { + if (seen_tensor_list) { + func.emitError( + "tfr.tensor result should be before tfr.tensor_list result."); + return failure(); + } + auto used = tensor.getAttrKeys(); + used_attrs.append(used.begin(), used.end()); + continue; + } + + if (auto tensor_list = result_type.dyn_cast()) { + if (seen_tensor_list) { + func.emitError("More than one tfr.tensor_list result isn't allowed."); + return failure(); + } + seen_tensor_list = true; + auto used = tensor_list.getAttrKeys(); + used_attrs.append(used.begin(), used.end()); + continue; + } + + func.emitError( + "None tfr.tensor/tfr.tensor_list results aren't allowed as a " + "result."); + return failure(); + } + + // Verify that all the used attributes are in the attribute arguments. 
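+  // For example, a !tfr.tensor<T> argument uses the attribute key "T", so the
+  // function must also define "T" (e.g. attributes{T} on an external
+  // tfr.func).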
+ llvm::SmallVector undefined_attrs; + for (auto attr : used_attrs) { + if (!func.getAttr(attr.getValue())) { + undefined_attrs.push_back(attr); + } + } + if (!undefined_attrs.empty()) { + llvm::SmallVector attr_names(undefined_attrs.size()); + std::transform(undefined_attrs.begin(), undefined_attrs.end(), + attr_names.begin(), + [](StringAttr attr) { return attr.getValue().str(); }); + func.emitError(llvm::Twine("Undefined attributes are used: ", + llvm::join(attr_names, ","))); + return failure(); + } + + return success(); +} + +static ParseResult ParseFuncOp(OpAsmParser &parser, OperationState *result) { + auto build_func_type = [](Builder &builder, ArrayRef arg_types, + ArrayRef results, impl::VariadicFlag, + std::string &) { + return builder.getFunctionType(arg_types, results); + }; + return impl::parseFunctionLikeOp(parser, *result, /*allowVariadic=*/false, + build_func_type); +} + +static void PrintFuncOp(OpAsmPrinter &p, TFRFuncOp op) { + FunctionType fn_type = op.getType(); + impl::printFunctionLikeOp(p, op, fn_type.getInputs(), /*isVariadic=*/false, + fn_type.getResults()); +} + +} // namespace TFR +} // namespace mlir + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc.inc" + +namespace mlir { +namespace TFR { +struct ConvertConstToTensorConst : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(ConstantTensorOp cst_tensor_op, + PatternRewriter &rewriter) const override { + Location loc = cst_tensor_op.getLoc(); + Type out_type = cst_tensor_op.getType(); + Operation *new_cst = nullptr; + + ArrayAttr array; + if (matchPattern(cst_tensor_op.arg(), m_Constant(&array))) { + llvm::DenseSet all_types; + for (auto it : array) { + all_types.insert(it.getType()); + } + if (all_types.size() != 1) return failure(); + ShapedType new_out_type = RankedTensorType::get( + {static_cast(array.size())}, *all_types.begin()); + DenseElementsAttr attr = + DenseElementsAttr::get(new_out_type, array.getValue()); + new_cst = rewriter.create(loc, new_out_type, attr); + if (out_type.isa()) { + new_cst = rewriter.create(loc, out_type, new_cst->getResult(0)); + } + rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0)); + return success(); + } + + Attribute scalar; + if (matchPattern(cst_tensor_op.arg(), m_Constant(&scalar))) { + Type new_out_type = RankedTensorType::get({}, scalar.getType()); + new_cst = rewriter.create(loc, new_out_type, scalar); + if (out_type.isa()) { + new_cst = rewriter.create(loc, out_type, new_cst->getResult(0)); + } + rewriter.replaceOp(cst_tensor_op, new_cst->getResult(0)); + return success(); + } + return failure(); + } +}; + +struct RemoveRedundantCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(CastOp cast_op, + PatternRewriter &rewriter) const override { + auto preceding_cast = + llvm::dyn_cast_or_null(cast_op.arg().getDefiningOp()); + if (!preceding_cast) { + return failure(); + } + Value input = preceding_cast.arg(); + Type input_type = input.getType(); + Type output_type = cast_op.getType(); + + // If the two types are the same, the back-to-back tfr.cast ops can be + // removed. 
+ if (input_type == output_type || output_type.isa()) { + rewriter.replaceOp(cast_op, {input}); + return success(); + } + + // If the rank of the input tensor isn't ranked, we replace the pair + // with tf.EnsureShape op so it can be removed after shape inference or + // confirmed at runtime. + if (input_type.isa() && output_type.isa()) { + auto shape = output_type.cast().getShape(); + auto shape_attr = TF::ShapeAttr::get(rewriter.getContext(), shape); + rewriter.replaceOpWithNewOp(cast_op, output_type, + input, shape_attr); + } + + return success(); + } +}; + +struct GetTensorShape : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GetShapeOp shape_op, + PatternRewriter &rewriter) const override { + Operation *preceding_op = shape_op.arg().getDefiningOp(); + if (auto cast_op = llvm::dyn_cast_or_null(preceding_op)) { + // replace this pair by shape.shape_of, so the folding works. + rewriter.replaceOpWithNewOp(shape_op, cast_op.arg()); + return success(); + } + return failure(); + } +}; + +struct RemoveRedundantGetElement : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(GetElementOp ge_op, + PatternRewriter &rewriter) const override { + IntegerAttr index; + if (!matchPattern(ge_op.index(), m_Constant(&index))) { + return failure(); + } + auto preceding_build_list = llvm::dyn_cast_or_null( + ge_op.tensor_list().getDefiningOp()); + if (!preceding_build_list || + preceding_build_list.getNumOperands() <= index.getInt()) { + return failure(); + } + Value input = preceding_build_list.getOperand(index.getInt()); + Type output_type = ge_op.getType(); + if (input.getType() != output_type && + !output_type.isa()) { + return failure(); + } + rewriter.replaceOp(ge_op, {input}); + return success(); + } +}; + +struct BuildConstantListAsAttr : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BuildListOp bl_op, + PatternRewriter &rewriter) const override { + SmallVector array_list; + array_list.reserve(bl_op.getNumOperands()); + for (const auto &operand : bl_op.getOperands()) { + Attribute array_elt; + if (!matchPattern(operand, m_Constant(&array_elt))) { + return failure(); + } + array_list.push_back(array_elt); + } + auto array_attr = rewriter.getArrayAttr(array_list); + rewriter.replaceOpWithNewOp(bl_op, array_attr); + return success(); + } +}; + +void ConstantTensorOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +void CastOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +void GetShapeOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +void GetElementOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + +void BuildListOp::getCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +OpFoldResult TFR::EqualOp::fold(ArrayRef operands) { + assert(operands.size() == 2 && "equal op has two operands"); + auto ctx = getContext(); + if (operands[0] == operands[1]) return BoolAttr::get(/*value=*/true, ctx); + return BoolAttr::get(/*value=*/false, ctx); +} + +OpFoldResult ConstOp::fold(ArrayRef operands) { + assert(operands.empty() && "constant has no operands"); + + // Return the held attribute 
value. + return value(); +} + +// CallableOpInterface +Region *TFRFuncOp::getCallableRegion() { + return isExternal() ? nullptr : &body().front(); +} + +// CallableOpInterface +ArrayRef TFRFuncOp::getCallableResults() { + return getType().getResults(); +} + +//===----------------------------------------------------------------------===// +// Dialect type definitions +//===----------------------------------------------------------------------===// + +// Parses a TFR type. +// tfr_type ::= tensor_type | tensor_list_type | attr_type +// string_list ::= `[` string-literal (, string-literal)+ `]` +// tensor_type ::= `tensor` +// | `tensor<` (string-literal | string_list) '>' +// tensor_list_type ::= `tensor_list` +// | `tensor_list<` (string-literal | string_list) '>' +// attr_type ::= `attr` +Type TFRDialect::parseType(DialectAsmParser &parser) const { + Location loc = parser.getEncodedSourceLoc(parser.getNameLoc()); + MLIRContext *ctx = loc.getContext(); + + StringRef typeNameSpelling; + if (failed(parser.parseKeyword(&typeNameSpelling))) return {}; + llvm::SmallVector attrs; + if (succeeded(parser.parseOptionalLess())) { + bool l_square_parsed = false; + if (succeeded(parser.parseOptionalLSquare())) { + l_square_parsed = true; + } + + do { + StringRef attr; + if (failed(parser.parseKeyword(&attr))) return {}; + attrs.push_back(StringAttr::get(attr, ctx)); + } while (succeeded(parser.parseOptionalComma())); + + if (l_square_parsed && failed(parser.parseRSquare())) { + parser.emitError(parser.getNameLoc(), "expected ']'"); + } + + if (failed(parser.parseGreater())) { + parser.emitError(parser.getNameLoc(), "expected '>'"); + } + } + + if (typeNameSpelling == "tensor") { + return TFRTensorType::getChecked(attrs, loc); + } else if (typeNameSpelling == "tensor_list") { + return TFRTensorListType::getChecked(attrs, loc); + } else if (typeNameSpelling == "attr") { + return TFRAttrType::getChecked(loc); + } else { + parser.emitError(parser.getNameLoc(), "unknown type " + typeNameSpelling); + return {}; + } +} + +void TFRDialect::printType(Type type, DialectAsmPrinter &os) const { + llvm::ArrayRef attrs; + + if (type.isa()) { + os << "attr"; + return; + } + if (auto tensor_ty = type.dyn_cast()) { + attrs = tensor_ty.getAttrKeys(); + os << "tensor"; + } else if (auto tensor_list_ty = type.dyn_cast()) { + attrs = tensor_list_ty.getAttrKeys(); + os << "tensor_list"; + } else { + llvm_unreachable("Unhandled tfr type"); + } + + if (attrs.empty()) return; + os << "<"; + + if (attrs.size() > 1) { + os << "["; + } + + llvm::interleaveComma(attrs, os, + [&](StringAttr attr) { os << attr.getValue(); }); + + if (attrs.size() > 1) { + os << "]"; + } + os << ">"; +} + +} // namespace TFR +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.h b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.h new file mode 100644 index 00000000000..cb36ee28351 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_OPS_H_ + +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/FunctionSupport.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ControlFlowInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +constexpr char kAttrArgumentNameAttr[] = "tfr.name"; +constexpr char kAttrArgumentDefaultAttr[] = "tfr.default"; + +class TFRDialect : public Dialect { + public: + explicit TFRDialect(MLIRContext *context); + + static StringRef getDialectNamespace() { return "tfr"; } + + // Parse a type registered to this dialect. + Type parseType(DialectAsmParser &parser) const override; + + // Prints a type registered to this dialect. + void printType(Type ty, DialectAsmPrinter &os) const override; +}; + +} // namespace TFR +} // namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td new file mode 100644 index 00000000000..562b3f79955 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td @@ -0,0 +1,435 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This is the operation definition file for TFR + +#ifndef DIALECT_TFR_OPS_ +#define DIALECT_TFR_OPS_ + +include "mlir/Dialect/Shape/IR/ShapeBase.td" +include "mlir/IR/OpBase.td" +include "mlir/IR/SymbolInterfaces.td" +include "mlir/Interfaces/CallInterfaces.td" +include "mlir/Interfaces/ControlFlowInterfaces.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" + +//===----------------------------------------------------------------------===// +// Dialect +//===----------------------------------------------------------------------===// + +def TFR_Dialect : Dialect { + let name = "tfr"; + + let description = [{ + The TensorFlow Composition dialect. 
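+    It provides the types (tfr.tensor, tfr.tensor_list, tfr.attr) and the ops
+    (tfr.func, tfr.call, tfr.cast, ...) used to express a TF op as a
+    composition of other TF ops.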
+ }]; + + let cppNamespace = "::mlir::TFR"; +} + +//===----------------------------------------------------------------------===// +// Type classes +//===----------------------------------------------------------------------===// + +// tensor argument types +class TFR_Type : DialectType()">, + "TFR " # name #" type">, + BuildableType<"$_builder.getType()">; +def TFR_TensorType : TFR_Type<"TFRTensor">; +def TFR_TensorListType : TFR_Type<"TFRTensorList">; +def TFR_AllTensorTypes : Type, "all tensor related types">; + +// attribute argument types +def TFR_AttrType : TFR_Type<"TFRAttr">; +def TFR_AttrScalarType: TypeAlias; +def TFR_AttrVectorType : VectorOf<[TF_ElementType, TFR_AttrType]>; +def TFR_AllAttrTypes : Type, "all attribute related types">; + +// all allowed arguments types +def TFR_allowedArgType : Type, "allowed tfr.call operand types">; + +def TFR_allowedConstValues : Attr, "allowed tfr.constant value"> { + let storageType = "Attribute"; + let returnType = "Attribute"; + let convertFromStorage = "$_self"; + let constBuilderCall = "$0"; +} + +// all allowed result types +def TFR_allowedResultType : TypeAlias; + +// standard tensor type and tfr.tensor types can be casted to each other. +def TFR_singleTensorType : Type, "single tensor or tfr.tensor type">; + +// all allowed build list input types +def TFR_allowedBuiltListType : Type, "single tfr.tensor or tensor element type">; + +// all allowed build list result types +def TFR_allowedListResultType : Type, "tfr.tensor_list or tfr.attr type">; + +//===----------------------------------------------------------------------===// +// Op classes +//===----------------------------------------------------------------------===// + +class TFR_Op traits> : + Op; + +def TFR_CallOp : TFR_Op<"call", [CallOpInterface]> { + let description = [{ + The `call` operation represents a direct call to a function that is within + the same symbol scope as the callee. The operands and result types of the + call must match the specified function type. The callee is encoded as a + symbol reference attribute named "callee". + + Example: + + ```mlir + %2 = tfr.call @my_add(%0, %1) : (tfr.tensor, f32) -> tfr.tensor_list + ``` + + Note that the operands of the `call` operation can only be with tfr.tensor, + tfr.tensor_list, tfr.attr and mlir float and integer types. The results of + the `call` operation can only be with tfr.tensor and tfr.tensor_list types. + }]; + + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$args); + + let results = (outs + Variadic:$outs); + + let extraClassDeclaration = [{ + StringRef getCallee() { return callee(); } + + // Get the argument operands to the called function. + operand_range getArgOperands() { return args(); } + + // Return the callee of this operation. + CallInterfaceCallable getCallableForCallee() { return calleeAttr(); } + }]; + + let assemblyFormat = [{ + $callee `(` $args `)` attr-dict `:` functional-type($args, results) + }]; +} + +def TFR_CastOp : TFR_Op<"cast", [NoSideEffect]> { + let description = [{ + The `cast` operation converts the operand with built-in tensor type to + tfr.tensor type, or vice versa. + + Example: + + ```mlir + %1 = tfr.cast(%0) : tensor -> !tfr.tensor + %3 = tfr.cast(%1) : !tfr.tensor -> tensor + ``` + }]; + + let arguments = (ins TFR_singleTensorType:$arg); + + let results = (outs TFR_singleTensorType:$out); + + let extraClassDeclaration = [{ + // Return element type of the input tensor type. Only available when the + // input is a MLIR built-in tensor type. 
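+    // For example, casting from a value of type tensor<2xf32> yields the
+    // TypeAttr for f32; casting from a !tfr.tensor value returns a null
+    // attribute.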
+ Attribute getInputElementType() { + if (auto ty = arg().getType().dyn_cast()) { + return TypeAttr::get(ty.getElementType()); + } + return {}; + } + }]; + + let hasCanonicalizer = 1; +} + +def TFR_GetShapeOp : TFR_Op<"get_shape", [NoSideEffect]> { + let description = [{ + The `get_shape` operation gets the shape of a tfr.tensor and returns + !shape.shape type. + + Example: + + ```mlir + %1 = "tfr.get_shape"(%0) : !tfr.tensor -> !shape.shape + %1 = tfr.get_shape %0 -> !shape.shape + ``` + }]; + + let arguments = (ins TFR_TensorType:$arg); + + let results = (outs Shape_ShapeType:$out); + + let assemblyFormat = "$arg attr-dict `->` type($out)"; + + let hasCanonicalizer = 1; +} + +def TFR_GetElementTypeOp : TFR_Op<"get_element_type", [NoSideEffect]> { + let description = [{ + The `get_element_type` operation gets the element type of a tfr.tensor and + returns !tfr.attr. + + Example: + + ```mlir + %1 = "tfr.get_element_type"(%0) : !tfr.tensor -> !tfr.attr + %1 = tfr.get_element_type %0 -> !tfr.attr + ``` + }]; + + let arguments = (ins TFR_TensorType:$arg); + + let results = (outs TFR_AttrType:$out); + + let assemblyFormat = "$arg attr-dict `->` type($out)"; +} + +def TFR_EqualOp : TFR_Op<"equal", [NoSideEffect, SameTypeOperands]> { + let description = [{ + The `equal` operation compares the values of the tfr.attr type arguments. + The operation returns an i1 boolean indicating if the two values are the + same. + Example: + + ```mlir + %x = tfr.equal %lhs, %rhs -> i1 + %x = "tfr.equal"(%lhs, %rhs) : (!tfr.attr, !tfr.attr) -> i1 + ``` + }]; + + let arguments = (ins + TFR_AttrType:$lhs, + TFR_AttrType:$rhs + ); + let results = (outs BoolLike:$result); + + let hasFolder = 1; + + let assemblyFormat = "$lhs `,` $rhs attr-dict `->` type($result)"; +} + +def TFR_ConstOp : TFR_Op<"constant", [ConstantLike, NoSideEffect]> { + let description = [{ + The `attr` operation stores TF op's attribute, which doesn't support + arithmetic operations. + + Example: + + ```mlir + %1 = "tfr.constant"() { value: i32 } : () -> !tfr.attr + %2 = "tfr.constant"() { value: [i32, f32] } : () -> !tfr.attr + %3 = tfr.constant [i32, f32] -> !tfr.attr + %4 = tfr.constant f32 -> !tfr.attr + ``` + }]; + + let arguments = (ins TFR_allowedConstValues:$value); + + let results = (outs TFR_AttrType:$out); + + let hasFolder = 1; + + let builders = [OpBuilder<"Attribute value", + [{ + auto* ctx = value.getContext(); + $_state.addAttribute("value", value); + $_state.addTypes(TFRAttrType::get(ctx)); + }]> + ]; + + let assemblyFormat = [{ + $value attr-dict `->` type($out) + }]; +} + +def TFR_ConstantTensorOp : TFR_Op<"constant_tensor", [NoSideEffect]> { + let description = [{ + The `constant_tensor` operation converts the operand with non-built-in + tensor type to built-in tensor type or tfr.tensor type. If it is built-in + tensor type, the shape shouldn't be changed during the conversion. + + Example: + + ```mlir + %1 = tfr.contant_tensor(%0) : f32 -> tensor + %3 = tfr.contant_tensor(%2) : vector<1xf32> -> tensor<1xf32> + ``` + }]; + + let arguments = (ins TFR_AllAttrTypes:$arg); + + let results = (outs TFR_singleTensorType:$out); + + let hasCanonicalizer = 1; + + let verifier = [{ return Verify(*this); }]; +} + +def TFR_GetElementOp : TFR_Op<"get_element", [NoSideEffect]> { + let description = [{ + The `get_element` operation extracts one tfr.tensor element from a + tfr.tensor_list. 
+ + Example: + + ```mlir + %2 = tfr.get_element %1[%0] : (tfr.tensor, index) -> tfr.tensor + ``` + }]; + + let arguments = (ins + TFR_TensorListType:$tensor_list, + Index:$index); + + let results = (outs TFR_TensorType:$out); + + let hasCanonicalizer = 1; + + let assemblyFormat = [{ + $tensor_list `[` $index `]` attr-dict `:` + `(` type($tensor_list) `,` type($index) `)` `->` type($out) + }]; +} + +def TFR_BuildListOp : TFR_Op<"build_list", [NoSideEffect]> { + let description = [{ + The `build_list` operation builds a tensor list from a list of tensors, or + an tfr.attr from a list of scalars. + + Example: + + ```mlir + %3 = tfr.build_list(%2, %1, %0) : + (tfr.tensor, tfr.tensor, tfr.tensor) -> tfr.tensor_list + %3 = tfr.build_list(%2, %1, %0) : (i32, i32, i32) -> tfr.attr + ``` + }]; + + let arguments = (ins Variadic:$tensors); + + let results = (outs TFR_allowedListResultType:$out); + + let hasCanonicalizer = 1; +} + +//===----------------------------------------------------------------------===// +// Function related classes +//===----------------------------------------------------------------------===// + +def TFR_TFRFuncOp : TFR_Op<"func", [HasParent<"ModuleOp">, + DeclareOpInterfaceMethods, + FunctionLike, + IsolatedFromAbove, Symbol]> { + let summary = "TFR Function defines a composition of other ops"; + + let description = [{ + Defines a function that can be used to decompose an TF function call to + the invocation of a set of other TF ops. + + Syntax: + + ``` + op ::= `tfr.func` symbol-ref-id `(` argument-list `)` (`->` + function-result-list)? function-attributes? region + ``` + + Example: + + ```mlir + tfr.func @foo(%arg0: !tfr.tensor, %arg1: !tfr.tensor_list, + %arg2: int {tfr.name="T", tfr.default=1}) + attributes {qux: "quux"} { + tfr.return + } + ``` + + Note the arguments are ordered by the following rule: + tfr.tensor > tfr.tensor_list > tfr.attr/i32/..., + and only one trfr.tensor_list argument is allowed. + }]; + + let arguments = (ins + TypeAttr:$type, + StrAttr:$sym_name + ); + + let results = (outs); + + // When the regions is empty, the tfr.func is an external function and used + // to model the element type constraints of the tf op. Otherwise, there is one + // region containing the composition. + let regions = (region VariadicRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"StringRef name, FunctionType type, " + "ArrayRef attrs = {}"> + ]; + + let extraClassDeclaration = [{ + // FunctionLike trait needs access to the functions below. + friend class OpTrait::FunctionLike; + + // Hooks for the input/output type enumeration in FunctionLike . + unsigned getNumFuncArguments() { return getType().getNumInputs(); } + unsigned getNumFuncResults() { return getType().getNumResults(); } + }]; + + let verifier = [{ return Verify(*this); }]; + let parser = [{ return ParseFuncOp(parser, &result); }]; + let printer = [{ PrintFuncOp(p, *this); }]; +} + +def TFR_TFRReturnOp : TFR_Op<"return", [HasParent<"TFRFuncOp">, NoSideEffect, + ReturnLike, Terminator]> { + let description = [{ + A terminator operation for regions that appear in the body of `tfr.func` + functions. The operands to the `tfr.return` are the result values returned + by an invocation of the `tfr.func`. + + Note that only the tfr.tensor and tfr.tensor_list can be returned. 
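+
+    Example:
+
+    ```mlir
+    tfr.return %res : !tfr.tensor
+    ```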
+ }]; + + let arguments = (ins Variadic:$operands); + + let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?"; +} + +#endif // DIALECT_TFR_OPS_ diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_types.h b/tensorflow/compiler/mlir/tfr/ir/tfr_types.h new file mode 100644 index 00000000000..4bda8f34658 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_types.h @@ -0,0 +1,115 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeSupport.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +class TFRType : public Type { + public: + using Type::Type; + + static bool classof(Type type); +}; + +namespace detail { + +struct TFRTypeStorage final + : public TypeStorage, + public llvm::TrailingObjects { + using KeyTy = ArrayRef; + + explicit TFRTypeStorage(unsigned num_attrs) : num_attrs(num_attrs) {} + + static TFRTypeStorage* construct(TypeStorageAllocator& allocator, KeyTy key) { + // Allocate a new storage instance. + auto byteSize = TFRTypeStorage::totalSizeToAlloc(key.size()); + auto rawMem = allocator.allocate(byteSize, alignof(TFRTypeStorage)); + auto result = ::new (rawMem) TFRTypeStorage(key.size()); + + // Copy in the string attributes into the trailing storage. 
+ std::uninitialized_copy(key.begin(), key.end(), + result->getTrailingObjects()); + return result; + } + + bool operator==(const KeyTy& attrs) const { return attrs == GetAttrs(); } + + KeyTy GetAttrs() const { + return {getTrailingObjects(), num_attrs}; + } + + unsigned num_attrs; +}; + +template +class TFRTypeImpl : public Type::TypeBase { + public: + using Base = Type::TypeBase; + using TFRBase = TFRTypeImpl; + using Base::Base; + + static Derived get(ArrayRef attrs, MLIRContext* context) { + return Base::get(context, attrs); + } + + static Derived getChecked(ArrayRef attrs, Location loc) { + return Base::getChecked(loc, attrs); + } + + static Derived get(MLIRContext* context) { return get({}, context); } + + // TODO(fengliuai): fix the implementation + static LogicalResult verifyConstructionInvariants( + Location loc, ArrayRef attrs) { + return success(); + } + + ArrayRef getAttrKeys() { return Base::getImpl()->GetAttrs(); } +}; +} // namespace detail + +class TFRTensorType : public detail::TFRTypeImpl { + public: + using TFRBase::TFRBase; + static std::string getTypeName() { return "TFRTensorType"; } +}; + +class TFRTensorListType : public detail::TFRTypeImpl { + public: + using TFRBase::TFRBase; + static std::string getTypeName() { return "TFRTensorListType"; } +}; + +class TFRAttrType : public Type::TypeBase { + public: + using Base::Base; + static std::string getTypeName() { return "TFRAttrType"; } +}; + +} // namespace TFR +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ diff --git a/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc new file mode 100644 index 00000000000..d399a10a35e --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/passes/canonicalize.cc @@ -0,0 +1,160 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "mlir/Transforms/LoopUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" +#include "tensorflow/compiler/mlir/tfr/passes/passes.h" + +//===----------------------------------------------------------------------===// +// Canonicalization patterns for the scf.for and scf.if ops. They are used to +// optimize the control flow in the tfr function. Technically, both patterns +// should be upstreamed to be part of the op definition. +// TODO(fengliuai): sync with the llvm upstream for both patterns. +// +namespace mlir { +namespace TFR { + +namespace { + +struct UnrollSCFForOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(scf::ForOp for_op, + PatternRewriter &rewriter) const override { + Location loc = for_op.getLoc(); + APInt lower_bound, upper_bound, step; + if (!matchPattern(for_op.lowerBound(), m_ConstantInt(&lower_bound)) || + !matchPattern(for_op.upperBound(), m_ConstantInt(&upper_bound)) || + !matchPattern(for_op.step(), m_ConstantInt(&step))) { + return failure(); + } + uint64_t trip_count = (upper_bound - lower_bound).sdiv(step).getZExtValue(); + if (trip_count <= 0) return failure(); + + // TODO(fengliuai): use loopUnrollByFactor once the iter_arg is supported + + Block *single_block = for_op.getBody(); + BlockAndValueMapping mapping; + Value iv = for_op.getInductionVar(); + for (auto iter_op : + llvm::zip(for_op.getRegionIterArgs(), for_op.initArgs())) { + mapping.map(std::get<0>(iter_op), std::get<1>(iter_op)); + } + mapping.map(iv, for_op.lowerBound()); + for (auto i = 0; i < trip_count; ++i) { + if (!iv.use_empty()) { + // iv' = iv + step * i; + Value iter = rewriter.create(loc, i); + Value step_cst = + rewriter.create(loc, step.getSExtValue()); + Value stride = rewriter.create(loc, step_cst, iter); + Value iv_unroll = + rewriter.create(loc, mapping.lookup(iv), stride); + mapping.map(iv, iv_unroll); + } + + Operation *terminator_op; + for (auto it = single_block->begin(); it != single_block->end(); ++it) { + terminator_op = rewriter.clone(*it, mapping); + } + // Map the block arguments to the yield results. + for (auto iter_op : llvm::zip(for_op.getRegionIterArgs(), + terminator_op->getOperands())) { + mapping.map(std::get<0>(iter_op), std::get<1>(iter_op)); + } + rewriter.eraseOp(terminator_op); + } + SmallVector returned; + for (Value arg : for_op.getRegionIterArgs()) { + returned.push_back(mapping.lookup(arg)); + } + rewriter.replaceOp(for_op, returned); + return success(); + } +}; + +// TODO(fengliuai): up stream this pattern. 
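+// Folds an scf.if whose condition is a known constant: the taken region is
+// inlined at the scf.if and the op is erased; a false condition with no else
+// region simply removes the op. Non-constant conditions are left untouched.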
+struct SimplifySCFIfOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(scf::IfOp if_op, + PatternRewriter &rewriter) const override { + // Then branch + if (matchPattern(if_op.condition(), m_NonZero())) { + return InlineRegion(if_op.getLoc(), rewriter, if_op, &if_op.thenRegion()); + } + + // Else branch + if (matchPattern(if_op.condition(), m_Zero())) { + if (if_op.elseRegion().empty()) { + // Remove the op + rewriter.eraseOp(if_op); + return success(); + } else { + return InlineRegion(if_op.getLoc(), rewriter, if_op, + &if_op.elseRegion()); + } + } + + // Not a constant condition + return failure(); + } + + private: + LogicalResult InlineRegion(Location loc, PatternRewriter &rewriter, + Operation *inline_point, Region *region) const; +}; + +LogicalResult SimplifySCFIfOp::InlineRegion(Location loc, + PatternRewriter &rewriter, + Operation *inline_point, + Region *region) const { + InlinerInterface interface(loc.getContext()); + if (failed(inlineRegion(interface, region, inline_point, {}, + inline_point->getResults(), loc, + /*shouldCloneInlinedRegion=*/true))) { + return failure(); + } + + // If the inlining was successful then erase the scf.if op. + rewriter.eraseOp(inline_point); + return success(); +} + +} // namespace + +void populateSCFOpsCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context) { + results.insert(context); +} + +} // namespace TFR +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfr/passes/decompose.cc b/tensorflow/compiler/mlir/tfr/passes/decompose.cc new file mode 100644 index 00000000000..9265437cca9 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/passes/decompose.cc @@ -0,0 +1,280 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_types.h" +#include "tensorflow/compiler/mlir/tfr/passes/passes.h" +#include "tensorflow/compiler/mlir/tfr/utils/utils.h" + +//===----------------------------------------------------------------------===// +// The pass to decompose unregistered TF ops with the TFR compose function. +// +namespace mlir { +namespace TFR { + +namespace { + +// Decompose the TF ops with the registered composition library. +struct DecomposeTFOpsPass + : public PassWrapper { + + explicit DecomposeTFOpsPass(llvm::Optional external_tfr_module) + : external_tfr_module(external_tfr_module) {} + + void runOnFunction() override; + + private: + // Apply canonicalization, mainly constant folding, on the function. + void ApplyCanonicalization(); + + // Rewrite unregistered TF ops to TFR func call ops. Return failure if all the + // ops are registered or the compose function doesn't exist. + LogicalResult RewriteUnregisteredTFOps(); + + // Inline the TFR func call ops. + LogicalResult InlineTFRFuncCalls(); + + // Optional external symbol table to look up the TFR function. + llvm::Optional external_tfr_module; +}; + +void DecomposeTFOpsPass::ApplyCanonicalization() { + OwningRewritePatternList patterns; + + auto* context = &getContext(); + for (auto* op : context->getRegisteredOperations()) { + op->getCanonicalizationPatterns(patterns, context); + } + populateSCFOpsCanonicalizationPatterns(patterns, context); + + applyPatternsAndFoldGreedily(getFunction(), patterns); +} + +LogicalResult DecomposeTFOpsPass::RewriteUnregisteredTFOps() { + FuncOp func = getFunction(); + SymbolTable table(external_tfr_module.hasValue() + ? *external_tfr_module + : func.getParentOfType()); + OpBuilder builder(func); + bool changed = false; + func.walk([&table, &builder, &changed](Operation* op) { + // Only the un-registered ops requires decomposition. The remaining ones + // either will be constant folded or lowered by the rules defined in the + // bridge. 
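+    // Purely illustrative sketch (op and function names are made up): an
+    // unregistered "tf.MyCustomOp"(%x, %y) {act = "relu"} whose composition
+    // is defined by a tfr.func @tf__my_custom_op is rewritten roughly into
+    //   %x_t = "tfr.cast"(%x) ... -> !tfr.tensor
+    //   %y_t = "tfr.cast"(%y) ... -> !tfr.tensor
+    //   %act = ... constant materialized from the "act" attribute ...
+    //   %r   = tfr.call @tf__my_custom_op(%x_t, %y_t, %act)
+    // and the tfr.call is later inlined by InlineTFRFuncCalls.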
+ if (op->isRegistered()) { + return; + } + + // Find out the compose function + auto compose_func_name = GetComposeFuncName(op->getName().getStringRef()); + auto compose_func = table.lookup(compose_func_name); + if (!compose_func || compose_func.isExternal()) { + // There are no decomposition methods defined for this op, skip. + return; + } + + auto compose_func_type = compose_func.getType(); + builder.setInsertionPoint(op); + TFRTensorType unconstrainted_tensor_type = builder.getType(); + + // Create the new operands. This is mapping the operands from the target + // TF ops to the TFR function arguments. If the TFR function argument is + // a tensor_list, a "tfr.build_list" op is used to concat the available + // TF op operands. If the TFR function argument isn't a tensor/tensor_list, + // a constant is created by using the attribute stored in the TF op or the + // default value in the argument attribute. + llvm::SmallVector new_operands; + for (auto arg : llvm::enumerate(compose_func_type.getInputs())) { + if (auto tensor_type = arg.value().dyn_cast()) { + auto casted = builder.create(op->getLoc(), tensor_type, + op->getOperand(arg.index())); + new_operands.push_back(casted); + } else if (auto list_type = arg.value().dyn_cast()) { + llvm::SmallVector variadic_operands; + for (int i = arg.index(); i < op->getNumOperands(); i++) { + auto casted = builder.create( + op->getLoc(), unconstrainted_tensor_type, op->getOperand(i)); + variadic_operands.push_back(casted); + } + auto build_list_op = builder.create( + op->getLoc(), list_type, variadic_operands); + new_operands.push_back(build_list_op.out()); + } else { + auto attr_name = compose_func.getArgAttrOfType( + arg.index(), kAttrArgumentNameAttr); + auto attribute = op->getAttr(attr_name.getValue()); + if (!attribute) { + attribute = + compose_func.getArgAttr(arg.index(), kAttrArgumentDefaultAttr); + } + Value attr_cst; + // Wrap these special attributes as a special TFR constant, so the SSA + // value has a valid type to be used as TFR function argument. These + // attributes are not expected to be manipulated by the lowering passes. + if (attribute.isa() || attribute.isa() || + attribute.isa() || attribute.isa()) { + TFRAttrType output_type = TFRAttrType::get(builder.getContext()); + attr_cst = + builder.create(op->getLoc(), output_type, attribute); + } else { + attr_cst = builder.create(op->getLoc(), attribute); + } + new_operands.push_back(attr_cst); + } + } + + // Create the TFR call op + auto new_op = builder.create( + op->getLoc(), compose_func_type.getResults(), + builder.getSymbolRefAttr(compose_func.getName()), new_operands); + + // Replace the use of the old op. This is mapping the results from the + // target TF ops to the TFR function returns. If the TFR function return is + // a tensor_list, "tfr.get_element" op is used to extract the required TF + // op result. 
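+    // Hedged sketch of the result mapping (signature is hypothetical): for a
+    // compose function returning (!tfr.tensor, !tfr.tensor_list), the first
+    // TF result is taken directly from the first call result, and every
+    // remaining TF result is extracted from the returned list with a
+    // tfr.get_element op at index 0, 1, ..., before a final tfr.cast back to
+    // the original TF result type.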
+ llvm::SmallVector new_results; + for (auto res : llvm::enumerate(compose_func_type.getResults())) { + if (res.value().dyn_cast()) { + new_results.push_back(new_op.getResult(res.index())); + } else if (auto list_type = res.value().dyn_cast()) { + for (int i = res.index(), j = 0; i < op->getNumResults(); i++, j++) { + auto index = + builder.create(op->getLoc(), builder.getIndexAttr(j)); + auto element_op = builder.create( + op->getLoc(), unconstrainted_tensor_type, + new_op.getResult(res.index()), index.getResult()); + new_results.push_back(element_op.out()); + } + } + } + for (auto res : llvm::zip(op->getResults(), new_results)) { + auto casted = builder.create( + op->getLoc(), std::get<0>(res).getType(), std::get<1>(res)); + std::get<0>(res).replaceAllUsesWith(casted.out()); + } + op->erase(); + changed |= true; + }); + + // If `changed` is false, it is considered as a failure, so the recursive + // rewrite will stop. + return success(changed); +} + +LogicalResult DecomposeTFOpsPass::InlineTFRFuncCalls() { + // The Inliner will automatically use the registered dialect inliner. + InlinerInterface inliner(&getContext()); + FuncOp func = getFunction(); + SymbolTable table(external_tfr_module.hasValue() + ? *external_tfr_module + : func.getParentOfType()); + + // The inliner only inlines the TFR call op. + bool changed = false; + auto walk_result = func.walk([&](CallOp call_op) { + auto callee = table.lookup(call_op.callee()); + if (!callee || callee.isExternal()) return WalkResult::advance(); + if (failed(inlineCall(inliner, + cast(call_op.getOperation()), + cast(callee.getOperation()), + callee.getCallableRegion(), + /**shouldCloneInLinedRegion=*/true))) { + // This failure is usually because the decompose function is not defined. + // This call will be raised to TF ops. + return WalkResult::interrupt(); + } + call_op.erase(); + changed |= true; + return WalkResult::advance(); + }); + + if (walk_result.wasInterrupted()) { + signalPassFailure(); + return failure(); + } + + // If `changed` is false, it is considered as a failure, so the recursive + // rewrite will stop. + return success(changed); +} + +void DecomposeTFOpsPass::runOnFunction() { + // Set a maximum iteration threshold in case there are infinite loops in the + // call stack. + int max_iterators = 10; + do { + // canonicalization + ApplyCanonicalization(); + + // rewrite unregistered tf ops. Failed either because no ops can be + // decomposed or the compose function isn't defined. + auto rewrite_status = RewriteUnregisteredTFOps(); + // inline the tfr call op until there are no tfr.call op can be inlined. + auto inline_status = InlineTFRFuncCalls(); + + if (failed(rewrite_status) && failed(inline_status)) { + break; + } + } while (max_iterators-- >= 0); +} + +} // namespace + +// Creates an instance of the pass to decompose the TF ops. +std::unique_ptr> CreateDecomposeTFOpsPass( + llvm::Optional tfr_module) { + return std::make_unique(tfr_module); +} + +static PassRegistration pass( + "tfr-decompose", + "Decompose TF ops with the registered composition library.", + [] { return CreateDecomposeTFOpsPass(); }); + +} // namespace TFR +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfr/passes/passes.h b/tensorflow/compiler/mlir/tfr/passes/passes.h new file mode 100644 index 00000000000..5c27d81ace8 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/passes/passes.h @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_PASSES_H_ + +#include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +void populateSCFOpsCanonicalizationPatterns(OwningRewritePatternList &results, + MLIRContext *context); + +// Decompose ops. +std::unique_ptr> CreateDecomposeTFOpsPass( + llvm::Optional tfr_module = llvm::None); + +// Raise to TF ops. +std::unique_ptr> CreateRaiseToTFOpsPass( + llvm::Optional tfr_module = llvm::None, + bool materialize_derived_attrs = false); + +} // namespace TFR +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc new file mode 100644 index 00000000000..f3fe9618c62 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc @@ -0,0 +1,474 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "absl/memory/memory.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_types.h" +#include "tensorflow/compiler/mlir/tfr/passes/passes.h" +#include "tensorflow/compiler/mlir/tfr/utils/utils.h" + +//===----------------------------------------------------------------------===// +// The pass to rewrite the TFR function call ops by TF ops. The callee of the +// TFR function call defines the signatures of the TF ops. +// +namespace mlir { +namespace TFR { + +namespace { + +// This pattern is to rewrite the "tfr.call" op and the "tfr.cast" ops on the +// operands by a TF op with "tfr.cast" ops on the results. The result type of +// the new TF op is an unranked tensor with element type derived. +class RewriteTFRCallOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + public: + explicit RewriteTFRCallOp(MLIRContext* context, const SymbolTable& table, + bool materialize_derived_attrs) + : OpRewritePattern(context), + symbol_table_(table), + materialize_derived_attrs_(materialize_derived_attrs) {} + + LogicalResult matchAndRewrite(CallOp call_op, + PatternRewriter& rewriter) const override; + + private: + // Derives the attribute values for the attributes attached to the + // `input_tfr_type`. These attributes are only for the element type of the + // inputs, and these type information has been collected in the `input_types`. + // The result is stored in `derived_attrs` as the named attributes. Returns + // failure if the attributes stored in the `input_tfr_type` violates the + // assumptions. 
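+  // For example (values are illustrative): a !tfr.tensor<T> input whose
+  // actual element type is f32 contributes {T: f32}; a !tfr.tensor_list<N, T>
+  // input built from three operands contributes {N: 3, T: <type of the first
+  // element>}; a !tfr.tensor_list<Tlist> input contributes
+  // {Tlist: [<element types>]}.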
+ LogicalResult AddDerivedAttrs( + PatternRewriter& rewriter, Type input_tfr_type, + ArrayRef input_types, + llvm::StringMap* derived_attrs) const; + + // Collects the operands and attributes for the TF op. At the same time, it + // collects all the derived attribute values to derive the output types of the + // TF op. + LogicalResult CollectInputsAndAttributes( + PatternRewriter& rewriter, TFRFuncOp signature, CallOp call_op, + SmallVectorImpl* inputs, NamedAttrList* arg_attrs, + llvm::StringMap* derived_attrs) const; + + // Uses the collected attribute values to derive all the output types. + LogicalResult DeriveOutputTypes(FunctionType signature, + const llvm::StringMap& attrs, + SmallVectorImpl* output_types) const; + + // Creates the TF op and also the necessary tfr.cast ops to replace the + // original TFR call op. + LogicalResult CreateAndReplaceOp( + PatternRewriter& rewriter, CallOp call_op, + const SmallVectorImpl& output_types, + const SmallVectorImpl& inputs, const NamedAttrList& attr_list, + const llvm::StringMap& derived_attrs) const; + + // Adds a tf.Cast op if the tfr.tensor attribute indicated a fixed element + // type. + // TODO(fengliuai): This method is required when the operand types are not set + // by the frontend correctly. + Value CastToNonDerivedType(PatternRewriter& rewriter, Location loc, + CastOp cast_op, Type input_tfr_type) const { + auto tensor_type = input_tfr_type.dyn_cast(); + if (!tensor_type) return cast_op.arg(); + + auto attr_names = tensor_type.getAttrKeys(); + if (attr_names.empty() || attr_names.size() > 1) return cast_op.arg(); + StringRef tfr_type_attr = attr_names[0].getValue(); + if (!fixed_elt_type_attrs_.contains(tfr_type_attr)) return cast_op.arg(); + + Type result_elt_type; + if (tfr_type_attr == "i32_") { + result_elt_type = rewriter.getI32Type(); + } else if (tfr_type_attr == "i64_") { + result_elt_type = rewriter.getI64Type(); + } else if (tfr_type_attr == "f32_") { + result_elt_type = rewriter.getF32Type(); + } else if (tfr_type_attr == "i1_") { + result_elt_type = rewriter.getI1Type(); + } else { + return cast_op.arg(); + } + + Type original_input_type = + cast_op.getInputElementType().cast().getValue(); + if (result_elt_type != original_input_type) { + UnrankedTensorType result_type = UnrankedTensorType::get(result_elt_type); + return rewriter.create(loc, result_type, cast_op.arg()); + } + return cast_op.arg(); + } + + // For variadic operands, we have to enforce them to use the same types. + // TODO(fengliuai): This method is required when the operand types are not set + // by the frontend correctly. 
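+  // A hedged example: if the first variadic operand has element type f32 and
+  // a later operand has element type i32, the later value is wrapped in a
+  // tf.Cast to f32 so that all elements of the N*T pack share the first
+  // element's type.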
+ void CastValuesToSameType(PatternRewriter& rewriter, Location loc, + const llvm::SmallVectorImpl& input_types, + llvm::SmallVectorImpl& input_values) const { + if (input_types.size() <= 1) return; + + Type target_input_type = input_types[0].cast().getValue(); + auto result_type = UnrankedTensorType::get(target_input_type); + for (auto i = 1; i < input_types.size(); ++i) { + Type current_input_type = input_types[i].cast().getValue(); + if (current_input_type != target_input_type) { + input_values[i] = + rewriter.create(loc, result_type, input_values[i]); + } + } + } + + const SymbolTable& symbol_table_; + const bool materialize_derived_attrs_; + const llvm::SmallDenseSet fixed_elt_type_attrs_{"i32_", "i64_", + "f32_", "i1_"}; +}; + +LogicalResult RewriteTFRCallOp::AddDerivedAttrs( + PatternRewriter& rewriter, Type input_tfr_type, + ArrayRef input_types, + llvm::StringMap* derived_attrs) const { + // If there is an attribute associated to the input in the signature, we + // store it as an derived attribute. + if (auto tensor_type = input_tfr_type.dyn_cast()) { + auto attr_names = tensor_type.getAttrKeys(); + if (attr_names.empty()) return success(); + + if (attr_names.size() == 1) { + derived_attrs->insert({attr_names[0].getValue(), input_types[0]}); + return success(); + } + } + + // If there is an attribute associated to the input in the signature, + // we store it as an derived attribute. + if (auto list_type = input_tfr_type.dyn_cast()) { + auto attr_names = list_type.getAttrKeys(); + if (attr_names.empty()) return success(); + + // N*T case + if (attr_names.size() == 2) { + derived_attrs->insert({attr_names[0].getValue(), + rewriter.getI32IntegerAttr(input_types.size())}); + // Note that this uses the first element of the list to infer the T value. + // A tf.Cast is required to cast the other inputs to the same type. + derived_attrs->insert({attr_names[1].getValue(), input_types[0]}); + return success(); + } + + // list(dtype) case + if (attr_names.size() == 1) { + derived_attrs->insert( + {attr_names[0].getValue(), rewriter.getArrayAttr(input_types)}); + return success(); + } + } + + return failure(); +} + +LogicalResult RewriteTFRCallOp::CollectInputsAndAttributes( + PatternRewriter& rewriter, TFRFuncOp signature, CallOp call_op, + SmallVectorImpl* inputs, NamedAttrList* arg_attrs, + llvm::StringMap* derived_attrs) const { + for (const auto& operand : llvm::enumerate(signature.getType().getInputs())) { + // If the index is larger than the operand number of the call_op, the + // default value of the operand needs to be used. + if (operand.index() >= call_op.getNumOperands()) { + auto attr_name = signature.getArgAttrOfType( + operand.index(), kAttrArgumentNameAttr); + auto attr_value = + signature.getArgAttr(operand.index(), kAttrArgumentDefaultAttr); + arg_attrs->push_back( + rewriter.getNamedAttr(attr_name.getValue(), attr_value)); + continue; + } + + // The index is valid for the call_op. + Value input = call_op.getOperand(operand.index()); + Operation* input_op = input.getDefiningOp(); + auto input_tfr_type = signature.getType().getInputs()[operand.index()]; + + // There are three cases for the preceding input_op: + + // 1. The preceding op can be a tfr.cast op, which will be fused to the + // current op, so the result op has input with tensor type. 
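+    // (Hedged example: if the operand is produced by %t = tfr.cast %arg0,
+    // then %arg0 itself becomes an operand of the raised TF op and its
+    // element type is recorded as a derived attribute.)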
+ if (auto cast_op = dyn_cast_or_null(input_op)) { + Value input_to_cast = CastToNonDerivedType(rewriter, call_op.getLoc(), + cast_op, input_tfr_type); + inputs->push_back(input_to_cast); + if (failed(AddDerivedAttrs(rewriter, input_tfr_type, + {cast_op.getInputElementType()}, + derived_attrs))) { + return failure(); + } + continue; + } + + // 2. The preceding op is a tfr.build_list op, which collects multiple + // values with tensor types via the tfr.cast ops. These ops will be fused + // to the current op as well, so all the tfr.cast op inputs will be inputs + // to the result op. + if (auto list_op = dyn_cast_or_null(input_op)) { + // Find out all the inputs to the build list op + // TODO(fengliuai): make build_list op only take tensor argument + llvm::SmallVector list_input_types; + llvm::SmallVector list_inputs; + for (auto list_input : list_op.getOperands()) { + auto cast_op = dyn_cast_or_null(list_input.getDefiningOp()); + if (!cast_op) return failure(); + list_inputs.push_back(cast_op.arg()); + list_input_types.push_back(cast_op.getInputElementType()); + } + CastValuesToSameType(rewriter, call_op.getLoc(), list_input_types, + list_inputs); + inputs->append(list_inputs.begin(), list_inputs.end()); + if (failed(AddDerivedAttrs(rewriter, input_tfr_type, list_input_types, + derived_attrs))) { + return failure(); + } + continue; + } + + // 3. The preceding op is a constant, thus the value of this constant is + // used to create an attribute of the result op, according to the signature. + Attribute arg_value; + // A failure indicates the argument isn't a constant value, so we should + // not use it as an attribute. + if (!matchPattern(input, m_Constant(&arg_value))) { + return failure(); + } + auto attr_name = signature.getArgAttrOfType( + operand.index(), kAttrArgumentNameAttr); + arg_attrs->push_back( + rewriter.getNamedAttr(attr_name.getValue(), arg_value)); + } + return success(); +} + +// For each output, uses the attribute name associated to the tfr types to find +// out the attribute value from the collected `attrs` and create the output type +// of the result op by using the attribute value as the element type. +LogicalResult RewriteTFRCallOp::DeriveOutputTypes( + FunctionType signature, const llvm::StringMap& attrs, + SmallVectorImpl* output_types) const { + for (auto res : llvm::enumerate(signature.getResults())) { + if (auto tensor_type = res.value().dyn_cast()) { + // tfr.tensor should only have one attribute attached. 
+ auto attr_key = tensor_type.getAttrKeys().front(); + output_types->push_back(UnrankedTensorType::get( + attrs.lookup(attr_key.getValue()).cast().getValue())); + continue; + } + + if (auto list_type = res.value().dyn_cast()) { + // There are two cases: N*T or list(dtype) + auto attr_keys = list_type.getAttrKeys(); + // N*T case + if (attr_keys.size() == 2) { + // The first one is N, and the second one is T + int list_size = + attrs.lookup(attr_keys[0].getValue()).cast().getInt(); + Type list_type = + attrs.lookup(attr_keys[1].getValue()).cast().getValue(); + for (int i = 0; i < list_size; ++i) { + output_types->push_back(UnrankedTensorType::get(list_type)); + } + continue; + } + // TODO(fengliuai): list(dtype) case + } + return failure(); + } + return success(); +} + +LogicalResult RewriteTFRCallOp::CreateAndReplaceOp( + PatternRewriter& rewriter, CallOp call_op, + const SmallVectorImpl& output_types, + const SmallVectorImpl& inputs, const NamedAttrList& attr_list, + const llvm::StringMap& derived_attrs) const { + // Create the new op + Location loc = call_op.getLoc(); + rewriter.setInsertionPointAfter(call_op); + std::string tf_op_name = GetTFOpName(call_op.callee()); + OperationState new_state(loc, tf_op_name, inputs, output_types, attr_list); + Operation* new_op = rewriter.createOperation(new_state); + if (materialize_derived_attrs_) { + for (const auto& attr : derived_attrs) { + // Add or update the derived attribute with the value. Skip the fixed + // element type attributes, in case they are present in the NodeDef. + if (!fixed_elt_type_attrs_.contains(attr.first())) { + new_op->setAttr(attr.first(), attr.second); + } + } + } + + // Create the tfr.cast ops on the results and replace the uses of the + // original call op. + TFRTensorType unconstrainted_type = rewriter.getType(); + SmallVector new_results; + for (auto res : llvm::enumerate(call_op.getResultTypes())) { + Type res_type = res.value(); + if (res_type.dyn_cast()) { + Value new_res = new_op->getResult(res.index()); + auto casted = rewriter.create(loc, res_type, new_res); + new_results.push_back(casted.out()); + } else if (auto list_type = res.value().dyn_cast()) { + SmallVector tensor_list; + for (int i = res.index(); i < new_op->getNumResults(); i++) { + Value new_res = new_op->getResult(i); + auto casted = + rewriter.create(loc, unconstrainted_type, new_res); + tensor_list.push_back(casted.out()); + } + auto list_op = rewriter.create(loc, res_type, tensor_list); + new_results.push_back(list_op.out()); + } + } + rewriter.replaceOp(call_op, new_results); + return success(); +} + +LogicalResult RewriteTFRCallOp::matchAndRewrite( + CallOp call_op, PatternRewriter& rewriter) const { + // Get the func op and verify that it is external. The type of this external + // func op is used as the signature of the corresponding TF ops. All the + // external func ops have the trailing underscore. + std::string external_callee_name = call_op.callee().str().append("_"); + TFRFuncOp func = symbol_table_.lookup(external_callee_name); + if (!func || !func.isExternal()) return failure(); + // Get the inputs and attributes. The attributes include these from the + // argument list and also these derived from the inputs. + SmallVector inputs; + NamedAttrList argument_attrs; + llvm::StringMap derived_attrs; + if (failed(CollectInputsAndAttributes(rewriter, func, call_op, &inputs, + &argument_attrs, &derived_attrs))) { + return failure(); + } + + // Derive the output types. 
The result type is derived by using the + // attributes attched to the result type of the signature. The attribute + // value should be either in the attribute argument list or the derived + // attribute from the input tensors. All the result type + // are unranked, and shape inference should be applied afterwards. + SmallVector output_types; + + // Merge the attributes from the argument list to the derived ones. + for (auto& attr : argument_attrs) { + derived_attrs.insert({attr.first, attr.second}); + } + + // Derive the output types by using the attributes attached to the tfr + // types. + if (failed(DeriveOutputTypes(func.getType(), derived_attrs, &output_types))) { + return failure(); + } + + // Create the new op and replace the old TFR call op. + return CreateAndReplaceOp(rewriter, call_op, output_types, inputs, + argument_attrs, derived_attrs); +} + +// Raise TFR call ops to the TF ops. +struct RaiseToTFOpsPass : public PassWrapper { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + + explicit RaiseToTFOpsPass(llvm::Optional tfr_module, + bool materialize_derived_attrs) + : external_tfr_module(tfr_module), + materialize_derived_attrs(materialize_derived_attrs) {} + + void runOnFunction() override; + + private: + llvm::Optional external_tfr_module; + const bool materialize_derived_attrs; +}; + +void RaiseToTFOpsPass::runOnFunction() { + FuncOp func = getFunction(); + MLIRContext* ctx = &getContext(); + SymbolTable table(external_tfr_module.hasValue() + ? *external_tfr_module + : func.getParentOfType()); + + OwningRewritePatternList patterns; + patterns.insert(ctx, table, materialize_derived_attrs); + for (auto* op : ctx->getRegisteredOperations()) { + op->getCanonicalizationPatterns(patterns, ctx); + } + + applyPatternsAndFoldGreedily(func, patterns); +} +} // namespace + +// Creates an instance of the pass to raise TFR call ops to the TF ops. +std::unique_ptr> CreateRaiseToTFOpsPass( + llvm::Optional tfr_module, bool materialize_derived_attrs) { + return std::make_unique(tfr_module, + materialize_derived_attrs); +} + +static PassRegistration pass( + "tfr-raise-to-tf", "Raise all the TFR call ops to TF ops.", + [] { return CreateRaiseToTFOpsPass(); }); + +} // namespace TFR +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfr/passes/tfr_opt.cc b/tensorflow/compiler/mlir/tfr/passes/tfr_opt.cc new file mode 100644 index 00000000000..8f06f278369 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/passes/tfr_opt.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" + +int main(int argc, char **argv) { + tensorflow::InitMlir y(&argc, &argv); + + mlir::registerAllPasses(); + + mlir::DialectRegistry registry; + registry.insert(); + return failed(mlir::MlirOptMain(argc, argv, "TFR Pass Driver\n", registry)); +} diff --git a/tensorflow/compiler/mlir/tfr/python/composite.py b/tensorflow/compiler/mlir/tfr/python/composite.py new file mode 100644 index 00000000000..7f558ce2fe7 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/composite.py @@ -0,0 +1,56 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Op composition registration.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +# TODO(fengliuai): add the tf_export decrator +class Composite(object): + """A decorator to register a function as a composition for an TF operator. + + The argument to the decorator must be the name of a TF raw operator the + function composites for. Decorated function must take positional arguments + which corresponds to the input and attributes in OpDef of the TF operation. + # TODO(fengliuai): more documents here. + + Example: + @composite.Composite('AddN') + def _compose_add_n(inputs, N): + if N == 1: + .... + """ + + # TODO(fengliuai): support input_binding and output_binding so the arguments + # are not positional. + def __init__(self, + op_name, + inputs=None, + attrs=None, + derived_attrs=None, + outputs=None): + self._op_name = op_name + self._inputs = inputs + self._attrs = attrs + self._derived_attrs = derived_attrs + self._outputs = outputs + + def __call__(self, compose_fn): + # TODO(fengliuai): more sanity check of the input function and make sure + # the bounded arguments of the function matches the 'inputs' and 'attrs'. + setattr(compose_fn, '_tfr_op_name', self._op_name) + return compose_fn diff --git a/tensorflow/compiler/mlir/tfr/python/op_reg_gen.py b/tensorflow/compiler/mlir/tfr/python/op_reg_gen.py new file mode 100644 index 00000000000..99b2dfdedc4 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/op_reg_gen.py @@ -0,0 +1,147 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""op_reg_gen: Generate op registration code from composite op code.""" + +# pylint: disable=invalid-name +# pylint: disable=missing-function-docstring +# pylint: disable=g-direct-tensorflow-import + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast as ast + +from tensorflow.python.autograph.pyct import transformer +from tensorflow.python.autograph.pyct import transpiler +from tensorflow.python.framework import op_def_registry +from tensorflow.python.util import tf_inspect + +_COMPOSITE_ARG_LIST = ['op_name', 'inputs', 'attrs', 'derived_attrs', 'outputs'] + + +class OpRegGenImpl(transformer.CodeGenerator): + """Visit the AST and generate C++ op registration functions.""" + + def __init__(self, ctx): + super(OpRegGenImpl, self).__init__(ctx) + self.ctx = ctx + + def visit_Name(self, node): + return node.id + + def visit_Constant(self, node): + return node.value + + def visit_keyword(self, node): + return node.arg, self.visit(node.value) + + def visit_List(self, node): + return [self.visit(cst) for cst in node.elts] + + def visit_arguments(self, node): + return [self.visit(arg) for arg in node.args] + + def visit_FunctionDef(self, node): + # TODO(fengliuai): create one utility method to match different apis and + # shared it with the tfr_gen.py module. + compose_dec = [] + for dec in node.decorator_list: + if isinstance(dec, ast.Call): + if isinstance(dec.func, ast.Attribute) and dec.func.attr == 'Composite': + compose_dec.append(dec) + if isinstance(dec.func, ast.Name) and dec.func.id == 'Composite': + compose_dec.append(dec) + + if not compose_dec: + # skip a non-composition function + return + elif len(compose_dec) > 1: + raise KeyError('More than one TF ops decomposes for.') + + all_dec_args = {} + for arg_name, arg_value in zip(_COMPOSITE_ARG_LIST, compose_dec[0].args): + all_dec_args[arg_name] = self.visit(arg_value) + + kw_dec_args = dict([self.visit(kw) for kw in compose_dec[0].keywords]) + + if all_dec_args.keys() & kw_dec_args.keys(): + raise KeyError('More arguments than expected.') + + all_dec_args.update(kw_dec_args) + + op_name = all_dec_args['op_name'] + op_def = op_def_registry.get(op_name) + if op_def: + if len(all_dec_args) > 1: + # Op has been registered, so it is a user error to specify op def. + raise ValueError('Op has been registered: ' + op_name) + else: + # Op has been registered, then we don't need to generate register code. + return + + # Validates the function inputs match what are in the decorator. 
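+    # Purely illustrative example (hypothetical op): a decorator such as
+    #   @Composite('MyPad', inputs=['x: T'], attrs=['mode: string'],
+    #              derived_attrs=['T: numbertype'], outputs=['y: T'])
+    # leads the code below to emit roughly
+    #   REGISTER_OP("MyPad")
+    #       .Input("x: T")
+    #       .Attr("mode: string")
+    #       .Attr("T: numbertype")
+    #       .Output("y: T");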
+ inputs = all_dec_args.get('inputs', []) + attrs = all_dec_args.get('attrs', []) + expected_args = [arg.split(':')[0] for arg in inputs + attrs] + all_func_args = self.visit(node.args) + + if len(expected_args) != len(all_func_args): + raise KeyError('Composition arguments do not match the registration.') + + cxx_reg_code = '\nREGISTER_OP("{0}")'.format(op_name) + for input_ in inputs: + cxx_reg_code += '\n .Input("{0}")'.format(input_) + for attr in attrs: + py_str = attr.replace('"', '\'') + cxx_reg_code += '\n .Attr("{0}")'.format(py_str) + for attr in all_dec_args.get('derived_attrs', []): + py_str = attr.replace('"', '\'') + cxx_reg_code += '\n .Attr("{0}")'.format(py_str) + for output_ in all_dec_args.get('outputs', []): + cxx_reg_code += '\n .Output("{0}")'.format(output_) + cxx_reg_code += ';\n' + self.emit(cxx_reg_code) + + +class OpRegGen(transpiler.GenericTranspiler): + """Transforms Python objects into TFR MLIR source code.""" + + def transform_ast(self, node, ctx): + gen = OpRegGenImpl(ctx) + gen.visit(node) + return gen.code_buffer + + +def op_reg_gen(func): + """Parse a function and emit the TFR functions.""" + op_reg_code, _ = OpRegGen().transform(func, None) + return op_reg_code + + +def gen_register_op(source, method_prefix=None): + """Parse a python code and emit the TFR functions from a target class.""" + mlir_funcs = [ + op_reg_gen(func) + for name, func in tf_inspect.getmembers(source, tf_inspect.isfunction) + if not method_prefix or name.startswith(method_prefix) + ] + headers = r""" +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + """ + code = '\n'.join(mlir_funcs) + return headers + code + '} // namespace tensorflow\n' diff --git a/tensorflow/compiler/mlir/tfr/python/op_reg_gen_test.py b/tensorflow/compiler/mlir/tfr/python/op_reg_gen_test.py new file mode 100644 index 00000000000..6392015ba4d --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/op_reg_gen_test.py @@ -0,0 +1,81 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for `op_reg_gen` module.""" + +# pylint: disable=missing-function-docstring +# pylint: disable=invalid-name +# pylint: disable=g-direct-tensorflow-import + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +from tensorflow.compiler.mlir.python.mlir_wrapper import filecheck_wrapper as fw +from tensorflow.compiler.mlir.tfr.python import composite +from tensorflow.compiler.mlir.tfr.python.op_reg_gen import gen_register_op +from tensorflow.python.platform import test + + +Composite = composite.Composite + + +@composite.Composite( + 'TestNoOp', derived_attrs=['T: numbertype'], outputs=['o1: T']) +def _composite_no_op(): + pass + + +@Composite( + 'TestCompositeOp', + inputs=['x: T', 'y: T'], + attrs=['act: {"", "relu"}', 'trans: bool = true'], + derived_attrs=['T: numbertype'], + outputs=['o1: T', 'o2: T']) +def _composite_op(x, y, act, trans): + return x + act, y + trans + + +class TFRGenTensorTest(test.TestCase): + """MLIR Generation Tests for MLIR TFR Program.""" + + def test_op_reg_gen(self): + cxx_code = gen_register_op(sys.modules[__name__]) + cxx_code_exp = r""" + CHECK-NEXT: #include "third_party/tensorflow/core/framework/op.h" + CHECK-EMPTY + CHECK-LABEL: namespace tensorflow { + CHECK-EMPTY + CHECK-LABEL: REGISTER_OP("TestNoOp") + CHECK-NEXT: .Attr("T: numbertype") + CHECK-NEXT: .Output("o1: T"); + CHECK-EMPTY + CHECK-LABEL: REGISTER_OP("TestCompositeOp") + CHECK-NEXT: .Input("x: T") + CHECK-NEXT: .Input("y: T") + CHECK-NEXT: .Attr("act: {'', 'relu'}") + CHECK-NEXT: .Attr("trans: bool = true") + CHECK-NEXT: .Attr("T: numbertype") + CHECK-NEXT: .Output("o1: T") + CHECK-NEXT: .Output("o2: T"); + CHECK-EMPTY + CHECK-LABEL: } // namespace tensorflow + """ + self.assertTrue(fw.check(str(cxx_code), cxx_code_exp), str(cxx_code)) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/mlir/tfr/python/test_utils.py b/tensorflow/compiler/mlir/tfr/python/test_utils.py new file mode 100644 index 00000000000..62aa3e39105 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/test_utils.py @@ -0,0 +1,48 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test utils for composite op definition.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.eager import backprop +from tensorflow.python.platform import test + + +class OpsDefsTest(test.TestCase): + """Test utils.""" + + def _assertOpAndComposite(self, vars_, compute_op, compute_composite, kwargs, + op_kwargs=None): + if op_kwargs is None: + op_kwargs = kwargs + + # compute with op. + with backprop.GradientTape() as gt: + for var_ in vars_: + gt.watch(var_) + y = compute_op(**op_kwargs) # uses op and decomposites by the graph pass. + grads = gt.gradient(y, vars_) # uses registered gradient function. 
+ + # compute with composition + with backprop.GradientTape() as gt: + for var_ in vars_: + gt.watch(var_) + re_y = compute_composite(**kwargs) # uses composite function. + re_grads = gt.gradient(re_y, vars_) # uses gradients compposite function. + + for v, re_v in zip(y, re_y): + self.assertAllClose(v, re_v) + for g, re_g in zip(grads, re_grads): + self.assertAllClose(g, re_g) diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_gen.py b/tensorflow/compiler/mlir/tfr/python/tfr_gen.py new file mode 100644 index 00000000000..3bf89c7a2d5 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/tfr_gen.py @@ -0,0 +1,1377 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""tfr_gen: Generate mlir tfr decomposition function from python code.""" + +# pylint: disable=invalid-name +# pylint: disable=missing-function-docstring +# pylint: disable=g-direct-tensorflow-import + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import enum +import os +import re +import types +from typing import List, Tuple +import gast as ast + +from tensorflow.compiler.mlir.tfr import tfr_wrapper as tfr +from tensorflow.core.framework import types_pb2 +from tensorflow.python.autograph.converters import control_flow +from tensorflow.python.autograph.converters import return_statements +from tensorflow.python.autograph.impl import api +from tensorflow.python.autograph.pyct import anno +from tensorflow.python.autograph.pyct import cfg +from tensorflow.python.autograph.pyct import qual_names +from tensorflow.python.autograph.pyct import transformer +from tensorflow.python.autograph.pyct import transpiler +from tensorflow.python.autograph.pyct.static_analysis import activity +from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions +from tensorflow.python.autograph.pyct.static_analysis import reaching_fndefs +from tensorflow.python.autograph.pyct.static_analysis import type_inference +from tensorflow.python.framework import load_library +from tensorflow.python.framework import op_def_registry +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import tf_inspect + + +class TFRTypes(enum.Enum): + """All the supported types. + + 1-3: tfr types + 4-99: mlir built-in types + 100-199: TF related translator internal types + 200- : Python related translator internal types + """ + TENSOR = 1 + TENSOR_LIST = 2 + ATTR = 3 + NONE = 4 + SHAPE = 5 # shape -> !shape.shape + I1 = 21 + I32 = 22 + I64 = 23 + F32 = 24 + INDEX = 25 + AG_UNDEFINED_VAL = 100 + AG_BUILTIN_FUNC = 101 + TF_RAW_OP = 102 + TF_REGION = 103 + TF_TENSOR_SHAPE_FUNC = 104 # shape.as_list + TF_TENSOR_SHAPE_LIST = 105 # shape.as_list() + PY_BUILTIN_FUNC = 200 + + # As these are not real types, __getattribute__ helps them appear more like + # actual types (i.e. class definitions). 
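+  # For example, `TFRTypes.TENSOR.shape` resolves to TFRTypes.SHAPE and
+  # `TFRTypes.SHAPE.as_list` resolves to TFRTypes.TF_TENSOR_SHAPE_FUNC, which
+  # lets the type inference follow attribute chains like `x.shape.as_list()`.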
+ def __getattribute__(self, name): + if name == 'shape' and object.__getattribute__(self, 'value') == 1: + return TFRTypes.SHAPE + if name == 'as_list' and object.__getattribute__(self, 'value') == 5: + return TFRTypes.TF_TENSOR_SHAPE_FUNC + return object.__getattribute__(self, name) + + def __str__(self): + if self.value < 4: # pylint: disable=comparison-with-callable + return '!tfr.' + self.name.lower() + elif self.value < 10: # pylint: disable=comparison-with-callable + return '!shape.' + self.name.lower() + else: + return self.name.lower() + + +_attribute_types = [ + TFRTypes.I1, TFRTypes.I32, TFRTypes.I64, TFRTypes.F32, TFRTypes.INDEX, + TFRTypes.ATTR +] + + +def _get_type_from_proto(arg_def=None, attr_def=None): + if not arg_def: + if attr_def.type == 'bool': + return TFRTypes.I1 + elif attr_def.type == 'int32': + return TFRTypes.I32 + elif attr_def.type == 'int' or attr_def.type == 'int64': + return TFRTypes.I64 + elif attr_def.type == 'float': + return TFRTypes.F32 + else: + return TFRTypes.ATTR + + if arg_def.number_attr or arg_def.type_list_attr: + return TFRTypes.TENSOR_LIST + else: + return TFRTypes.TENSOR + + +def _get_type_info_from_proto(arg_def=None, attr_def=None): + attr_type = _get_type_from_proto(arg_def, attr_def) + if not arg_def: + return '{}{{tfr.name="{}"}}'.format(attr_type, attr_def.name) + else: + attr_names = [] + if arg_def.number_attr: + attr_names.append(arg_def.number_attr) + if arg_def.type_attr: + attr_names.append(arg_def.type_attr) + if arg_def.type_list_attr: + attr_names.append(arg_def.type_list_attr) + + # TODO(fengliuai): currently we don't support backward type inference, so we + # have to store these non-derivable type in the signatures, and then they + # can be used to cast the values when raising to tf ops. + if arg_def.type == types_pb2.DT_FLOAT: + attr_names.append('f32_') + elif arg_def.type == types_pb2.DT_INT32: + attr_names.append('i32_') + elif arg_def.type == types_pb2.DT_INT64: + attr_names.append('i64_') + elif arg_def.type == types_pb2.DT_BOOL: + attr_names.append('i1_') + + if not attr_names: + return str(attr_type) + else: + return '{}<{}>'.format(attr_type, ','.join(attr_names)) + + +def _get_val_from_proto(attr_type, attr_val): + if attr_type == TFRTypes.I1: + return 'true' if attr_val.b else 'false' + elif attr_type == TFRTypes.I32 or attr_type == TFRTypes.I64: + return attr_val.i + elif attr_type == TFRTypes.F32: + return attr_val.f + elif attr_type == TFRTypes.ATTR: + # string + if attr_val.HasField('s'): + return '"{}"'.format(attr_val.s.decode()) + # type + if attr_val.HasField('type'): + if attr_val.type == types_pb2.DT_FLOAT: + return 'f32' + elif attr_val.type == types_pb2.DT_INT32: + return 'i32' + elif attr_val.type == types_pb2.DT_INT64: + return 'i64' + elif attr_val.type == types_pb2.DT_BOOL: + return 'i1' + # list + if attr_val.HasField('list'): + if attr_val.list.f: + elt_ty = TFRTypes.F32 + values = attr_val.list.f + elif attr_val.list.i: + elt_ty = TFRTypes.I64 + values = attr_val.list.i + else: + elt_ty = TFRTypes.NONE + values = [] + array_attr_elts = ['{}:{}'.format(val, elt_ty) for val in values] + return '[{}]'.format(','.join(array_attr_elts)) + raise NotImplementedError( + 'Proto AttrValue not recoganized. 
type: {}, value: {}'.format( + attr_type, attr_val)) + + +def _collect_derived_attrs_from_proto(op_def): + derived_attrs = set() + for arg in op_def.input_arg: + if arg.type_attr: + derived_attrs.add(arg.type_attr) + if arg.number_attr: + derived_attrs.add(arg.number_attr) + if arg.type_list_attr: + derived_attrs.add(arg.type_list_attr) + + # TODO(fengliuai): currently we don't support backward type inference, so we + # have to store these non-derivable type in the signatures, and then they + # can be used to cast the values when raising to tf ops. + if arg.type == types_pb2.DT_FLOAT: + derived_attrs.add('f32_') + elif arg.type == types_pb2.DT_INT32: + derived_attrs.add('i32_') + elif arg.type == types_pb2.DT_INT64: + derived_attrs.add('i64_') + elif arg.type == types_pb2.DT_BOOL: + derived_attrs.add('i1_') + return derived_attrs + + +def _require_tensor_list(arg_def): + return arg_def.type_list_attr or arg_def.number_attr + + +def _camel_to_snake(name): + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +class OpDefCache(object): + """A Dict to cache the OpDef for the Python function name.""" + + def __init__(self): + self._op_defs = {} + + def lookup(self, f_name, func_def=None, optional=False): + if f_name in self._op_defs.keys(): + return self._op_defs[f_name] + + if isinstance(func_def, types.FunctionType): + if not hasattr(func_def, '_tfr_op_name'): + # skip a non-composition function + if optional: + return (None, None) + else: + raise KeyError('OpDef does not exist: ' + f_name) + op_name = getattr(func_def, '_tfr_op_name') + elif not func_def: + op_name = f_name + else: + # TODO(fengliuai): create one utility method to match different apis. + compose_dec = [] + for dec in func_def.decorator_list: + if isinstance(dec, ast.Call): + if isinstance(dec.func, + ast.Attribute) and dec.func.attr == 'Composite': + compose_dec.append(dec) + if isinstance(dec.func, ast.Name) and dec.func.id == 'Composite': + compose_dec.append(dec) + + if not compose_dec: + # skip a non-composition function + if optional: + return (None, None) + else: + raise KeyError('OpDef does not exist: ' + f_name) + elif len(compose_dec) > 1: + raise KeyError('More than one TF ops decomposes for.') + else: + op_name = compose_dec[0].args[0].value + + op_def = op_def_registry.get(op_name) + if not op_def: + raise ValueError('Not a registered op: ' + op_name) + derived_attrs = _collect_derived_attrs_from_proto(op_def) + self._op_defs[f_name] = (op_def, derived_attrs) + return (op_def, derived_attrs) + + def mlir_external_funcs(self): + tfr_funcs = [] + for _, (op_def, derived_attrs) in sorted(self._op_defs.items()): + tfr_func = '\ntfr.func @tf__{}_('.format(_camel_to_snake(op_def.name)) + + # tensor inputs + inputs = [ + _get_type_info_from_proto(arg_def) for arg_def in op_def.input_arg + ] + + # attribute inputs. The attribute with default values are moved backwards. 
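+      # Illustrative sketch (hypothetical op): one tensor input typed by the
+      # derived attr T, a required attr N, and an attr act with a default
+      # value produce a signature roughly of the form
+      #   tfr.func @tf__my_op_(!tfr.tensor<T>, i64{tfr.name="N"},
+      #       !tfr.attr{tfr.name="act"}) -> (!tfr.tensor<T>) attributes {N,T,act}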
+ non_derived_attrs = [ + attr for attr in op_def.attr if attr.name not in derived_attrs + ] + attrs_no_default = [ + attr for attr in non_derived_attrs + if not attr.HasField('default_value') + ] + attrs_with_default = [ + attr for attr in non_derived_attrs if attr.HasField('default_value') + ] + attr_names = set() + for attr_def in attrs_no_default + attrs_with_default: + inputs.append(_get_type_info_from_proto(None, attr_def)) + attr_names.add(attr_def.name) + + # tensor outputs + outputs = [ + _get_type_info_from_proto(arg_def) for arg_def in op_def.output_arg + ] + + inputs = ','.join(inputs) + outputs = ','.join(outputs) + attrs = ','.join(sorted(derived_attrs.union(attr_names))) + tfr_funcs.append('{}{}) -> ({}) attributes {{{}}}'.format( + tfr_func, inputs, outputs, attrs)) + return tfr_funcs + + +_PY_TYPE_TO_TFR = { + bool: TFRTypes.I1, + int: TFRTypes.I64, + float: TFRTypes.F32, +} + +_AG_FIXED_RETURN_TYPE = { + 'for_stmt': type(None), + 'if_stmt': type(None), + 'Undefined': TFRTypes.AG_UNDEFINED_VAL, +} + +QN = qual_names.QN + +# TODO(mdan): Fix this with an importable module. +AG_MODULE = api._TRANSPILER._extra_locals['ag__'] # pylint:disable=protected-access + + +class TFRTypeResolver(type_inference.Resolver): + """Resolve types for the external names, calls and arguments.""" + + def __init__(self, op_defs): + super(TFRTypeResolver, self).__init__() + self._op_defs = op_defs + + # This pattern matching mechanism works with the functional form generated + # by autograph: + # + # for i in data: + # print(i) + # + # generates: + # + # def loop_body(itr): + # i = itr + # print(i) + # ag__.for_stmt(target) + # + # The mechanism lets us infer the type of the itr argument based on that of + # target. + self._for_loop_target_types = {} # Maps body function name to iterated. + self._for_loop_body_fns = {} # Used only to avoid collisions. + + def res_name(self, ns, types_ns, name): + name_str = str(name) + if name_str in ns: + ns_val = ns[name_str] + return {type(ns_val)}, ns_val + if name_str in __builtins__: + return {TFRTypes.PY_BUILTIN_FUNC}, __builtins__[name_str] + # This name is not in the namespace because the autograph transformation + # is not backloaded into Python. + if name_str == 'ag__': + return {type(AG_MODULE)}, AG_MODULE + + return None, None + + def res_value(self, ns, value): + if value is None: + return {TFRTypes.NONE} + if value in (TFRTypes.SHAPE, TFRTypes.TF_TENSOR_SHAPE_FUNC): + # See TFRTypes.__getattrbute__. + # TODO(mdan): Replacing the enum with classes would avoid this overlap. + return {value} + # TODO(mdan): Index more efficiently. Could do a name check instead. + if any(v is value for v in AG_MODULE.__dict__.values()): + return {TFRTypes.AG_BUILTIN_FUNC} + if getattr(value, '__name__', None) == 'tensorflow.raw_ops': + return {types.ModuleType} + if hasattr(value, '__module__'): + # All the imported operations, which are not autograph built-ins, are + # considered to be TF raw ops. + # TODO(fengliuai): refine the condition so we only matche tensorflow + # ops here. + return {TFRTypes.TF_RAW_OP} + # TODO(mdan): Is ATTR equivalent to string? + return {_PY_TYPE_TO_TFR.get(type(value), TFRTypes.ATTR)} + + def res_call(self, ns, types_ns, node, f_type, args, keywords): + name = anno.Basic.QN.of(node.func) + if f_type == (TFRTypes.AG_BUILTIN_FUNC,): + + if name == QN(QN('ag__'), attr='if_stmt'): + nouts = node.args[6].value + # TODO(mdan): Look at the actual types out of if_body. 
+ side_effects = { + qual_names.QN(n.value): {TFRTypes.TENSOR} + for n in node.args[5].elts[:nouts] + } + return {type(None)}, side_effects + + if name == QN(QN('ag__'), attr='for_stmt'): + assert isinstance(node.args[2], ast.Name) + body_fn_name = str(anno.Basic.QN.of(node.args[2])) + assert body_fn_name not in self._for_loop_body_fns, ( + 'Previously used here: {}. Are you reusing the Resolver across ' + 'transformations?').format(self._for_loop_body_fns[body_fn_name]) + self._for_loop_body_fns[body_fn_name] = anno.Basic.ORIGIN.of(node) + + iterated_type = args[0] + assert iterated_type & { + TFRTypes.TENSOR_LIST, TFRTypes.TENSOR, List[int] + }, ( + iterated_type) + self._for_loop_target_types[body_fn_name] = iterated_type + + return {type(None)}, None + + # TODO(mdan): Actually resolve the type here instead. + ret_type = _AG_FIXED_RETURN_TYPE.get(name.qn[1], None) + if ret_type is not None: + return {ret_type}, None + raise NotImplementedError('return type of {}'.format(name)) + + elif f_type == (TFRTypes.TF_RAW_OP,): + op_name = name.qn[1] + op_def, _ = self._op_defs.lookup(op_name) + if len(op_def.output_arg) == 1: + return {_get_type_from_proto(op_def.output_arg[0])}, None + return ({tuple(_get_type_from_proto(arg) for arg in op_def.output_arg)}, + None) + + elif f_type == (TFRTypes.PY_BUILTIN_FUNC,): + assert name.is_simple() + if name == QN('range'): + return {List[int]}, None + + if name == QN('len'): + return {TFRTypes.INDEX}, None + + elif f_type == (TFRTypes.TF_TENSOR_SHAPE_FUNC,): + return {TFRTypes.TF_TENSOR_SHAPE_LIST}, None + + raise NotImplementedError('Function:', name, f_type) + + def res_arg(self, ns, types_ns, f_name, name, type_anno, f_is_local): + if f_is_local: + f_name_str = str(f_name) + if f_name_str in self._for_loop_target_types: + # See autograph/converters/control_flow.py - the function has a single + # argument, the iterate before any expansion. + assert self._for_loop_target_types[f_name_str] & {List[int]} + # Assume all loops are TF loops. Then the iterates are autoboxed into + # Tensors. + return {TFRTypes.INDEX} + else: + return None + + func = ns[f_name] + + op_def, derived_attrs = self._op_defs.lookup(f_name, func) + if op_def is None: + return None + pos = tf_inspect.getfullargspec(func).args.index(str(name)) + + if pos < len(op_def.input_arg): + arg_def = op_def.input_arg[pos] + return {_get_type_from_proto(arg_def)} + elif pos < len(op_def.input_arg) + len(op_def.attr) - len(derived_attrs): + non_derived_attr_pos = pos - len(op_def.input_arg) + for attr_def in op_def.attr: + # derived attribute, skip this one and continue to the next one. + if attr_def.name in derived_attrs: + continue + if non_derived_attr_pos == 0: + return {_get_type_from_proto(None, attr_def)} + non_derived_attr_pos -= 1 + + raise ValueError('Argument is not defined in OpDef: ' + str(name)) + + def res_subscript(self, ns, types_ns, node_or_slice, value, slice_): + assert len(value) == 1 + value, = tuple(value) + if value == TFRTypes.TF_TENSOR_SHAPE_LIST: + # TODO(mdan): This is not entirely correct for multi-element slices. + return {int} + elif value in (TFRTypes.TENSOR_LIST, TFRTypes.TENSOR): + # TODO(mdan): This is not entirely correct for multi-element slices. 
+ return {TFRTypes.TENSOR} + raise NotImplementedError('slice of {}'.format(value)) + + def res_compare(self, ns, types_ns, node, left, right): + # TODO(fengliuai): make sure left and right are compatible + return {TFRTypes.I1} + + def res_binop(self, ns, types_ns, node, left, right): + # TODO(fengliuai): make sure left and right are compatible + return left + + +class SymbolTable(object): + """Symbol Table for python code.""" + + def __init__(self): + self.symbols = [] + self.enter_scope() + self.scf_scope = 0 + # reserved key words + self.insert_symbol('len', 'len', TFRTypes.PY_BUILTIN_FUNC) + + def enter_scope(self, scf_scope=False): + """Enter a new scope - at function level.""" + self.symbols.append({'types': {}, 'symbols': {}}) + self.curr_table = self.symbols[len(self.symbols) - 1] + if scf_scope: + self.scf_scope += 1 + + def insert_symbol(self, name, value, type_): + self.curr_table['symbols'][name] = (value, type_) + # TODO(mdan): Use the inferred type rather than tracking it here. + # The following field is decrepcated. + self.curr_table['types'][name] = type_ + return value + + def exit_scope(self): + self.symbols.pop() + self.curr_table = self.symbols[len(self.symbols) - 1] + if self.scf_scope > 0: + self.scf_scope -= 1 + + def in_scf_scope(self): + return self.scf_scope > 0 + + def lookup(self, name): + curr_idx = len(self.symbols) - 1 + while curr_idx >= 0 and (name not in self.symbols[curr_idx]['symbols']): + curr_idx -= 1 + if curr_idx < 0: + return None + return self.symbols[curr_idx]['symbols'][name] + + +class TFRGen(transformer.CodeGenerator): + """Visit the AST and generate MLIR TFR functions.""" + + def __init__(self, ctx, op_defs): + super(TFRGen, self).__init__(ctx) + self.ctx = ctx + self.symbol_table = SymbolTable() + self._op_defs = op_defs + + def _create_mlir_loc(self, loc): + """Creates mlir location from autograph ORIGIN value. + + Args: + loc: OriginInfo + + Returns: + A serialized mlir location string. + """ + if loc is not None and loc.loc.filename: + file_name = os.path.basename(loc.loc.filename) + return 'loc("{}":{}:{})'.format(file_name, loc.loc.lineno, + loc.loc.col_offset) + else: + return 'loc(unknown)' + + def _emit_with_loc(self, op_str, node=None): + """Emit the mlir operation with the location associated with the node. + + Args: + op_str: The mlir operation string to be emitted. + node: The node of the AST tree, the mlir operation translated from. + """ + loc = '' + if node: + loc = self._create_mlir_loc( + anno.getanno(node, anno.Basic.ORIGIN, default=None)) + self.emit(op_str + ' ' + loc) + + def _get_inferred_type(self, node, default=None): + types_ = anno.getanno(node, anno.Static.TYPES, None) + if not types_: + print('WARN: no Static.TYPES annotation. Fix the type inference pass: ') + self.debug_print(node) + return default + if types_ and len(types_) > 1: + raise ValueError('ambiguous inferred type for "{}": {}'.format( + node, types_)) + + type_, = types_ + # TODO(fengliuai): Tuple is added here to make return tuple work. + if type_ is list or type_ is Tuple: + # TODO(fengliuai): Seems like we need to move the followed list handling + # to the type inference and we shouldn't just put 'list' there. Otherwise + # we couldn't find out the right type for the Name node. 
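+      # Illustrative examples: a list such as `[x, x]` whose elements are
+      # tensors is resolved to a tfr.tensor_list below, while a literal list
+      # like `[0, 2, -1]` resolves to an attribute.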
+ if not isinstance(node, ast.List): + return default + all_types = [ + anno.getanno(elt, anno.Static.TYPES, None) for elt in node.elts + ] + if (TFRTypes.TENSOR,) in all_types: + # For the elt which is not tfr.tensor, tfr.constant_tensor needs to be + # use to cast it to a tfr.tensor. + return TFRTypes.TENSOR_LIST + else: + return TFRTypes.ATTR + + if default is not None and type_ != default: + print('WARN: type annotation {}({}) does not match {}({})'.format( + type_, type(type_), default, type(default))) + self.debug_print(node) + + return type_ + + def _pack_tensor_list(self, value): + # This is packing a list of tensors, then the axis is 0. + axis = self._ssa_name('zero') + self._emit_with_loc('\n{} = constant 0 : i64'.format(axis)) + casted = self._ssa_name('pack') + self.emit('\n{} = tfr.call @tf__pack({}, {})'.format(casted, value, axis)) + self._emit_with_loc(' : (!tfr.tensor_list, i64) -> !tfr.tensor') + # load the op def of tf.Pack + self._op_defs.lookup('Pack') + return casted, TFRTypes.TENSOR + + def _index_to_I64(self, value, ty): + if ty == TFRTypes.INDEX: + casted = self._ssa_name('casted') + self._emit_with_loc('\n{} = index_cast {} : index to i64'.format( + casted, value)) + return casted, TFRTypes.I64 + else: + return value, ty + + def _value_to_tensor(self, value, ty, node): + value, ty = self._index_to_I64(value, ty) + cst_tensor = self._ssa_name('cst') + self.emit('\n{} = "tfr.constant_tensor"({})'.format(cst_tensor, value)) + self._emit_with_loc(' : ({}) -> !tfr.tensor'.format(ty), node) + return cst_tensor, TFRTypes.TENSOR + + def _ssa_name(self, prefix): + if isinstance(prefix, qual_names.QN): + assert prefix.is_simple(), 'ANF transform should have cleaned this up' + prefix = prefix.ssf() + return '%' + self.ctx.namer.new_symbol(prefix, set()) + + def _op_def(self, op_name): + return op_def_registry.get(op_name) + + def visit_block(self, block): + return [self.visit(item) for item in block] + + def visit_Pass(self, node): + if self.symbol_table.in_scf_scope(): + self._emit_with_loc('\nscf.yield', node) + else: + self._emit_with_loc('\ntfr.return', node) + + def visit_Attribute(self, node): + node_type = self._get_inferred_type(node, None) + if isinstance(node.value, ast.Name): + if node.value.id == 'ag__': + # some variables are assigned with 'ag__.xxx' method, we should handle + # them following the autograph convensions. 
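+        # (e.g. `ag__.if_stmt`, `ag__.for_stmt` and `ag__.Undefined`; the
+        # actual dispatch on these names happens in visit_Call below.)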
+ return (node.attr, TFRTypes.AG_BUILTIN_FUNC) + + if node_type == TFRTypes.TF_RAW_OP: + # This branch is used when it is inside tensorflow + return (node.attr, TFRTypes.TF_RAW_OP) + + value, _ = self.visit(node.value) + tensor_type = self._get_inferred_type(node.value, None) + # TODO(fengliuai): use node_type once it + if node_type == TFRTypes.SHAPE: + print('TODO: use "node_type"') + if node.attr == 'shape' and tensor_type == TFRTypes.TENSOR: + ssa_value = self._ssa_name('shape') + self._emit_with_loc( + '\n{} = tfr.get_shape {} -> !shape.shape'.format(ssa_value, value), + node) + return (ssa_value, TFRTypes.SHAPE) + + if isinstance(node.value, ast.Attribute): + if isinstance(node.value.value, ast.Name): + if node.value.value.id == 'tf' and node.value.attr == 'raw_ops': + # This branch is used when it is outside tensorflow + return (node.attr, TFRTypes.TF_RAW_OP) + + value, ty = self.visit(node.value) + # TODO(fengliuai): use node_type once it + if node_type == TFRTypes.TF_TENSOR_SHAPE_FUNC: + print('TODO: use "node_type"') + if ty == TFRTypes.SHAPE and node.attr == 'as_list': + return (value, TFRTypes.TF_TENSOR_SHAPE_FUNC) + + raise NotImplementedError('Attribute kind not recoganized.') + + def visit_Assign(self, node): + values = self.visit(node.value) + if isinstance(node.targets[0], ast.Tuple): + targets = [elt.id for elt in node.targets[0].elts] + elif isinstance(node.targets[0], ast.Name): + targets = [node.targets[0].id] + else: + raise NotImplementedError('Assignment target type not recoganized.') + + if isinstance(values, list): + if len(targets) == len(values): + for key, value in zip(targets, values): + ssa_value, ty_ = value + ty = self._get_inferred_type(node.value, ty_) + self.symbol_table.insert_symbol(key, ssa_value, ty) + elif len(values) == 1: + n, ty = values[0] + assert ty == TFRTypes.TENSOR_LIST + # assign a tensor_list to multiple variables + for idx, key in enumerate(targets): + idx_name = self._ssa_name('idx') + self._emit_with_loc( + '\n{} = constant {} : index'.format(idx_name, idx), node) + elt_name = self._ssa_name('elt') + self.emit('\n{} = tfr.get_element {}[{}]'.format( + elt_name, n, idx_name)) + self._emit_with_loc(' : (!tfr.tensor_list, index) -> !tfr.tensor', + node) + self.symbol_table.insert_symbol(key, elt_name, TFRTypes.TENSOR) + elif len(targets) == 1: + ssa_names = [n for n, _ in values] + tys = [t for _, t in values] + self.symbol_table.insert_symbol(targets[0], ssa_names, tys) + else: + self.symbol_table.insert_symbol(targets[0], values[0], values[1]) + + def _emit_binary_op(self, op, lhs, lhs_ty, rhs, rhs_ty): + assert lhs_ty, rhs_ty + if isinstance(op, ast.Sub): + code = 'sub' + elif isinstance(op, ast.Add): + code = 'add' + else: + raise NotImplementedError('BinOp operator not recognized' + op) + + if lhs_ty == TFRTypes.I64: + suffix = 'i' + elif lhs_ty == TFRTypes.F32: + suffix = 'f' + else: + raise NotImplementedError('BinOp operand type not recognized' + op) + + ret = self._ssa_name(code) + self._emit_with_loc( + '\n{} = {}{} {}, {} : {}'.format(ret, code, suffix, lhs, rhs, lhs_ty), + op) + return ret, lhs_ty + + def visit_AugAssign(self, node): + lhs, lhs_ty = self.visit(node.target) + rhs, rhs_ty = self.visit(node.value) + ret, ret_ty = self._emit_binary_op(node.op, lhs, lhs_ty, rhs, rhs_ty) + self.symbol_table.insert_symbol(node.target.id, ret, ret_ty) + + def visit_BinOp(self, node): + lhs, lhs_ty = self.visit(node.left) + rhs, rhs_ty = self.visit(node.right) + return self._emit_binary_op(node.op, lhs, lhs_ty, rhs, rhs_ty) + + def 
visit_BoolOp(self, node): + values = [self.visit(value) for value in node.values] + # TODO(fengliuai): Handle more ast node types. + if isinstance(node.op, ast.Or): + raise NotImplementedError('Or operator not recognized') + elif isinstance(node.op, ast.And): + raise NotImplementedError('And operator not recognized') + + def visit_Call(self, node): + func_name, func_type = self.visit(node.func) + _ = self._get_inferred_type(node.func, func_type) + if func_type == TFRTypes.AG_BUILTIN_FUNC: + if func_name == 'if_stmt': + cond, _ = self.visit(node.args[0]) + body, _ = self.visit(node.args[1]) + orelse, _ = self.visit(node.args[2]) + get_state, _ = self.visit(node.args[3]) + nouts = int(node.args[6].value) + out_symbols = [] + # The out symbols are just a Tuple of names + for out in node.args[5].elts[:nouts]: + val, ty = self.symbol_table.lookup(out.value) + if ty != TFRTypes.AG_UNDEFINED_VAL: + raise ValueError('if stmt out symbol is not defined.') + out_symbols.append(out.value) + return self._visit_if_stmt(cond, body, orelse, get_state, out_symbols, + node) + elif func_name == 'for_stmt': + range_ = self._visit_iter(node.args[0]) + body, _ = self.visit(node.args[2]) + get_state, _ = self.visit(node.args[3]) + loop_carried = [out.value for out in node.args[5].elts] + # TODO(fengliuai): opt is not used here. + return self._visit_for_stmt(range_, body, get_state, loop_carried, node) + elif func_name == 'Undefined': + val = self._ssa_name(node.args[0].value) + return (val, TFRTypes.AG_UNDEFINED_VAL) + elif func_name == 'UndefinedReturnValue': + val = self._ssa_name('return_val') + return (val, TFRTypes.AG_UNDEFINED_VAL) + + if func_type == TFRTypes.TF_RAW_OP: + return self._visit_tf_op(func_name, node.args, node.keywords, node) + + if func_type == TFRTypes.TF_TENSOR_SHAPE_FUNC: + return (func_name, TFRTypes.TF_TENSOR_SHAPE_LIST) + + if func_type == TFRTypes.PY_BUILTIN_FUNC: + if func_name == 'len': + arg, ty = self.visit(node.args[0]) + ty = self._get_inferred_type(node.args[0], ty) + assert ty == TFRTypes.TF_TENSOR_SHAPE_LIST, ty + len_value = self._ssa_name('len') + self._emit_with_loc( + '\n{} = shape.rank {} : !shape.shape -> !shape.size'.format( + len_value, arg), node) + size_value = self._ssa_name('len_size') + self._emit_with_loc( + '\n{} = shape.size_to_index {} : !shape.size'.format( + size_value, len_value), node) + return (size_value, TFRTypes.INDEX) + + raise NotImplementedError('call operator not recognized: {} {}'.format( + func_name, func_type)) + + def visit_Compare(self, node): + lhs, lhs_ty = self.visit(node.left) + for op, right in zip(node.ops, node.comparators): + rhs, _ = self.visit(right) + if isinstance(op, ast.Eq): + pred = 'eq' + elif isinstance(op, ast.Lt): + pred = 'ult' + elif isinstance(op, ast.LtE): + pred = 'ule' + elif isinstance(op, ast.Gt): + pred = 'ugt' + elif isinstance(op, ast.GtE): + pred = 'uge' + elif isinstance(op, ast.NotEq): + pred = 'ne' + else: + raise NotImplementedError('Compare operator not recognized') + + ret = self._ssa_name(pred) + if lhs_ty == TFRTypes.ATTR: + self._emit_with_loc( + '\n{} = tfr.equal {}, {} -> i1'.format(ret, lhs, rhs), node) + else: + if lhs_ty == TFRTypes.I64: + code = 'cmpi' + elif lhs_ty == TFRTypes.F32: + code = 'cmpf' + else: + raise NotImplementedError('Compare operand type not recognized') + self._emit_with_loc( + '\n{} = {} "{}", {}, {} : {}'.format(ret, code, pred, lhs, rhs, + lhs_ty), node) + + return ret, TFRTypes.I1 + + def visit_Constant(self, node): + cst_name = self._ssa_name('cst') + if node.value is None: 
+ cst_ty = TFRTypes.NONE + elif isinstance(node.value, bool): + cst_ty = self._get_inferred_type(node) + cst_val = str(node.value).lower() + self._emit_with_loc('\n{} = constant {}'.format(cst_name, cst_val), node) + else: + cst_ty = self._get_inferred_type(node) + cst_val = node.value + if cst_ty == TFRTypes.ATTR: + self._emit_with_loc( + '\n{} = tfr.constant "{}" -> {}'.format(cst_name, cst_val, cst_ty), + node) + else: + self._emit_with_loc( + '\n{} = constant {} : {}'.format(cst_name, cst_val, cst_ty), node) + return cst_name, cst_ty + + def visit_FunctionDef(self, node): + op_def, derived_attrs = self._op_defs.lookup(node.name, node, True) + if op_def is None: + # Nested function. Insert it to symbol table for looking up later. + self.symbol_table.insert_symbol(node.name, node, None) + return + op_name = op_def.name + if self.symbol_table.lookup(op_name): + raise LookupError('Composition has not been registered for op: ' + + op_name) + else: + self.symbol_table.insert_symbol(node.name, None, None) + + self.symbol_table.enter_scope() + self.emit('\ntfr.func @tf__{0}('.format(_camel_to_snake(op_name))) + + arg_list = [] + idx = 0 + max_idx = len(op_def.input_arg) + len(op_def.attr) + for arg in node.args.args: + arg_name = self._ssa_name(anno.getanno(arg, anno.Basic.QN)) + arg_type = anno.getanno(arg, anno.Static.TYPES)[0] + + arg_attr = '' + if idx >= len(op_def.input_arg): + attr_def = op_def.attr[idx - len(op_def.input_arg)] + # skip the derived attributes + while attr_def.name in derived_attrs and (idx + 1) < max_idx: + idx += 1 + attr_def = op_def.attr[idx - len(op_def.input_arg)] + if idx >= max_idx: + raise ValueError('Argument is not defined in OpDef: ' + arg_name) + + arg_attr += '{{tfr.name="{}"'.format(attr_def.name) + if attr_def.HasField('default_value'): + default_val = _get_val_from_proto(arg_type, attr_def.default_value) + arg_attr += ',tfr.default={}'.format(default_val) + arg_attr += '}' + + idx += 1 + arg_str = '{}: {}{}'.format(arg_name, arg_type, arg_attr) + arg_list.append(arg_str) + self.symbol_table.insert_symbol(arg.id, arg_name, arg_type) + + ret_type_list = [] + for ret_def in op_def.output_arg: + if ret_def.number_attr or ret_def.type_list_attr: + ret_type_list.append(str(TFRTypes.TENSOR_LIST)) + else: + ret_type_list.append(str(TFRTypes.TENSOR)) + + self.emit('{}) -> ({}) {{'.format(', '.join(arg_list), + ', '.join(ret_type_list))) + self.visit_block(node.body) + self._emit_with_loc('\n}', node) + self.symbol_table.exit_scope() + + def visit_arguments(self, node): + # TODO(fengliuai): return ordered the types and names. + # We need to order the arguments to match the assumption in the TFR dialect. + raise NotImplementedError('arguments not supported.') + + def visit_Lambda(self, node): + raise NotImplementedError('Lambda not supported.') + + def _get_mlir_ssa_values(self, name_prefix, out_types): + """Create MLIR convention SSA values.""" + out_ssa_values = [] + if not out_types: + return '', out_ssa_values + + out_name = self._ssa_name(name_prefix) + if len(out_types) == 1: + out_name_suffix = '' + out_ssa_values.append(out_name) + else: + # For multiple returns, MLIR uses '%s:i' when they are defined and + # '%s#i' when they are used. 
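+      # For example (illustrative names): a two-result op is printed as
+      # `%Split:2 = ...` where it is defined and referenced as `%Split#0`
+      # and `%Split#1` afterwards.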
+ out_name_suffix = ':{}'.format(len(out_types)) + for idx, _ in enumerate(out_types): + out_ssa_values.append('{}#{}'.format(out_name, idx)) + + return '{}{}'.format(out_name, out_name_suffix), out_ssa_values + + def _visit_if_stmt(self, cond, body_def, orelse_def, get_state, out_symbols, + node): + self.emit('\n') + ret_str, ret_ssa_values = self._get_mlir_ssa_values( + 'if_stmt', [TFRTypes.TENSOR] * len(out_symbols)) + if ret_ssa_values: + self.emit(ret_str + ' = ') + + # add ssa values to the symbol table + out_types = [] + for symbol, ssa_value in zip(out_symbols, ret_ssa_values): + self.symbol_table.insert_symbol(symbol, ssa_value, TFRTypes.TENSOR) + out_types.append(str(TFRTypes.TENSOR)) + + self.emit('scf.if {} -> ({}) {{'.format(cond, ', '.join(out_types))) + # Create a new scope in case the local variables are leaked. + self.symbol_table.enter_scope(scf_scope=True) + self.visit_block(body_def.body) + self.visit_block(get_state.body) + self.symbol_table.exit_scope() + + self.emit('\n} else {') + + # Create a new scope in case the local variables are leaked. + self.symbol_table.enter_scope(scf_scope=True) + self.visit_block(orelse_def.body) + self.visit_block(get_state.body) + self.symbol_table.exit_scope() + + self._emit_with_loc('\n}', node) + return list(zip(ret_ssa_values, out_types)) + + def _visit_iter(self, node): + if isinstance(node, ast.Call): + f_name = anno.getanno(node.func, anno.Basic.QN) + if f_name == QN('range'): + args = [self.visit(arg) for arg in node.args] + begin = None + step = None + end = None + if len(args) == 1: + end, end_ty = args[0] + elif len(args) == 2: + begin, begin_ty = args[0] + end, end_ty = args[1] + elif len(args) == 3: + begin, begin_ty = args[0] + end, end_ty = args[1] + step, step_ty = args[2] + + if begin is None: + begin = self._ssa_name('begin') + self._emit_with_loc('\n{} = constant 0 : index'.format(begin), node) + elif begin_ty != TFRTypes.INDEX: + begin_ = self._ssa_name('begin') + self._emit_with_loc( + '\n{} = index_cast {} : {} to index'.format( + begin_, begin, begin_ty), node) + begin = begin_ + + if end_ty != TFRTypes.INDEX: + end_ = self._ssa_name('end') + self._emit_with_loc( + '\n{} = index_cast {} : {} to index'.format(end_, end, end_ty), + node) + end = end_ + + if step is None: + step = self._ssa_name('step') + self._emit_with_loc('\n{} = constant 1 : index'.format(step), node) + elif step_ty != TFRTypes.INDEX: + step_ = self._ssa_name('step') + self._emit_with_loc( + '\n{} = index_cast {} : {} to index'.format(step_, step, step_ty), + node) + step = step_ + + return begin, end, step + + raise NotImplementedError('Iterator entity not supported.' + node) + + def _visit_for_stmt(self, range_, body_def, get_state, loop_carried, node): + self.emit('\n') + ret_str, ret_ssa_values = self._get_mlir_ssa_values( + 'for_stmt', [TFRTypes.TENSOR] * len(loop_carried)) + if ret_ssa_values: + self.emit(ret_str + ' = ') + + # Before enter the loop, we use the original ssa values as the initial + # values to the loop iteration arguments. We also create new ssa values as + # the returns of the scf for statements. The symbol table needs to be + # updated to these new ssa values before it enters the scope of the loop. + out_types = [] + init_values = [] + for symbol, ssa_value in zip(loop_carried, ret_ssa_values): + init, ty = self.symbol_table.lookup(symbol) + self.symbol_table.insert_symbol(symbol, ssa_value, ty) + out_types.append(str(ty)) + init_values.append((init, ty)) + + # Create a new scope in case the local variables are leaked. 
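+    # The loop emitted below has roughly this shape (illustrative SSA names):
+    #   %for_stmt = scf.for %i = %begin to %end step %step
+    #       iter_args(%it_arg = %init) -> (!tfr.tensor) {
+    #     ...
+    #     scf.yield %result : !tfr.tensor
+    #   }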
+ self.symbol_table.enter_scope(scf_scope=True) + + # Create the iteration variable with index type + assert len(body_def.args.args) == 1 + it_name = body_def.args.args[0].id + it = self._ssa_name(it_name) + self.symbol_table.insert_symbol(it_name, it, TFRTypes.INDEX) + + self.emit('scf.for {} = {} to {} step {} '.format(it, range_[0], range_[1], + range_[2])) + if loop_carried: + iter_args = [] + for symbol, init in zip(loop_carried, init_values): + # create new ssa values for the loop carried variables + it_arg = self._ssa_name('it_arg') + self.symbol_table.insert_symbol(symbol, it_arg, init[1]) + iter_args.append('{} = {}'.format(it_arg, init[0])) + self.emit('iter_args({}) '.format(', '.join(iter_args))) + self.emit('-> ({}) {{'.format(', '.join(out_types))) + else: + self.emit(' {') + self.visit_block(body_def.body) + self.visit_block(get_state.body) + self.symbol_table.exit_scope() + self._emit_with_loc('\n}', node) + return list(zip(ret_ssa_values, out_types)) + + def _emit_default_constant_from_proto(self, attr_def): + """emit mlir constant statement from default value of the ArgDef proto.""" + name = self._ssa_name('cst') + cst_ty = _get_type_from_proto(None, attr_def) + cst_val = _get_val_from_proto(cst_ty, attr_def.default_value) + if cst_ty == TFRTypes.ATTR: + self._emit_with_loc('\n{} = tfr.constant {} -> {}'.format( + name, cst_val, cst_ty)) + elif cst_ty == TFRTypes.I1: + self._emit_with_loc('\n{} = constant {}'.format(name, cst_val)) + else: + self._emit_with_loc('\n{} = constant {} : {}'.format( + name, cst_val, cst_ty)) + return name, cst_ty + + def visit_keyword(self, node): + return node.arg, self.visit(node.value) + + def _visit_tf_op(self, op_name, args, keywords, node): + op_def, derived_attrs = self._op_defs.lookup(op_name) + ret_tys = [_get_type_from_proto(arg) for arg in op_def.output_arg] + + ret_str, ret_ssa_values = self._get_mlir_ssa_values(op_name, ret_tys) + + arg_strs = [] + ty_strs = [] + for arg in args: + value, ty = self.visit(arg) + arg_strs.append(value) + ty_strs.append(str(ty)) + + input_args = [arg for arg in op_def.input_arg] + attrs_no_default = [ + attr for attr in op_def.attr + if not attr.HasField('default_value') and attr.name not in derived_attrs + ] + attrs_with_default = [ + attr for attr in op_def.attr + if attr.HasField('default_value') and attr.name not in derived_attrs + ] + + kw_args = {} + for arg in keywords: + value, (ssa_name, ty) = self.visit(arg) + ty = self._get_inferred_type(arg.value, ty) + + # TODO(fengliuai): implement the "rename_to" for the customization in + # tensorflow/core/api_def/base_api/* + if value == 'axis': + value = 'split_dim' + + kw_args[value] = (ssa_name, ty) + + # tensor arguments and attribute arguments + ordered_args = input_args + attrs_no_default + attrs_with_default + for attr_def in ordered_args[len(args):]: + if attr_def.name in kw_args: + value, ty = kw_args[attr_def.name] + if attr_def in input_args: + if ty in _attribute_types: + # the argument shouldn't be used as tf op calls directly. 
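+            # For example, a Python literal such as `axis=0` that maps onto a
+            # tensor input of the op is first wrapped with
+            # "tfr.constant_tensor" below (illustrative; see _value_to_tensor).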
+ value, ty = self._value_to_tensor(value, ty, node) + if ty is TFRTypes.TENSOR_LIST and not _require_tensor_list(attr_def): + value, ty = self._pack_tensor_list(value) + else: + value, ty = self._emit_default_constant_from_proto(attr_def) + arg_strs.append(value) + ty_strs.append(str(ty)) + + if ret_ssa_values: + self.emit('\n{} = '.format(ret_str)) + + self.emit('tfr.call @tf__{}('.format(_camel_to_snake(op_name))) + arg_str = ', '.join(arg_strs) + arg_ty_str = ', '.join(ty_strs) + ret_ty_str = ', '.join([str(ty) for ty in ret_tys]) + self._emit_with_loc( + '{}) : ({}) -> ({})'.format(arg_str, arg_ty_str, ret_ty_str), node) + return list(zip(ret_ssa_values, ret_tys)) + + def visit_If(self, node): + raise NotImplementedError('If not supported.') + + def visit_Name(self, node): + val, lookup_type = self.symbol_table.lookup(node.id) + type_ = self._get_inferred_type(node, lookup_type) + return val, type_ + + def visit_Return(self, node): + values = self.visit(node.value) + if self.symbol_table.in_scf_scope(): + self.emit('\nscf.yield ') + else: + self.emit('\ntfr.return ') + if not values: + return + + if isinstance(values, list): + vals, tys = zip(*values) + else: + vals = values[0] + tys = values[1] + + if isinstance(tys, list) or isinstance(tys, tuple): + tys = [str(t) for t in tys] + self._emit_with_loc('{} : {}'.format(', '.join(vals), ', '.join(tys)), + node) + elif tys != TFRTypes.NONE: + # TODO(fengliuai): scf region yield uses this branch. Fix it. + self._emit_with_loc('{} : {}'.format(vals, tys), node) + + def visit_Subscript(self, node): + val, ty = self.visit(node.value) + type_ = self._get_inferred_type(node.value, ty) + + # TODO(fengliuai): Here we hardcode the node.slice here to get the index + # type. Use the visit method once the type inference is done. + # slice_val, slice_ty = self.visit(node.slice) + if isinstance(node.slice, ast.Index): + if isinstance(node.slice.value, ast.Constant): + # TODO(fengliuai): promote to an assignment + idx_val = self._ssa_name('cst') + self._emit_with_loc( + '\n{} = constant {} : index'.format(idx_val, + node.slice.value.value), node) + else: + idx_val, _ = self.visit(node.slice.value) + else: + raise NotImplementedError('non-index slice not supported.') + + elt = self._ssa_name('elt') + if type_ == TFRTypes.TENSOR_LIST: + self.emit('\n{} = tfr.get_element {}[{}] '.format(elt, val, idx_val)) + self._emit_with_loc(': (!tfr.tensor_list, index) -> !tfr.tensor', node) + return (elt, TFRTypes.TENSOR) + elif type_ == TFRTypes.TF_TENSOR_SHAPE_LIST: + size_ = self._ssa_name('size') + self.emit('\n{} = shape.get_extent {}, {}'.format(size_, val, idx_val)) + self._emit_with_loc(': !shape.shape, index -> !shape.size', node) + self._emit_with_loc( + '\n{} = shape.size_to_index {} : !shape.size'.format(elt, size_), + node) + return (elt, TFRTypes.INDEX) + + def visit_List(self, node): + out_type = self._get_inferred_type(node) + vals = [] + tys = [] + for elt in node.elts: + val, ty = self.visit(elt) + if ty in _attribute_types and out_type == TFRTypes.TENSOR_LIST: + # This list is a tensor list, then cast all the input values to tensors. + val, ty = self._value_to_tensor(val, ty, node) + else: + # We shouldn't use index type to build the list because list will be use + # as attribute. 
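+        # (index values are widened to i64 below so the resulting
+        # "tfr.build_list" can be materialized as a !tfr.attr)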
+ val, ty = self._index_to_I64(val, ty) + vals.append(val) + tys.append(str(ty)) + + list_val = self._ssa_name('list') + self.emit('\n{} = "tfr.build_list"({})'.format(list_val, ', '.join(vals))) + self._emit_with_loc(' : ({}) -> {}'.format(', '.join(tys), out_type), node) + return (list_val, out_type) + + def visit_Tuple(self, node): + return [self.visit(elt) for elt in node.elts] + + def visit_UnaryOp(self, node): + value, ty = self.visit(node.operand) + if isinstance(node.op, ast.USub): + zero_value = self._ssa_name('zero') + self._emit_with_loc('\n{} = constant 0 : {}'.format(zero_value, ty), node) + ssa_value = self._ssa_name('cst') + if ty == TFRTypes.I32 or ty == TFRTypes.I64: + self._emit_with_loc( + '\n{} = subi {}, {} : {}'.format(ssa_value, zero_value, value, ty), + node) + elif ty == TFRTypes.F32: + self._emit_with_loc( + '\n{} = subf {}, {} : {}'.format(ssa_value, zero_value, value, ty), + node) + else: + raise NotImplementedError('USub type not recognized: ' + str(ty)) + return ssa_value, ty + raise NotImplementedError('USub operator not recognized') + + def visit_For(self, node): + raise NotImplementedError('For operator not recognized') + + def visit_While(self, node): + raise NotImplementedError('While operator not recognized') + + def visit_Try(self, node): + # Only handles the body of the try statement. + self.visit_block(node.body) + + +def _apply_py_to_tf_passes(node, ctx): + """Apply transformations from PyToTF to match tf.function tracing.""" + # TODO(fengliuai): we don't know which passes are required, thus we evalute + # each one when the corresponding node is handled. + # copied from PyToTF.transform_ast + node = return_statements.transform(node, ctx, False) + node = control_flow.transform(node, ctx) + return node + + +class TfrGen(transpiler.GenericTranspiler): + """Transforms Python objects into TFR MLIR source code.""" + + def __init__(self, op_defs): + self._op_defs = op_defs + + def transform_ast(self, node, ctx): + node = _apply_py_to_tf_passes(node, ctx) + # TODO(mdan): Enable this. + # node = anf.transform(node, ctx) + + graphs = cfg.build(node) + node = qual_names.resolve(node) + node = activity.resolve(node, ctx) + node = reaching_definitions.resolve(node, ctx, graphs) + node = reaching_fndefs.resolve(node, ctx, graphs) + node = type_inference.resolve(node, ctx, graphs, + TFRTypeResolver(self._op_defs)) + + mlir_generator = TFRGen(ctx, self._op_defs) + mlir_generator.visit(node) + return mlir_generator.code_buffer + + +def tfr_gen(func, op_defs): + """Parse a function and emit the TFR functions.""" + mlir_code, _ = TfrGen(op_defs).transform(func, None) + assert tfr.verify(mlir_code), 'mlir code not verified: {}'.format(mlir_code) + return mlir_code + + +def tfr_gen_from_module(source, method_prefix=None, op_libraries=None): + """Parse the input source module and emit the TFR functions.""" + op_defs = OpDefCache() + + # Load the op library so the op is added to the op registry. This is + # required when the op cc_library couldn't be statically linked in open + # source. + # This is a no op if the op shared library couldn't be found in the same + # directory of the op Python API. + # TODO(fengliuai): make the .so file path configurable. 
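+  # Naming convention assumed by the lookup below: for a generated wrapper
+  # module `gen_foo_ops.py`, the matching kernel library is expected to be a
+  # sibling file `foo_ops.so` (the `gen_` prefix is stripped and `.py` is
+  # replaced with `.so`).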
+ if op_libraries: + prefix_len = len('gen_') + for m in op_libraries: + lib_dir = os.path.dirname(m.__file__) + lib_name = os.path.basename(m.__file__)[prefix_len:].replace('.py', '.so') + lib_path = os.path.join(lib_dir, lib_name) + if os.path.exists(lib_path): + logging.info('load file: ' + lib_path) + load_library.load_op_library(lib_path) + else: + # The op library is generated from the source module, then we load all the + # .so file in the directory + lib_dir = os.path.dirname(source.__file__) + for lib_name in os.listdir(lib_dir): + if lib_name.endswith('.so'): + lib_path = os.path.join(lib_dir, lib_name) + logging.info('load file: ' + lib_path) + load_library.load_op_library(lib_path) + + mlir_funcs = [ + tfr_gen(func, op_defs) + for name, func in tf_inspect.getmembers(source, tf_inspect.isfunction) + if not method_prefix or name.startswith(method_prefix) + ] + + return '\n'.join(mlir_funcs + op_defs.mlir_external_funcs()) diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py b/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py new file mode 100644 index 00000000000..88696490c4a --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py @@ -0,0 +1,563 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for `tfr_gen` module.""" + +# pylint: disable=missing-function-docstring +# pylint: disable=invalid-name +# pylint: disable=g-direct-tensorflow-import + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +from tensorflow.compiler.mlir.python.mlir_wrapper import filecheck_wrapper as fw +from tensorflow.compiler.mlir.tfr.python import composite +from tensorflow.compiler.mlir.tfr.python.tfr_gen import tfr_gen_from_module as tfr_gen +from tensorflow.compiler.mlir.tfr.resources import gen_test_ops as test_ops +from tensorflow.python.ops import gen_array_ops as array_ops +from tensorflow.python.ops import gen_math_ops as math_ops +from tensorflow.python.platform import test + + +Composite = composite.Composite + +#--- test fn for mlir location --- + + +@Composite('TestInputNOp') +def _tfr_loc_test(x): + n = 10 + x_sum = x[0] + for i in range(1, n): + x_sum = math_ops.Add(x_sum, x[i]) + return x_sum + + +#--- test fn for tfr tensors --- + + +@composite.Composite('TestNoOp') +def _tfr_tensor_empty_arg(): + pass + + +@composite.Composite('TestIdentityOp') +def _tfr_tensor_tensor(x): + return x + + +@composite.Composite('TestIdentityNOp') +def _tfr_tensor_tensor_list(x): + return x + + +@composite.Composite('TestInputNOp') +def _tfr_tensor_tensor_list_get_elt(x): + return x[1] + + +@composite.Composite('TestOutputNOp') +def _tfr_tensor_tensor_list_output(x): + return [x, x] + + +@composite.Composite('TestTwoInputsOp') +def _tfr_tensor_tensor_list_split(x, y, pred): + z, _ = array_ops.Split(axis=0, value=x, num_split=2) + (y, pred) # pylint: disable=pointless-statement + return z + + +@composite.Composite('TestNumAttrsOp') +def _tfr_tensor_tensor_with_cst(x1, y1, x2, y2): + x = array_ops.OneHot( + indices=[0, 2, -1, x1], depth=y1, on_value=True, off_value=False) + (x, x2, y2) # pylint: disable=pointless-statement + return + + +@composite.Composite('TestTwoOutputsOp') +def _tfr_tensor_two_output(x): + z = array_ops.Split(axis=0, value=x, num_split=2) + return z[0], z[1] + + +#--- test fn for scf control flow --- + + +@composite.Composite('TestTwoInputsOp') +def _tfr_control_flow_if(x, y, pred): + if pred: + return x + else: + return y + + +@composite.Composite('TestThreeInputsOp') +def _tfr_control_flow_nested_if(x, y, z, select): + if select == 'x': + return x + elif select == 'y': + return y + else: + return z + + +@composite.Composite('TestInputNOp') +def _tfr_control_flow_range_for(x): + # TODO(fengliuai): use len(x) instead + n = 10 + x_sum = x[0] + for i in range(1, n): + x_sum = math_ops.Add(x_sum, x[i]) + return x_sum + + +#--- test fn for tf ops --- + + +@composite.Composite('TestComplexTFOp') +def _tfr_tf_ops_complex(lhs, rhs): + left_padding, _ = array_ops.SplitV( + value=lhs, size_splits=[rhs, -1], axis=0, num_split=2) + _, right_padding = array_ops.SplitV( + value=lhs, size_splits=[rhs, rhs], axis=1, num_split=2) + return [left_padding, right_padding] + + +@composite.Composite('TestIdentityOp') +def _tfr_tf_ops_tensor(x): + return array_ops.Identity(x) + + +@composite.Composite('TestTwoInputsOp') +def _tfr_tf_ops_tensors(x, y, pred): + if pred: + return math_ops.Add(x, y) + else: + return array_ops.Concat(0, [x, y]) + + +@composite.Composite('TestInputNOp') +def _tfr_tf_ops_with_defaults(ins): + return test_ops.TestTwoInputsOp(ins[0], ins[1]) + + +#--- test fn for tfr attributes --- + + 
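+# Note: the tests below pick up compositions by name prefix, e.g.
+# tfr_gen(sys.modules[__name__], '_tfr_attrs', [test_ops]) only collects the
+# `_tfr_attrs_*` functions in this section.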
+@composite.Composite('TestNumAttrsOp') +def _tfr_attrs_num_type(x, y, x1, y1): + # int + z0 = [x, y] + z1 = x == y + z2 = x < y + z3 = x <= y + z4 = x > y + z5 = x >= y + z6 = x != y + z7 = x + y + z8 = x - y + z8 += x + z8 += 1 + (z0, z1, z2, z3, z4, z5, z6, z7, z8) # pylint: disable=pointless-statement + + # float + z9 = x1 > y1 + z10 = x1 + y1 + z11 = [x1, y1] + (z9, z10, z11) # pylint: disable=pointless-statement + return + + +@composite.Composite('TestNonNumAttrsOp') +def _tfr_attrs_tfr_type(x, y, z): + z1 = x == y + z2 = x == 'test' + z3 = y == z + (z1, z2, z3) # pylint: disable=pointless-statement + return + + +#--- test fn for shapes --- + + +@composite.Composite('TestIdentityOp') +def _tfr_shapes(x): + s1 = x.shape + s3 = x.shape.as_list() + + for i in range(len(s3)): + s3[i] # pylint: disable=pointless-statement + + for i in range(1, len(s3), 2): + s3[i] # pylint: disable=pointless-statement + + s5 = array_ops.Shape(x) + (s1, s3, s5) # pylint: disable=pointless-statement + return x + + +class TFRGenTestBase(test.TestCase): + + def _check_code(self, tfr_code, exp_tfr_code): + return self.assertTrue(fw.check(str(tfr_code), exp_tfr_code), str(tfr_code)) + + +class TFRGenTensorTest(TFRGenTestBase): + """MLIR Generation Tests for MLIR TFR Program.""" + + def test_tfr_loc(self): + mlir_code = tfr_gen(sys.modules[__name__], '_tfr_loc', [test_ops]) + mlir_code_exp = r""" + CHECK-LABEL: tfr.func @tf__test_input_n_op(%x: !tfr.tensor_list) -> (!tfr.tensor) { + CHECK-NEXT: %[[n:.*]] = constant 10 : i64 + CHECK-SAME loc("tfr_gen_test.py":%{{.*}}:6) + CHECK-NEXT: %[[cst:.*]] = constant 0 : index + CHECK-SAME loc("tfr_gen_test.py":%[[sum_line:.*]]:10) + CHECK-NEXT: %[[elt:.*]] = tfr.get_element %x[%[[cst]]] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-SAME loc("tfr_gen_test.py":%[[sum_line]]:10) + CHECK-NEXT: %[[cst_1:.*]] = constant 1 : i64 + CHECK-SAME loc("tfr_gen_test.py":%[[for_line:.*]]:2) + CHECK-NEXT: %[[begin:.*]] = index_cast %[[cst_1]] : i64 to index + CHECK-SAME loc("tfr_gen_test.py":%[[for_line]]:2) + CHECK-NEXT: %[[end:.*]] = index_cast %[[n]] : i64 to index + CHECK-SAME loc("tfr_gen_test.py":%[[for_line]]:2) + CHECK-NEXT: %[[step:.*]] = constant 1 : index + CHECK-SAME loc("tfr_gen_test.py":%[[for_line]]:2) + CHECK-NEXT: %[[for_stmt:.*]] = scf.for %[[itr_1:.*]] = %[[begin]] to %[[end]] step %[[step]] + CHECK-SAME: iter_args(%[[it_arg:.*]] = %[[elt]]) -> (!tfr.tensor) { + CHECK-NEXT: %[[elt_1:.*]] = tfr.get_element %x[%itr_1] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-SAME loc("tfr_gen_test.py":%[[add_line:.*]]:34) + CHECK-NEXT: %[[Add:.*]] = tfr.call @tf__add(%[[it_arg]], %[[elt_1]]) : (!tfr.tensor, !tfr.tensor) -> (!tfr.tensor) + CHECK-SAME loc("tfr_gen_test.py":%[[add_line]]:12) + CHECK-NEXT: scf.yield %[[Add]] : !tfr.tensor + CHECK-SAME loc(unknown) + CHECK-NEXT: } + CHECK-SAME loc("tfr_gen_test.py":%[[for_line]]:2) + CHECK-NEXT: %{{.*}} = constant true + CHECK-SAME loc(unknown) + CHECK-NEXT: tfr.return %[[for_stmt]] : !tfr.tensor + CHECK-SAME loc(unknown) + CHECK-NEXT: } + CHECK-SAME loc("tfr_gen_test.py":%{{def_line:.*}}:0) + """ + self._check_code(mlir_code, mlir_code_exp) + + def test_tfr_tensors(self): + mlir_code = tfr_gen(sys.modules[__name__], '_tfr_tensor', [test_ops]) + mlir_code_exp = r""" + CHECK-LABEL: tfr.func @tf__test_no_op() -> () { + CHECK-NEXT: tfr.return + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_identity_op(%x: !tfr.tensor) -> (!tfr.tensor) { + CHECK-NEXT: constant true + CHECK-NEXT: tfr.return %x : !tfr.tensor + CHECK-NEXT: } + + 
CHECK-LABEL: tfr.func @tf__test_identity_n_op(%x: !tfr.tensor_list) -> (!tfr.tensor_list) { + CHECK-NEXT: constant true + CHECK-NEXT: tfr.return %x : !tfr.tensor_list + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_input_n_op(%x: !tfr.tensor_list) -> (!tfr.tensor) { + CHECK-NEXT: constant true + CHECK-NEXT: %[[index:.*]] = constant 1 : index + CHECK-NEXT: %[[sub:.*]] = tfr.get_element %x[%cst_1] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: tfr.return %[[sub]] : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_output_n_op(%x: !tfr.tensor) -> (!tfr.tensor_list) { + CHECK-NEXT: constant true + CHECK-NEXT: %[[list:.*]] = "tfr.build_list"(%x, %x) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + CHECK-NEXT: tfr.return %[[list]] : !tfr.tensor_list + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_two_inputs_op(%x: !tfr.tensor, %y: !tfr.tensor, %pred: i1{tfr.name="pred",tfr.default=false}) -> (!tfr.tensor) { + CHECK-NEXT: %[[cst:.*]] = constant 0 : i64 + CHECK-NEXT: %[[cst_1:.*]] = constant 2 : i64 + CHECK-NEXT: %[[cst_2:.*]] = "tfr.constant_tensor"(%[[cst]]) : (i64) -> !tfr.tensor + CHECK-NEXT: %[[Split:.*]] = tfr.call @tf__split(%[[cst_2]], %x, %[[cst_1]]) : (!tfr.tensor, !tfr.tensor, i64) -> (!tfr.tensor_list) + CHECK-NEXT: %[[cst_4:.*]] = constant 0 : index + CHECK-NEXT: %[[elt:.*]] = tfr.get_element %[[Split]][%idx] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[cst_5:.*]] = constant 1 : index + CHECK-NEXT: %[[elt_1:.*]] = tfr.get_element %[[Split]][%idx_1] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: constant true + CHECK-NEXT: tfr.return %[[elt]] : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_num_attrs_op(%x1: i64{tfr.name="x1",tfr.default=-10}, %y1: i64{tfr.name="y1",tfr.default=1}, %x2: f32{tfr.name="x2",tfr.default=0.0}, %y2: f32{tfr.name="y2",tfr.default=-3.0}) -> () { + CHECK-NEXT: %[[cst:.*]] = constant 0 : i64 + CHECK-NEXT: %[[cst_1:.*]] = constant 2 : i64 + CHECK-NEXT: %[[cst_2:.*]] = constant 1 : i64 + CHECK-NEXT: %[[zero:.*]] = constant 0 : i64 + CHECK-NEXT: %[[cst_3:.*]] = subi %zero, %cst_2 : i64 + CHECK-NEXT: %[[list:.*]] = "tfr.build_list"(%[[cst]], %[[cst_1]], %[[cst_3]], %x1) : (i64, i64, i64, i64) -> !tfr.attr + CHECK-NEXT: %[[cst_4:.*]] = constant true + CHECK-NEXT: %[[cst_5:.*]] = constant false + CHECK-NEXT: %[[cst_6:.*]] = "tfr.constant_tensor"(%[[list]]) : (!tfr.attr) -> !tfr.tensor + CHECK-NEXT: %[[cst_7:.*]] = "tfr.constant_tensor"(%y1) : (i64) -> !tfr.tensor + CHECK-NEXT: %[[cst_8:.*]] = "tfr.constant_tensor"(%[[cst_4]]) : (i1) -> !tfr.tensor + CHECK-NEXT: %[[cst_9:.*]] = "tfr.constant_tensor"(%[[cst_5]]) : (i1) -> !tfr.tensor + CHECK-NEXT: %[[cst_10:.*]] = constant -1 : i64 + CHECK-NEXT: %[[OneHot:.*]] = tfr.call @tf__one_hot(%[[cst_6]], %[[cst_7]], %[[cst_8]], %[[cst_9]], %[[cst_10]]) + CHECK-SAME: (!tfr.tensor, !tfr.tensor, !tfr.tensor, !tfr.tensor, i64) -> (!tfr.tensor) + CHECK-NEXT: constant true + CHECK-NEXT: tfr.return + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_two_outputs_op(%x: !tfr.tensor) -> (!tfr.tensor, !tfr.tensor) { + CHECK-NEXT: %[[cst:.*]] = constant 0 : i64 + CHECK-NEXT: %[[cst_1:.*]] = constant 2 : i64 + CHECK-NEXT: %[[cst_2:.*]] = "tfr.constant_tensor"(%[[cst]]) : (i64) -> !tfr.tensor + CHECK-NEXT: %[[Split:.*]] = tfr.call @tf__split(%[[cst_2]], %x, %[[cst_1]]) : (!tfr.tensor, !tfr.tensor, i64) -> (!tfr.tensor_list) + CHECK-NEXT: constant true + CHECK-NEXT: %[[cst_4:.*]] = constant 0 : index + CHECK-NEXT: %[[elt:.*]] = tfr.get_element %[[Split]][%cst_4] : 
(!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[cst_5:.*]] = constant 1 : index + CHECK-NEXT: %[[elt_1:.*]] = tfr.get_element %[[Split]][%cst_5] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: tfr.return %[[elt]], %[[elt_1]] : !tfr.tensor, !tfr.tensor + CHECK-NEXT: } + """ + self._check_code(mlir_code, mlir_code_exp) + + def test_tfr_control_flow(self): + mlir_code = tfr_gen(sys.modules[__name__], '_tfr_control_flow', [test_ops]) + mlir_code_exp = r""" + CHECK-LABEL: tfr.func @tf__test_two_inputs_op(%x: !tfr.tensor, %y: !tfr.tensor, + CHECK-SAME: %pred: i1{tfr.name="pred",tfr.default=false}) -> (!tfr.tensor) { + CHECK-NEXT: %[[if:.*]] = scf.if %pred -> (!tfr.tensor) { + CHECK-NEXT: constant true + CHECK-NEXT: scf.yield %x : !tfr.tensor + CHECK-NEXT: } else { + CHECK-NEXT: constant true + CHECK-NEXT: scf.yield %y : !tfr.tensor + CHECK-NEXT: } + CHECK-NEXT: tfr.return %if_stmt : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_three_inputs_op(%x: !tfr.tensor, %y: !tfr.tensor, %z: !tfr.tensor, + CHECK-SAME: %select: !tfr.attr{tfr.name="act",tfr.default="z"}) -> (!tfr.tensor) { + CHECK-NEXT: %[[cst:.*]] = tfr.constant "x" -> !tfr.attr + CHECK-NEXT: %[[eq:.*]] = tfr.equal %select, %[[cst]] -> i1 + CHECK-NEXT: %[[if_stmt:.*]] = scf.if %[[eq]] -> (!tfr.tensor) { + CHECK-NEXT: %[[cst_1:.*]] = constant true + CHECK-NEXT: scf.yield %x : !tfr.tensor + CHECK-NEXT: } else { + CHECK-NEXT: %[[cst_2:.*]] = tfr.constant "y" -> !tfr.attr + CHECK-NEXT: %[[eq_1:.*]] = tfr.equal %select, %[[cst_2]] -> i1 + CHECK-NEXT: %[[if_stmt1:.*]] = scf.if %[[eq_1]] -> (!tfr.tensor) { + CHECK-NEXT: %[[cst_3:.*]] = constant true + CHECK-NEXT: scf.yield %y : !tfr.tensor + CHECK-NEXT: } else { + CHECK-NEXT: %[[cst_4:.*]] = constant true + CHECK-NEXT: scf.yield %z : !tfr.tensor + CHECK-NEXT: } + CHECK-NEXT: scf.yield %[[if_stmt1]] : !tfr.tensor + CHECK-NEXT: } + CHECK-NEXT: tfr.return %[[if_stmt]] : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_input_n_op(%x: !tfr.tensor_list) -> (!tfr.tensor) { + CHECK-NEXT: %[[n:.*]] = constant 10 : i64 + CHECK-NEXT: %[[cst:.*]] = constant 0 : index + CHECK-NEXT: %[[elt:.*]] = tfr.get_element %x[%[[cst]]] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[cst_1:.*]] = constant 1 : i64 + CHECK-NEXT: %[[begin:.*]] = index_cast %[[cst_1]] : i64 to index + CHECK-NEXT: %[[end:.*]] = index_cast %[[n]] : i64 to index + CHECK-NEXT: %[[step:.*]] = constant 1 : index + CHECK-NEXT: %[[for_stmt:.*]] = scf.for %[[itr_1:.*]] = %[[begin]] to %[[end]] step %[[step]] + CHECK-SAME: iter_args(%[[it_arg:.*]] = %[[elt]]) -> (!tfr.tensor) { + CHECK-NEXT: %[[elt_1:.*]] = tfr.get_element %x[%itr_1] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[Add:.*]] = tfr.call @tf__add(%[[it_arg]], %[[elt_1]]) : (!tfr.tensor, !tfr.tensor) -> (!tfr.tensor) + CHECK-NEXT: scf.yield %[[Add]] : !tfr.tensor + CHECK-NEXT: } + CHECK-NEXT: %{{.*}} = constant true + CHECK-NEXT: tfr.return %[[for_stmt]] : !tfr.tensor + CHECK-NEXT: } + """ + self._check_code(mlir_code, mlir_code_exp) + + def test_tfr_tf_ops(self): + mlir_code = tfr_gen(sys.modules[__name__], '_tfr_tf_ops', [test_ops]) + mlir_code_exp = r""" + CHECK-LABEL: tfr.func @tf__test_complex_tf_op(%lhs: !tfr.tensor, %rhs: !tfr.tensor) -> (!tfr.tensor_list) { + CHECK-NEXT: %[[cst:.*]] = constant 1 : i64 + CHECK-NEXT: %[[zero:.*]] = constant 0 : i64 + CHECK-NEXT: %[[cst_1:.*]] = subi %[[zero]], %cst : i64 + CHECK-NEXT: %[[cst_2:.*]] = "tfr.constant_tensor"(%[[cst_1]]) : (i64) -> !tfr.tensor + CHECK-NEXT: 
%[[list:.*]] = "tfr.build_list"(%rhs, %[[cst_2]]) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + CHECK-NEXT: %[[cst_3:.*]] = constant 0 : i64 + CHECK-NEXT: %[[cst_4:.*]] = constant 2 : i64 + CHECK-NEXT: %[[zero_1:.*]] = constant 0 : i64 + CHECK-NEXT: %[[pack:.*]] = tfr.call @tf__pack(%[[list]], %[[zero_1]]) : (!tfr.tensor_list, i64) -> !tfr.tensor + CHECK-NEXT: %[[cst_5:.*]] = "tfr.constant_tensor"(%[[cst_3]]) : (i64) -> !tfr.tensor + CHECK-NEXT: %[[SplitV:.*]] = tfr.call @tf__split_v(%lhs, %[[pack]], %[[cst_5]], %[[cst_4]]) + CHECK-NEXT: %[[idx:.*]] = constant 0 : index + CHECK-NEXT: %[[elt:.*]] = tfr.get_element %SplitV[%idx] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[idx_1:.*]] = constant 1 : index + CHECK-NEXT: %[[elt_1:.*]] = tfr.get_element %SplitV[%idx_1] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[list_1:.*]] = "tfr.build_list"(%rhs, %rhs) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + CHECK-NEXT: %[[cst_6:.*]] = constant 1 : i64 + CHECK-NEXT: %[[cst_7:.*]] = constant 2 : i64 + CHECK-NEXT: %[[zero_2:.*]] = constant 0 : i64 + CHECK-NEXT: %[[pack_1:.*]] = tfr.call @tf__pack(%[[list_1]], %[[zero_2]]) : (!tfr.tensor_list, i64) -> !tfr.tensor + CHECK-NEXT: %[[cst_8:.*]] = "tfr.constant_tensor"(%[[cst_6]]) : (i64) -> !tfr.tensor + CHECK-NEXT: %[[SplitV_1:.*]] = tfr.call @tf__split_v(%lhs, %[[pack_1]], %[[cst_8]], %[[cst_7]]) + CHECK-NEXT: %[[idx_2:.*]] = constant 0 : index + CHECK-NEXT: %[[elt_2:.*]] = tfr.get_element %SplitV_1[%idx_2] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[idx_3:.*]] = constant 1 : index + CHECK-NEXT: %[[elt_3:.*]] = tfr.get_element %SplitV_1[%idx_3] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[cst_9:.*]] = constant true + CHECK-NEXT: %[[list_2:.*]] = "tfr.build_list"(%[[elt]], %[[elt_3]]) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + CHECK-NEXT: tfr.return %[[list_2]] : !tfr.tensor_list + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_identity_op(%x: !tfr.tensor) -> (!tfr.tensor) { + CHECK-NEXT: %cst = constant true + CHECK-NEXT: %[[Id:.*]] = tfr.call @tf__identity(%x) : (!tfr.tensor) -> (!tfr.tensor) + CHECK-NEXT: tfr.return %[[Id]] : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_two_inputs_op(%x: !tfr.tensor, %y: !tfr.tensor, + CHECK-SAME: %pred: i1{tfr.name="pred",tfr.default=false}) -> (!tfr.tensor) { + CHECK-NEXT: %[[if_stmt:.*]] = scf.if %pred -> (!tfr.tensor) { + CHECK-NEXT: %cst = constant true + CHECK-NEXT: %[[Add:.*]] = tfr.call @tf__add(%x, %y) : (!tfr.tensor, !tfr.tensor) -> (!tfr.tensor) + CHECK-NEXT: scf.yield %[[Add]] : !tfr.tensor + CHECK-NEXT: } else { + CHECK-NEXT: %cst_1 = constant true + CHECK-NEXT: %[[cst_2:.*]] = constant 0 : i64 + CHECK-NEXT: %[[list:.*]] = "tfr.build_list"(%x, %y) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + CHECK-NEXT: %[[Concat:.*]] = tfr.call @tf__concat(%[[cst_2]], %[[list]]) : (i64, !tfr.tensor_list) -> (!tfr.tensor) + CHECK-NEXT: scf.yield %[[Concat]] : !tfr.tensor + CHECK-NEXT: } + CHECK-NEXT: tfr.return %[[if_stmt]] : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_input_n_op(%ins: !tfr.tensor_list) -> (!tfr.tensor) { + CHECK-NEXT: %cst = constant true + CHECK-NEXT: %[[cst_1:.*]] = constant 0 : index + CHECK-NEXT: %[[elt:.*]] = tfr.get_element %ins[%cst_1] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[cst_2:.*]] = constant 1 : index + CHECK-NEXT: %[[elt_1:.*]] = tfr.get_element %ins[%cst_2] : (!tfr.tensor_list, index) -> !tfr.tensor + CHECK-NEXT: %[[cst_3:.*]] = constant false + 
CHECK-NEXT: %[[call:.*]] = tfr.call @tf__test_two_inputs_op( + CHECK-SAME: %[[elt]], %[[elt_1]], %[[cst_3]]) : (!tfr.tensor, !tfr.tensor, i1) -> (!tfr.tensor) + CHECK-NEXT: tfr.return %[[call]] : !tfr.tensor + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__add_(!tfr.tensor,!tfr.tensor) -> (!tfr.tensor) attributes {T} + + CHECK-LABEL: tfr.func @tf__concat_(!tfr.tensor,!tfr.tensor_list) -> (!tfr.tensor) attributes {N,T,i32_} + + CHECK-LABEL: tfr.func @tf__identity_(!tfr.tensor) -> (!tfr.tensor) attributes {T} + + CHECK-LABEL: tfr.func @tf__pack_(!tfr.tensor_list,i64{tfr.name="axis"}) -> (!tfr.tensor) attributes {N,T,axis} + + CHECK-LABEL: tfr.func @tf__split_v_(!tfr.tensor,!tfr.tensor,!tfr.tensor,i64{tfr.name="num_split"}) -> (!tfr.tensor_list) attributes {T,Tlen,i32_,num_split} + + CHECK-LABEL: tfr.func @tf__test_two_inputs_op_(!tfr.tensor,!tfr.tensor,i1{tfr.name="pred"}) -> (!tfr.tensor) attributes {T,pred} + + CHECK-LABEL: tfr.func @tf__test_complex_tf_op_(!tfr.tensor,!tfr.tensor,i64{tfr.name="N"}) -> (!tfr.tensor_list) attributes {N,T,Tlen} + + CHECK-LABEL: tfr.func @tf__test_identity_op_(!tfr.tensor) -> (!tfr.tensor) attributes {T} + + CHECK-LABEL: tfr.func @tf__test_two_inputs_op_(!tfr.tensor,!tfr.tensor,i1{tfr.name="pred"}) -> (!tfr.tensor) attributes {T,pred} + + CHECK-LABEL: tfr.func @tf__test_input_n_op_(!tfr.tensor_list) -> (!tfr.tensor) attributes {N,T} + """ + self._check_code(mlir_code, mlir_code_exp) + + def test_tfr_attrs(self): + mlir_code = tfr_gen(sys.modules[__name__], '_tfr_attrs', [test_ops]) + mlir_code_exp = r""" + CHECK-LABEL: tfr.func @tf__test_num_attrs_op( + CHECK-SAME: %x: i64{tfr.name="x1",tfr.default=-10}, + CHECK-SAME: %y: i64{tfr.name="y1",tfr.default=1}, + CHECK-SAME: %x1: f32{tfr.name="x2",tfr.default=0.0}, + CHECK-SAME: %y1: f32{tfr.name="y2",tfr.default=-3.0}) -> () { + CHECK-NEXT: %{{.*}} = "tfr.build_list"(%x, %y) : (i64, i64) -> !tfr.attr + CHECK-NEXT: %{{.*}} = cmpi "eq", %x, %y : i64 + CHECK-NEXT: %{{.*}} = cmpi "ult", %x, %y : i64 + CHECK-NEXT: %{{.*}} = cmpi "ule", %x, %y : i64 + CHECK-NEXT: %{{.*}} = cmpi "ugt", %x, %y : i64 + CHECK-NEXT: %{{.*}} = cmpi "uge", %x, %y : i64 + CHECK-NEXT: %{{.*}} = cmpi "ne", %x, %y : i64 + CHECK-NEXT: %{{.*}} = addi %x, %y : i64 + CHECK-NEXT: %{{.*}} = subi %x, %y : i64 + CHECK-NEXT: %[[add_1:.*]] = addi %sub, %x : i64 + CHECK-NEXT: %[[cst:.*]] = constant 1 : i64 + CHECK-NEXT: %{{.*}} = addi %[[add_1]], %[[cst]] : i64 + CHECK-NEXT: %{{.*}} = cmpf "ugt", %x1, %y1 : f32 + CHECK-NEXT: %{{.*}} = addf %x1, %y1 : f32 + CHECK-NEXT: %{{.*}} = "tfr.build_list"(%x1, %y1) : (f32, f32) -> !tfr.attr + CHECK-NEXT: %{{.*}} = constant true + CHECK-NEXT: tfr.return + CHECK-NEXT: } + + CHECK-LABEL: tfr.func @tf__test_non_num_attrs_op( + CHECK-SAME: %x: !tfr.attr{tfr.name="z"}, + CHECK-SAME: %y: !tfr.attr{tfr.name="x",tfr.default="hello"}, + CHECK-SAME: %z: !tfr.attr{tfr.name="y",tfr.default=f32}) -> () { + CHECK-NEXT: %{{.*}} = tfr.equal %x, %y -> i1 + CHECK-NEXT: %[[cst:.*]] = tfr.constant "test" -> !tfr.attr + CHECK-NEXT: %{{.*}} = tfr.equal %x, %[[cst]] -> i1 + CHECK-NEXT: %{{.*}} = tfr.equal %y, %z -> i1 + CHECK-NEXT: %{{.*}} = constant true + CHECK-NEXT: tfr.return + CHECK-NEXT: } + """ + self._check_code(mlir_code, mlir_code_exp) + + def test_tf_tensor_shape(self): + mlir_code = tfr_gen(sys.modules[__name__], '_tfr_shapes', [test_ops]) + mlir_code_exp = r""" + CHECK-LABEL: tfr.func @tf__test_identity_op(%x: !tfr.tensor) -> (!tfr.tensor) { + CHECK-NEXT: %[[shape:.*]] = tfr.get_shape %x -> !shape.shape + + CHECK-NEXT: 
%[[shape_1:.*]] = tfr.get_shape %x -> !shape.shape + CHECK-NEXT: %[[len:.*]] = shape.rank %[[shape_1]] : !shape.shape -> !shape.size + CHECK-NEXT: %[[index:.*]] = shape.size_to_index %[[len]] : !shape.size + CHECK-NEXT: %[[begin:.*]] = constant 0 : index + CHECK-NEXT: %[[step:.*]] = constant 1 : index + CHECK-NEXT: scf.for %[[itr_1:.*]] = %[[begin]] to %[[index]] step %[[step]] { + CHECK-NEXT: %[[size:.*]] = shape.get_extent %[[shape_1]], %[[itr_1]]: !shape.shape, index -> !shape.size + CHECK-NEXT: %[[elt:.*]] = shape.size_to_index %[[size]] : !shape.size + CHECK-NEXT: scf.yield + CHECK-NEXT: } + + CHECK-NEXT: %[[cst:.*]] = constant 1 : i64 + CHECK-NEXT: %[[len_1:.*]] = shape.rank %shape_1 : !shape.shape -> !shape.size + CHECK-NEXT: %[[len_size_1:.*]] = shape.size_to_index %[[len_1]] : !shape.size + CHECK-NEXT: %[[cst_1:.*]] = constant 2 : i64 + CHECK-NEXT: %[[begin_1:.*]] = index_cast %[[cst]] : i64 to index + CHECK-NEXT: %[[step_1:.*]] = index_cast %[[cst_1]] : i64 to index + CHECK-NEXT: scf.for %[[itr_3:.*]] = %[[begin_1]] to %[[len_size_1]] step %[[step_1]] + + CHECK: %[[cst:.*]] = tfr.constant i32 -> !tfr.attr + CHECK-NEXT: %[[Shape:.*]] = tfr.call @tf__shape(%x, %[[cst]]) : (!tfr.tensor, !tfr.attr) -> (!tfr.tensor) + CHECK-NEXT: %{{.*}} = constant true + CHECK-NEXT: tfr.return %x : !tfr.tensor + CHECK-NEXT: } + """ + self._check_code(mlir_code, mlir_code_exp) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc new file mode 100644 index 00000000000..b7372cffe2d --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/AsmState.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +PYBIND11_MODULE(tfr_wrapper, m) { + m.def("verify", [](std::string input) { + mlir::MLIRContext ctx(/*loadAllDialects=*/true); + auto& registry = ctx.getDialectRegistry(); + registry.insert(); + ctx.getDialectRegistry().loadAll(&ctx); + + llvm::SourceMgr source_mgr = llvm::SourceMgr(); + source_mgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), + llvm::SMLoc()); + auto module = mlir::parseSourceFile(source_mgr, &ctx); + if (!module) { + return false; + } + + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &ctx); + if (failed(mlir::verify(*module))) { + module->emitError("Invalid MLIR module: failed verification."); + return false; + } + return true; + }); +} diff --git a/tensorflow/compiler/mlir/tfr/resources/BUILD b/tensorflow/compiler/mlir/tfr/resources/BUILD new file mode 100644 index 00000000000..62ca65c5b57 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/resources/BUILD @@ -0,0 +1,97 @@ +load("//tensorflow:tensorflow.bzl", "tf_custom_op_library", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") + +package( + default_visibility = [ + ":friends", + ], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = ["//third_party/mlir:subpackages"], + packages = [ + "//learning/brain/experimental/mlir/tfr/...", + "//tensorflow/compiler/mlir/...", + ], +) + +filegroup( + name = "decomposition_lib", + srcs = ["decomposition_lib.mlir"], +) + +cc_library( + name = "composite_ops_cc", + srcs = ["composite_ops.cc"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +tf_custom_op_library( + name = "composite_ops.so", + srcs = [ + "composite_ops.cc", + ], +) + +tf_gen_op_wrapper_py( + name = "gen_composite_ops", + out = "gen_composite_ops.py", + deps = [ + ":composite_ops_cc", + ], +) + +tf_custom_op_py_library( + name = "composite_ops", + dso = [":composite_ops.so"], + kernels = [":composite_ops_cc"], + visibility = ["//visibility:public"], + deps = [ + ":gen_composite_ops", + ], +) + +cc_library( + name = "test_ops_cc", + srcs = ["test_ops.cc"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +tf_custom_op_library( + name = "test_ops.so", + srcs = [ + "test_ops.cc", + ], +) + +tf_gen_op_wrapper_py( + name = "gen_test_ops", + out = "gen_test_ops.py", + deps = [ + ":test_ops_cc", + ], +) + +tf_custom_op_py_library( + name = "test_ops", + dso = ["test_ops.so"], + 
kernels = [ + ":test_ops_cc", + ], + srcs_version = "PY2AND3", + deps = [ + ":gen_test_ops", + ], +) diff --git a/tensorflow/c/eager/parallel_device/parallel_device_ops.cc b/tensorflow/compiler/mlir/tfr/resources/composite_ops.cc similarity index 58% rename from tensorflow/c/eager/parallel_device/parallel_device_ops.cc rename to tensorflow/compiler/mlir/tfr/resources/composite_ops.cc index 1decffca047..8120625bc89 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_ops.cc +++ b/tensorflow/compiler/mlir/tfr/resources/composite_ops.cc @@ -14,13 +14,26 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" -// TODO(allenl): Figure out if we need this op, and if so whether we should move -// it to core TF. Right now the eager C API does some checking of op -// registrations before calling into custom devices, but we may be able to avoid -// that. -REGISTER_OP("DeviceID") - .Output("device_id: int64") - .SetIsStateful() - .SetShapeFn(tensorflow::shape_inference::ScalarShape); +namespace tensorflow { + +REGISTER_OP("MyAddN") + .Input("inputs: N * T") + .Output("sum: T") + .Attr("N: int >= 1") + .Attr("T: {numbertype, variant}") + .SetIsCommutative() + .SetIsAggregate(); + +REGISTER_OP("MyBiasedDense") + .Input("input: T") + .Input("weight: T") + .Input("bias: T") + .Output("out: T") + .Attr("T: {float, int8}") + .Attr("act: {'', 'relu', 'relu6'} = ''"); + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfr/resources/decomposition_lib.mlir b/tensorflow/compiler/mlir/tfr/resources/decomposition_lib.mlir new file mode 100644 index 00000000000..f67d24c9fec --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/resources/decomposition_lib.mlir @@ -0,0 +1,109 @@ +// A test resource file which contains some pre-defined internal tfr.functions +// for decomposition and external tfr.functions for raising the decomposition +// result to the ops in the TF dialect. +// +// All the tfr.func functions are supposed to be translated from the Python +// function with tf.composite annotation. +// All the external tfr.func functions modeles the op signature defined by +// OpDefs. + +tfr.func @tf__my_add_n(%values: !tfr.tensor_list, + %n: i64 {tfr.name="N"}) -> !tfr.tensor { + %index = constant 0 : index + %cst = constant 1 : i64 + %eq = cmpi "eq", %n, %cst : i64 + %v1 = tfr.get_element %values[%index] : (!tfr.tensor_list, index) -> !tfr.tensor + %res = scf.if %eq -> !tfr.tensor { + scf.yield %v1 : !tfr.tensor + } else { + %step = index_cast %cst : i64 to index + %end = index_cast %n : i64 to index + %reduce = scf.for %i = %step to %end step %step iter_args(%reduce_iter=%v1) -> !tfr.tensor { + %v = tfr.get_element %values[%i] : (!tfr.tensor_list, index) -> !tfr.tensor + %reduce_next = tfr.call @tf__add(%reduce_iter, %v) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor + scf.yield %reduce_next : !tfr.tensor + } + scf.yield %reduce : !tfr.tensor + } + tfr.return %res : !tfr.tensor +} + +// Translated from tf.compose Python function. 
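+//
+// Roughly, such a Python composition looks like the sketch below. The
+// `Composite` decorator name is an assumption used only for illustration;
+// the body mirrors the semantics of the tfr.func that follows.
+//
+//   @Composite('MyBiasedDense')
+//   def my_biased_dense(input_, weight, bias, act=''):
+//     res = tf.raw_ops.MatMul(a=input_, b=weight) + bias
+//     if act == 'relu':
+//       return tf.raw_ops.Relu(features=res)
+//     if act == 'relu6':
+//       return tf.raw_ops.Relu6(features=res)
+//     return res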
+tfr.func @tf__my_biased_dense(%input: !tfr.tensor, %weight: !tfr.tensor,
+    %bias: !tfr.tensor,
+    %act: !tfr.attr{tfr.name="act", tfr.default=""}) -> !tfr.tensor {
+  %dot = tfr.call @tf__mat_mul(%input, %weight) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor
+  %add = tfr.call @tf__add(%dot, %bias) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor
+
+  %relu = tfr.constant "relu" -> !tfr.attr
+  %relu6 = tfr.constant "relu6" -> !tfr.attr
+
+  %is_relu = tfr.equal %act, %relu -> i1
+  %res = scf.if %is_relu -> !tfr.tensor {
+    %applied_relu = tfr.call @tf__relu(%add) : (!tfr.tensor) -> !tfr.tensor
+    scf.yield %applied_relu : !tfr.tensor
+  } else {
+    %is_relu6 = tfr.equal %act, %relu6 -> i1
+    %res1 = scf.if %is_relu6 -> !tfr.tensor {
+      %applied_relu6 = tfr.call @tf__relu6(%add) : (!tfr.tensor) -> !tfr.tensor
+      scf.yield %applied_relu6 : !tfr.tensor
+    } else {
+      scf.yield %add : !tfr.tensor
+    }
+    scf.yield %res1 : !tfr.tensor
+  }
+  tfr.return %res : !tfr.tensor
+}
+
+// This is a wrong decomposition, used to verify that tf.Elu isn't decomposed
+// since its kernel has been registered.
+tfr.func @tf__elu_(%input: !tfr.tensor) -> !tfr.tensor {
+  tfr.return %input : !tfr.tensor
+}
+
+// Translated from:
+//
+// REGISTER_OP("Add")
+//     .Input("x: T")
+//     .Input("y: T")
+//     .Output("z: T")
+//     .Attr(
+//         "T: {bfloat16, half, float, double, uint8, int8, int16, int32, int64, "
+//         "complex64, complex128, string}")
+tfr.func @tf__add_(!tfr.tensor, !tfr.tensor)
+    -> !tfr.tensor attributes{T}
+
+// Translated from:
+//
+// REGISTER_OP("MatMul")
+//     .Input("a: T")
+//     .Input("b: T")
+//     .Output("product: T")
+//     .Attr("transpose_a: bool = false")
+//     .Attr("transpose_b: bool = false")
+//     .Attr("T: {bfloat16, half, float, double, int32, int64, complex64, complex128}")
+// T is a derived attribute.
+// transpose_a and transpose_b are materialized attributes.
tfr.func @tf__mat_mul_(!tfr.tensor, !tfr.tensor,
+    i1 {tfr.name="transpose_a", tfr.default=false},
+    i1 {tfr.name="transpose_b", tfr.default=false})
+    -> !tfr.tensor attributes{T}
+
+// Translated from:
+//
+// REGISTER_OP("Relu")
+//     .Input("features: T")
+//     .Output("activations: T")
+//     .Attr("T: {realnumbertype, qint8}")
+// T is a derived attribute.
+tfr.func @tf__relu_(!tfr.tensor) -> !tfr.tensor attributes{T}
+
+
+// Translated from:
+//
+// REGISTER_OP("Relu6")
+//     .Input("features: T")
+//     .Output("activations: T")
+//     .Attr("T: {realnumbertype}")
+// T is a derived attribute.
+tfr.func @tf__relu6_(!tfr.tensor) -> !tfr.tensor attributes{T}
diff --git a/tensorflow/compiler/mlir/tfr/resources/test_ops.cc b/tensorflow/compiler/mlir/tfr/resources/test_ops.cc
new file mode 100644
index 00000000000..3aaa0850805
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfr/resources/test_ops.cc
@@ -0,0 +1,86 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" + +namespace tensorflow { + +REGISTER_OP("TestNoOp"); + +REGISTER_OP("TestIdentityOp") + .Input("input: T") + .Output("output: T") + .Attr("T: numbertype"); + +REGISTER_OP("TestIdentityNOp") + .Input("input: N * T") + .Output("output: N * T") + .Attr("N: int >= 1") + .Attr("T: numbertype"); + +REGISTER_OP("TestInputNOp") + .Input("input: N * T") + .Output("output: T") + .Attr("N: int >= 1") + .Attr("T: numbertype"); + +REGISTER_OP("TestOutputNOp") + .Input("input: T") + .Output("output: N * T") + .Attr("N: int >= 1") + .Attr("T: numbertype"); + +REGISTER_OP("TestTwoInputsOp") + .Input("lhs: T") + .Input("rhs: T") + .Output("output: T") + .Attr("T: numbertype") + .Attr("pred: bool = false"); + +REGISTER_OP("TestComplexTFOp") + .Input("lhs: T") + .Input("rhs: Tlen") + .Output("output: N * T") + .Attr("N: int >= 1") + .Attr("T: numbertype") + .Attr("Tlen: {int32, int64} = DT_INT64"); + +REGISTER_OP("TestNumAttrsOp") + .Attr("x1: int = -10") + .Attr("y1: int = 1") + .Attr("x2: float = 0.0") + .Attr("y2: float = -3.0"); + +REGISTER_OP("TestNonNumAttrsOp") + .Attr("z: shape") + .Attr("x: string = 'hello'") + .Attr("y: type = DT_FLOAT"); + +REGISTER_OP("TestThreeInputsOp") + .Input("x: T") + .Input("y: T") + .Input("z: T") + .Output("output: T") + .Attr("T: numbertype") + .Attr("act: {'x', 'y', 'z'} = 'z'"); + +REGISTER_OP("TestTwoOutputsOp") + .Input("input: T") + .Output("output1: T") + .Output("output2: T") + .Attr("T: numbertype"); + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfr/tests/control_flow.mlir b/tensorflow/compiler/mlir/tfr/tests/control_flow.mlir new file mode 100644 index 00000000000..8dacd57653f --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/tests/control_flow.mlir @@ -0,0 +1,57 @@ +// RUN: tfr-opt %s -tfr-decompose -verify-diagnostics -split-input-file | FileCheck %s + +tfr.func @tf__my_pack(%values: !tfr.tensor_list, + %n: i32 {tfr.name="N"}, + %axis: i32 {tfr.name="axis"}) -> !tfr.tensor { + %index = constant 0 : index + %cst = constant 1 : i32 + %eq = cmpi "eq", %n, %cst : i32 + %v1 = tfr.get_element %values[%index] : (!tfr.tensor_list, index) -> !tfr.tensor + %temp = tfr.call @tf__expand_dims(%v1, %axis) : (!tfr.tensor, i32) -> !tfr.tensor + %res = scf.if %eq -> !tfr.tensor { + scf.yield %temp : !tfr.tensor + } else { + %step = index_cast %cst : i32 to index + %end = index_cast %n : i32 to index + %reduce = scf.for %i = %step to %end step %step iter_args(%reduce_iter=%temp) -> !tfr.tensor { + %v = tfr.get_element %values[%i] : (!tfr.tensor_list, index) -> !tfr.tensor + %temp1 = tfr.call @tf__expand_dims(%v, %axis) : (!tfr.tensor, i32) -> !tfr.tensor + %reduce_next = tfr.call @tf__risc_concat(%reduce_iter, %temp1, %axis) : (!tfr.tensor, !tfr.tensor, i32) -> !tfr.tensor + scf.yield %reduce_next : !tfr.tensor + } + scf.yield %reduce : !tfr.tensor + } + tfr.return %res : !tfr.tensor +} + +// CHECK-LABEL: pack_one +func @pack_one(%arg0: tensor<2x3xf32>) -> tensor<1x2x3xf32> { + %0 = "tf.MyPack"(%arg0) {N=1:i32, axis=0:i32} : (tensor<2x3xf32>) -> tensor<1x2x3xf32> + return %0 : tensor<1x2x3xf32> + +// CHECK-NEXT: %[[AXIS:.*]] = constant 0 : i32 +// CHECK-NEXT: %[[CAST:.*]] = "tfr.cast"(%arg0) : (tensor<2x3xf32>) -> !tfr.tensor +// CHECK-NEXT: %[[ED:.*]] = tfr.call @tf__expand_dims(%[[CAST]], %[[AXIS]]) : (!tfr.tensor, i32) -> !tfr.tensor +// CHECK-NEXT: %[[BACK:.*]] = 
"tfr.cast"(%[[ED]]) : (!tfr.tensor) -> tensor<1x2x3xf32> +// CHECK-NEXT: return %[[BACK]] : tensor<1x2x3xf32> +} + +// CHECK-LABEL: pack_multiple +func @pack_multiple(%arg0: tensor<2x3xf32>, + %arg1: tensor<2x3xf32>, + %arg2: tensor<2x3xf32>) -> tensor<3x2x3xf32> { + %0 = "tf.MyPack"(%arg0, %arg1, %arg2) {N=3:i32, axis=0:i32} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<3x2x3xf32> + return %0 : tensor<3x2x3xf32> + +// CHECK-NEXT: %[[AXIS:.*]] = constant 0 : i32 +// CHECK-NEXT: %[[CAST0:.*]] = "tfr.cast"(%arg0) : (tensor<2x3xf32>) -> !tfr.tensor +// CHECK-NEXT: %[[CAST1:.*]] = "tfr.cast"(%arg1) : (tensor<2x3xf32>) -> !tfr.tensor +// CHECK-NEXT: %[[CAST2:.*]] = "tfr.cast"(%arg2) : (tensor<2x3xf32>) -> !tfr.tensor +// CHECK-NEXT: %[[EX0:.*]] = tfr.call @tf__expand_dims(%[[CAST0]], %[[AXIS]]) : (!tfr.tensor, i32) -> !tfr.tensor +// CHECK-NEXT: %[[EX1:.*]] = tfr.call @tf__expand_dims(%[[CAST1]], %[[AXIS]]) : (!tfr.tensor, i32) -> !tfr.tensor +// CHECK-NEXT: %[[CONCAT1:.*]] = tfr.call @tf__risc_concat(%[[EX0]], %[[EX1]], %c0_i32) : (!tfr.tensor, !tfr.tensor, i32) -> !tfr.tensor +// CHECK-NEXT: %[[EX2:.*]] = tfr.call @tf__expand_dims(%[[CAST2]], %[[AXIS]]) : (!tfr.tensor, i32) -> !tfr.tensor +// CHECK-NEXT: %[[CONCAT2:.*]] = tfr.call @tf__risc_concat(%[[CONCAT1]], %[[EX2]], %[[AXIS]]) : (!tfr.tensor, !tfr.tensor, i32) -> !tfr.tensor +// CHECK-NEXT: %[[BACK:.*]] = "tfr.cast"(%[[CONCAT2]]) : (!tfr.tensor) -> tensor<3x2x3xf32> +// CHECK-NEXT: return %[[BACK]] : tensor<3x2x3xf32> +} diff --git a/tensorflow/compiler/mlir/tfr/tests/decompose.mlir b/tensorflow/compiler/mlir/tfr/tests/decompose.mlir new file mode 100644 index 00000000000..97f12c9fedb --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/tests/decompose.mlir @@ -0,0 +1,84 @@ +// RUN: tfr-opt %s -tfr-decompose -verify-diagnostics -split-input-file | FileCheck %s + +// CHECK-LABEL: @tf__fake_no_op +tfr.func @tf__fake_no_op(%arg0: !tfr.tensor) -> !tfr.tensor { + tfr.return %arg0 : !tfr.tensor + +// CHECK-NEXT: tfr.return %arg0 : !tfr.tensor +} + +// CHECK-LABEL: @tf__intermediate +tfr.func @tf__intermediate(%arg0: !tfr.tensor) -> !tfr.tensor { + %0 = tfr.call @tf__risc(%arg0) : (!tfr.tensor) -> !tfr.tensor + tfr.return %0 : !tfr.tensor + +// CHECK-NEXT: %[[id:.*]] = tfr.call @tf__risc(%arg0) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: tfr.return %[[id]] : !tfr.tensor +} + +// CHECK-LABEL: @tf__fused_n +tfr.func @tf__fused_n( + %arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: index {tfr.name="A",tfr.default=1:index}) + -> !tfr.tensor_list { + %0 = tfr.call @tf__intermediate(%arg0) : (!tfr.tensor) -> !tfr.tensor + %1 = tfr.get_element %arg1[%arg2] : (!tfr.tensor_list, index) -> !tfr.tensor + %2 = tfr.call @tf__intermediate(%1) : (!tfr.tensor) -> !tfr.tensor + %3 = "tfr.build_list"(%0, %2) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + tfr.return %3 : !tfr.tensor_list + +// CHECK-NEXT: %[[id1:.*]] = tfr.call @tf__intermediate(%arg0) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[ge:.*]] = tfr.get_element %arg1[%arg2] : (!tfr.tensor_list, index) -> !tfr.tensor +// CHECK-NEXT: %[[id2:.*]] = tfr.call @tf__intermediate(%[[ge]]) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[bl:.*]] = "tfr.build_list"(%[[id1]], %[[id2]]) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list +// CHECK-NEXT: tfr.return %[[bl]] : !tfr.tensor_list +} + +//------------------------ + +// CHECK-LABEL: decompose_tf_no_op +func @decompose_tf_no_op(%arg0: tensor<1x2x3x4x!tf.string>) -> tensor<1x2x3x4x!tf.string> { + %0 = "tf.FakeNoOp"(%arg0) : 
(tensor<1x2x3x4x!tf.string>) -> tensor<1x2x3x4x!tf.string> + return %0 : tensor<1x2x3x4x!tf.string> + +// CHECK-NEXT: return %arg0 +} + +// CHECK-LABEL: decompose_tf_intermediate +func @decompose_tf_intermediate(%arg0: tensor<1x2x3x4x!tf.string>) -> tensor<1x2x3x4x!tf.string> { + %0 = "tf.Intermediate"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> tensor<1x2x3x4x!tf.string> + return %0 : tensor<1x2x3x4x!tf.string> + +// CHECK-NEXT: %[[casted:.*]] = "tfr.cast"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> !tfr.tensor +// CHECK-NEXT: %[[id:.*]] = tfr.call @tf__risc(%[[casted]]) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[back:.*]] = "tfr.cast"(%[[id]]) : (!tfr.tensor) -> tensor<1x2x3x4x!tf.string> +// CHECK-NEXT: return %[[back]] +} + +// CHECK-LABEL: decompose_fused_n_default +func @decompose_fused_n_default(%arg0: tensor<1x2x3x4x!tf.string>, %arg1: tensor, %arg2: tensor) -> tensor { + %0:2 = "tf.FusedN"(%arg0, %arg1, %arg2) : (tensor<1x2x3x4x!tf.string>, tensor, tensor) -> (tensor<1x2x3x4x!tf.string>, tensor) + return %0#1 : tensor + +// CHECK-NEXT: %[[in0:.*]] = "tfr.cast"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> !tfr.tensor +// CHECK-NEXT: %[[in2:.*]] = "tfr.cast"(%arg2) : (tensor) -> !tfr.tensor +// CHECK-NEXT: %[[id0:.*]] = tfr.call @tf__risc(%[[in0]]) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[id2:.*]] = tfr.call @tf__risc(%[[in2]]) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[back:.*]] = "tfr.cast"(%[[id2]]) : (!tfr.tensor) -> tensor +// CHECK-NEXT: return %[[back]] : tensor +} + +// CHECK-LABEL: decompose_fused_n +func @decompose_fused_n(%arg0: tensor<1x2x3x4x!tf.string>, %arg1: tensor, %arg2: tensor) -> tensor { + %0:2 = "tf.FusedN"(%arg0, %arg1, %arg2) {A=0:index} : (tensor<1x2x3x4x!tf.string>, tensor, tensor) -> (tensor<1x2x3x4x!tf.string>, tensor) + return %0#1 : tensor + +// CHECK-NEXT: %[[in0:.*]] = "tfr.cast"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> !tfr.tensor +// CHECK-NEXT: %[[in1:.*]] = "tfr.cast"(%arg1) : (tensor) -> !tfr.tensor +// CHECK-NEXT: %[[id0:.*]] = tfr.call @tf__risc(%[[in0]]) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[id1:.*]] = tfr.call @tf__risc(%[[in1]]) : (!tfr.tensor) -> !tfr.tensor +// CHECK-NEXT: %[[back:.*]] = "tfr.cast"(%[[id1]]) : (!tfr.tensor) -> tensor +// CHECK-NEXT: return %[[back]] : tensor +} + diff --git a/tensorflow/compiler/mlir/tfr/tests/end2end.mlir b/tensorflow/compiler/mlir/tfr/tests/end2end.mlir new file mode 100644 index 00000000000..5738020ccdb --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/tests/end2end.mlir @@ -0,0 +1,235 @@ +// RUN: tfr-opt %s -tfr-decompose -tfr-raise-to-tf -canonicalize -verify-diagnostics -split-input-file | FileCheck %s + +//=================> User models, from GraphDef <==================== + +// CHECK-LABEL: my_identity +func @my_identity(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %0 = "tf.MyIdentity"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %0 : tensor<2x3xf32> + +// CHECK-NEXT: return %arg0 : tensor<2x3xf32> +} + +// CHECK-LABEL: my_rsqrt +func @my_rsqrt(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { + %0 = "tf.MyRsqrt"(%arg0) : (tensor<2x3xf32>) -> tensor<3x2x3xf32> + return %0 : tensor<3x2x3xf32> + +// CHECK-NEXT: %[[RE:.*]] = "tf.RiscReciprocal"(%arg0) : (tensor<2x3xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[SQRT:.*]] = "tf.RiscSqrt"(%[[RE]]) : (tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[SQRT]]) {shape = #tf.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> +} + +// CHECK-LABEL: 
my_leaky_relu +func @my_leaky_relu(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { + %0 = "tf.MyLeakyRelu"(%arg0) {alpha=3.0 : f32} : (tensor<2x3xf32>) -> tensor<3x2x3xf32> + return %0 : tensor<3x2x3xf32> + +// CHECK-NEXT: %[[ALPHA:.*]] = "tf.Const"() {value = dense<3.000000e+00> : tensor} : () -> tensor +// CHECK-NEXT: %[[SHAPE:.*]] = "tf.RiscShape"(%arg0) {T = i32} : (tensor<2x3xf32>) -> tensor<*xi32> +// CHECK-NEXT: %[[ALPHA1:.*]] = "tf.RiscBroadcast"(%[[ALPHA]], %[[SHAPE]]) : (tensor, tensor<*xi32>) -> tensor<*xf32> +// CHECK-NEXT: %[[MAX:.*]] = "tf.RiscMaximum"(%arg0, %[[ALPHA1]]) : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[MAX]]) {shape = #tf.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> +} + +// CHECK-LABEL: my_leaky_relu_with_default +func @my_leaky_relu_with_default(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { + %0 = "tf.MyLeakyRelu"(%arg0) : (tensor<2x3xf32>) -> tensor<3x2x3xf32> + return %0 : tensor<3x2x3xf32> + +// CHECK-NEXT: %[[ALPHA:.*]] = "tf.Const"() {value = dense<2.000000e-01> : tensor} : () -> tensor +// CHECK-NEXT: %[[SHAPE:.*]] = "tf.RiscShape"(%arg0) {T = i32} : (tensor<2x3xf32>) -> tensor<*xi32> +// CHECK-NEXT: %[[ALPHA1:.*]] = "tf.RiscBroadcast"(%[[ALPHA]], %[[SHAPE]]) : (tensor, tensor<*xi32>) -> tensor<*xf32> +// CHECK-NEXT: %[[MAX:.*]] = "tf.RiscMaximum"(%arg0, %[[ALPHA1]]) : (tensor<2x3xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[MAX]]) {shape = #tf.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> +} + +// CHECK-LABEL: my_cast +func @my_cast(%arg0: tensor<2x3xf32>) -> tensor<2x3xi32> { + %0 = "tf.MyCast"(%arg0) {Tout=i32} : (tensor<2x3xf32>) -> tensor<2x3xi32> + return %0 : tensor<2x3xi32> + +// CHECK-NEXT: %[[CAST:.*]] = "tf.RiscCast"(%arg0) {Tout = i32} : (tensor<2x3xf32>) -> tensor<*xi32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[CAST]]) {shape = #tf.shape<2x3>} : (tensor<*xi32>) -> tensor<2x3xi32> +// CHECK-NEXT: return %[[ES]] : tensor<2x3xi32> +} + +// CHECK-LABEL: my_pack_single_input +func @my_pack_single_input(%arg0: tensor<2x3xf32>) -> tensor<3x2x3xf32> { + %0 = "tf.MyPack"(%arg0) {N=1:i32, axis=0:i32} : (tensor<2x3xf32>) -> tensor<3x2x3xf32> + return %0 : tensor<3x2x3xf32> + +// CHECK-NEXT: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-NEXT: %[[ED:.*]] = "tf.ExpandDims"(%arg0, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[ED]]) {shape = #tf.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> +} + +// CHECK-LABEL: my_pack_multiple_inputs +func @my_pack_multiple_inputs(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xf32>) -> tensor<3x2x3xf32> { + %0 = "tf.MyPack"(%arg0, %arg1, %arg2) {N=3:i32, axis=0:i32} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<3x2x3xf32> + return %0 : tensor<3x2x3xf32> + +// CHECK-NEXT: %[[AXIS:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK-NEXT: %[[ED0:.*]] = "tf.ExpandDims"(%arg0, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> +// CHECK-NEXT: %[[ED1:.*]] = "tf.ExpandDims"(%arg1, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> +// CHECK-NEXT: %[[CC0:.*]] = "tf.RiscConcat"(%[[ED0]], %[[ED1]]) {axis = 0 : i32} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ED2:.*]] = 
"tf.ExpandDims"(%arg2, %[[AXIS]]) : (tensor<2x3xf32>, tensor) -> tensor<*xf32> +// CHECK-NEXT: %[[CC1:.*]] = "tf.RiscConcat"(%[[CC0]], %[[ED2]]) {axis = 0 : i32} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[CC1]]) {shape = #tf.shape<3x2x3>} : (tensor<*xf32>) -> tensor<3x2x3xf32> +// CHECK-NEXT: return %[[ES]] : tensor<3x2x3xf32> +} + +// CHECK-LABEL: my_add_n_single_input +func @my_add_n_single_input(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { + %0 = "tf.MyAddN"(%arg0) {N=1:i32} : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %0 : tensor<2x3xf32> + +// CHECK-NEXT: return %arg0 : tensor<2x3xf32> +} + +// CHECK-LABEL: my_add_n_multiple_inputs +func @my_add_n_multiple_inputs(%arg0: tensor<2x3xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<2x3xf32>) -> tensor<2x3xf32> { + %0 = "tf.MyAddN"(%arg0, %arg1, %arg2) {N=3:i32} : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32> + return %0 : tensor<2x3xf32> + +// CHECK-NEXT: %[[ADD0:.*]] = "tf.RiscAdd"(%arg0, %arg1) : (tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ADD1:.*]] = "tf.RiscAdd"(%[[ADD0]], %arg2) : (tensor<*xf32>, tensor<2x3xf32>) -> tensor<*xf32> +// CHECK-NEXT: %[[ES:.*]] = "tf.EnsureShape"(%[[ADD1]]) {shape = #tf.shape<2x3>} : (tensor<*xf32>) -> tensor<2x3xf32> +// CHECK-NEXT: return %[[ES]] : tensor<2x3xf32> +} + +// CHECK-LABEL: my_map_and_batch_dataset +func @my_map_and_batch_dataset(%input: tensor<*x!tf.variant>, + %other1: tensor<*xf32>, + %other2: tensor<*xi32>) -> tensor<*x!tf.variant> { + %0 = "tf.MyMapAndBatchDataset"(%input, %other1, %other2) + {batch_size=1000 : i64, num_parallel_calls = 8 : i64, drop_remainder = 0 : i1, + func = @"__some_func", output_types = [f32], output_shapes = [#tf.shape<>], preserve_cardinality = true} + : (tensor<*x!tf.variant>, tensor<*xf32>, tensor<*xi32>) -> tensor<*x!tf.variant> + return %0 : tensor<*x!tf.variant> + +// CHECK-NEXT: %[[BATCH:.*]] = "tf.Const"() {value = dense<1000> : tensor} : () -> tensor +// CHECK-NEXT: %[[PARAL:.*]] = "tf.Const"() {value = dense<8> : tensor} : () -> tensor +// CHECK-NEXT: %[[KEEP:.*]] = "tf.Const"() {value = dense : tensor} : () -> tensor +// CHECK-NEXT: %[[CAST:.*]] = "tf.Cast"(%arg2) {Truncate = false} : (tensor<*xi32>) -> tensor<*xf32> +// CHECK-NEXT: %[[RET:.*]] = "tf.MapAndBatchDatasetV0"(%arg0, %[[BATCH]], %[[PARAL]], %[[KEEP]], %arg1, %[[CAST]]) +// CHECK-SAME: {f = @__some_func, output_shapes = [#tf.shape<>], output_types = [f32], preserve_cardinality = true} : (tensor<*x!tf.variant>, tensor, tensor, tensor, tensor<*xf32>, tensor<*xf32>) -> tensor<*x!tf.variant> +// CHECK-NEXT: return %[[RET]] : tensor<*x!tf.variant> +} + +//=================> decomposition functions, translated from tf.compose api <==================== +tfr.func @tf__my_identity(%value: !tfr.tensor) -> !tfr.tensor { + tfr.return %value : !tfr.tensor +} + +tfr.func @tf__my_cast(%value: !tfr.tensor, %tout: !tfr.attr{tfr.name="Tout"}) -> !tfr.tensor { + %0 = tfr.call @tf__risc_cast(%value, %tout) : (!tfr.tensor, !tfr.attr) -> !tfr.tensor + tfr.return %0 : !tfr.tensor +} + +tfr.func @tf__my_rsqrt(%value: !tfr.tensor) -> !tfr.tensor { + %1 = tfr.call @tf__risc_reciprocal(%value) : (!tfr.tensor) -> !tfr.tensor + %2 = tfr.call @tf__risc_sqrt(%1) : (!tfr.tensor) -> !tfr.tensor + tfr.return %2 : !tfr.tensor +} + +tfr.func @tf__my_leaky_relu(%value: !tfr.tensor, %alpha: f32 {tfr.name="alpha", tfr.default=0.2:f32}) -> !tfr.tensor { + %1 = tfr.call @tf__risc_shape(%value) : (!tfr.tensor) -> !tfr.tensor + 
%2 = "tfr.constant_tensor"(%alpha) : (f32) -> tensor + %t = "tfr.cast"(%2) : (tensor) -> !tfr.tensor + %3 = tfr.call @tf__risc_broadcast(%t, %1) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor + %4 = tfr.call @tf__risc_maximum(%value, %3) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor + tfr.return %4 : !tfr.tensor +} + +// TODO(fengliuai): use shape dialect to manipulate the shape then this can be decomposed further. +tfr.func @tf__my_expand_dims(%value: !tfr.tensor, %axis: i32 {tfr.name="axis"}) -> !tfr.tensor { + %axis_cst = "tfr.constant_tensor"(%axis) : (i32) -> tensor + %dim = "tfr.cast"(%axis_cst) : (tensor) -> !tfr.tensor + %0 = tfr.call @tf__expand_dims(%value, %dim) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor + tfr.return %0 : !tfr.tensor +} + +tfr.func @tf__my_pack(%values: !tfr.tensor_list, + %n: i32 {tfr.name="N"}, + %axis: i32 {tfr.name="axis"}) -> !tfr.tensor { + %index = constant 0 : index + %cst = constant 1 : i32 + %eq = cmpi "eq", %n, %cst : i32 + %v1 = tfr.get_element %values[%index] : (!tfr.tensor_list, index) -> !tfr.tensor + %temp = tfr.call @tf__my_expand_dims(%v1, %axis) : (!tfr.tensor, i32) -> !tfr.tensor + %res = scf.if %eq -> !tfr.tensor { + scf.yield %temp : !tfr.tensor + } else { + %step = index_cast %cst : i32 to index + %end = index_cast %n : i32 to index + %reduce = scf.for %i = %step to %end step %step iter_args(%reduce_iter=%temp) -> !tfr.tensor { + %v = tfr.get_element %values[%i] : (!tfr.tensor_list, index) -> !tfr.tensor + %temp1 = tfr.call @tf__my_expand_dims(%v, %axis) : (!tfr.tensor, i32) -> !tfr.tensor + %reduce_next = tfr.call @tf__risc_concat(%reduce_iter, %temp1, %axis) : (!tfr.tensor, !tfr.tensor, i32) -> !tfr.tensor + scf.yield %reduce_next : !tfr.tensor + } + scf.yield %reduce : !tfr.tensor + } + tfr.return %res : !tfr.tensor +} + +tfr.func @tf__my_add_n(%values: !tfr.tensor_list, + %n: i32 {tfr.name="N"}) -> !tfr.tensor { + %index = constant 0 : index + %cst = constant 1 : i32 + %eq = cmpi "eq", %n, %cst : i32 + %v1 = tfr.get_element %values[%index] : (!tfr.tensor_list, index) -> !tfr.tensor + %res = scf.if %eq -> !tfr.tensor { + scf.yield %v1 : !tfr.tensor + } else { + %step = index_cast %cst : i32 to index + %end = index_cast %n : i32 to index + %reduce = scf.for %i = %step to %end step %step iter_args(%reduce_iter=%v1) -> !tfr.tensor { + %v = tfr.get_element %values[%i] : (!tfr.tensor_list, index) -> !tfr.tensor + %reduce_next = tfr.call @tf__risc_add(%reduce_iter, %v) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor + scf.yield %reduce_next : !tfr.tensor + } + scf.yield %reduce : !tfr.tensor + } + tfr.return %res : !tfr.tensor +} + +tfr.func @tf__my_map_and_batch_dataset( + %input_dataset: !tfr.tensor, + %other_arguments: !tfr.tensor_list, + %batch_size: i64 {tfr.name="batch_size"}, + %num_parallel_calls: i64 {tfr.name="num_parallel_calls"}, + %drop_remainder: i1 {tfr.name="drop_remainder"}, + %f: !tfr.attr {tfr.name="func"}, + %output_types: !tfr.attr {tfr.name="output_types"}, + %output_shapes: !tfr.attr {tfr.name="output_shapes"}, + %preserve_cardinality: i1 {tfr.name="preserve_cardinality", tfr.default=false}) -> !tfr.tensor { + %batch = "tfr.constant_tensor"(%batch_size) : (i64) -> tensor + %batch1 = "tfr.cast"(%batch) : (tensor) -> !tfr.tensor + %calls = "tfr.constant_tensor"(%num_parallel_calls) : (i64) -> tensor + %calls1 = "tfr.cast"(%calls) : (tensor) -> !tfr.tensor + %drop = "tfr.constant_tensor"(%drop_remainder) : (i1) -> tensor + %drop1 = "tfr.cast"(%drop) : (tensor) -> !tfr.tensor + %ret = tfr.call 
@tf__map_and_batch_dataset_v0(%input_dataset, %batch1, %calls1, %drop1, %other_arguments, %f, %output_types, %output_shapes, %preserve_cardinality) + : (!tfr.tensor, !tfr.tensor, !tfr.tensor, !tfr.tensor, !tfr.tensor_list, !tfr.attr, !tfr.attr, !tfr.attr, i1) -> !tfr.tensor + tfr.return %ret : !tfr.tensor +} + +//=================> signatures of the primitive ops with kernels, modeled as external TFR function <== +tfr.func @tf__risc_cast_(!tfr.tensor, !tfr.attr{tfr.name="Tout"}) -> !tfr.tensor attributes{Tout} +tfr.func @tf__risc_add_(!tfr.tensor, !tfr.tensor) -> !tfr.tensor attributes{T} +tfr.func @tf__risc_concat_(!tfr.tensor, !tfr.tensor, i32{tfr.name="axis"}) -> !tfr.tensor attributes{T} +tfr.func @tf__risc_broadcast_(!tfr.tensor, !tfr.tensor) -> !tfr.tensor attributes{T, Tidx} +tfr.func @tf__risc_reciprocal_(!tfr.tensor) -> !tfr.tensor attributes{T} +tfr.func @tf__risc_sqrt_(!tfr.tensor) -> !tfr.tensor attributes{T} +tfr.func @tf__risc_shape_(!tfr.tensor, !tfr.attr{tfr.name="T", tfr.default=i32}) -> !tfr.tensor attributes{T} +tfr.func @tf__risc_maximum_(!tfr.tensor, !tfr.tensor) -> !tfr.tensor attributes{T} +tfr.func @tf__expand_dims_(!tfr.tensor, !tfr.tensor) -> !tfr.tensor attributes{T, Tdim} +tfr.func @tf__map_and_batch_dataset_v0_(!tfr.tensor, !tfr.tensor, !tfr.tensor, !tfr.tensor, !tfr.tensor_list, + !tfr.attr{tfr.name="f"}, !tfr.attr{tfr.name="output_types"}, !tfr.attr{tfr.name="output_shapes"}, i1{tfr.name="preserve_cardinality"}) + -> !tfr.tensor attributes{T, Targuments} diff --git a/tensorflow/compiler/mlir/tfr/tests/ops.mlir b/tensorflow/compiler/mlir/tfr/tests/ops.mlir new file mode 100644 index 00000000000..b074985c591 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/tests/ops.mlir @@ -0,0 +1,381 @@ +// RUN: tfr-opt %s -verify-diagnostics -split-input-file | tfr-opt | FileCheck %s +// RUN: tfr-opt %s -canonicalize -verify-diagnostics -split-input-file | FileCheck %s -check-prefix=CANON + +// Tests for types, ops with custom constraints, verifiers, printer or parser +// methods. 
+ +// CHECK-LABEL: tensor_type_noconstraint +func @tensor_type_noconstraint() -> !tfr.tensor + +// ----- + +// CHECK-LABEL: tensor_type +func @tensor_type() -> !tfr.tensor + +// ----- + +// CHECK-LABEL: tensor_list_type_noconstraint +func @tensor_list_type_noconstraint() -> !tfr.tensor_list + +// ----- + +// CHECK-LABEL: tensor_list_type_array_like +func @tensor_list_type_array_like() -> !tfr.tensor_list<[N, T]> + +// ----- + +// CHECK-LABEL: tensor_list_type_tuple_like +func @tensor_list_type_tuple_like() -> !tfr.tensor_list + +// ----- + +// expected-error@+1 {{unbalanced '>' character in pretty dialect name}} +func @tensor_invalid_1() -> !tfr.tensor<[N, T> + +// ----- + +// expected-error@+1 {{unexpected nul or EOF in pretty dialect name}} +func @tensor_invalid_2() -> !tfr.tensor<[N, T] + +// ----- + +// CHECK-LABEL: call_op +func @call_op(%arg0: !tfr.tensor, %arg1: !tfr.tensor_list, %arg2: i32) -> !tfr.tensor { + %0 = tfr.call @Foo(%arg0, %arg1, %arg2) : (!tfr.tensor, !tfr.tensor_list, i32) -> !tfr.tensor + return %0 : !tfr.tensor +} + +// ----- + +// CHECK-LABEL: call_op_arg_attr(%arg0: i32) -> !tfr.tensor +func @call_op_arg_attr(%arg0: i32) -> !tfr.tensor { + %0 = tfr.call @Bar(%arg0) : (i32) -> !tfr.tensor + return %0 : !tfr.tensor +} + +// ----- + +func @call_op_invalid_1(%arg0: tensor) -> !tfr.tensor { + // expected-error@+1 {{got 'tensor'}} + %0 = tfr.call @Huu(%arg0) : (tensor) -> !tfr.tensor + return %0 : !tfr.tensor +} + +// ----- + +// CHECK-LABEL: get_shape +func @get_shape(%arg0: !tfr.tensor) -> (!shape.shape, !shape.shape) { + %0 = tfr.get_shape %arg0 -> !shape.shape + %1 = "tfr.get_shape"(%arg0) : (!tfr.tensor) -> !shape.shape + return %0, %1 : !shape.shape, !shape.shape +} + +// ----- + +// CHECK-LABEL: get_real_shape +// CANON-LABEL: get_real_shape +func @get_real_shape(%arg0: tensor<1x2xf32>) -> tensor<1xindex> { + %0 = "tfr.cast"(%arg0) : (tensor<1x2xf32>) -> !tfr.tensor + %1 = tfr.get_shape %0 -> !shape.shape + %2 = shape.to_extent_tensor %1 : !shape.shape -> tensor<1xindex> + return %2 : tensor<1xindex> + +// CANON-NEXT: %[[s:.*]] = shape.const_shape [1, 2] : tensor +// CANON-NEXT: %[[e:.*]] = shape.to_extent_tensor %[[s]] : tensor -> tensor<1xindex> +// CANON-NEXT: return %[[e]] : tensor<1xindex> +} + +// ----- + +func @get_element_type(%arg0: !tfr.tensor) -> (!tfr.attr, !tfr.attr) { + %0 = tfr.get_element_type %arg0 -> !tfr.attr + %1 = "tfr.get_element_type"(%arg0) : (!tfr.tensor) -> !tfr.attr + return %0, %1 : !tfr.attr, !tfr.attr +} + +// ----- + +// CHECK-LABEL: from_tf_tensor +func @from_tf_tensor(%arg0: tensor) -> !tfr.tensor { + %0 = "tfr.cast"(%arg0) : (tensor) -> !tfr.tensor + return %0 : !tfr.tensor +} + +// ----- + +// CHECK-LABEL: to_tf_tensor +func @to_tf_tensor(%arg0: !tfr.tensor) -> tensor { + %0 = "tfr.cast"(%arg0) : (!tfr.tensor) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: constant +func @constant() -> (!tfr.attr, !tfr.attr, !tfr.attr, !tfr.attr) { + %0 = tfr.constant f32 -> !tfr.attr + %1 = tfr.constant [f32, i32] -> !tfr.attr + %2 = "tfr.constant"() {value = f32} : () -> !tfr.attr + %3 = "tfr.constant"() {value = [f32, i32]} : () -> !tfr.attr + return %0, %1, %2, %3 : !tfr.attr, !tfr.attr, !tfr.attr, !tfr.attr +} + +// ----- + +// CHECK-LABEL: equal +// CANON-LABEL: equal +func @equal() -> (i1, i1, i1, i1) { + %0 = tfr.constant f32 -> !tfr.attr + %1 = tfr.constant f32 -> !tfr.attr + %2 = tfr.constant i32 -> !tfr.attr + %same_type = tfr.equal %0,%1 -> i1 + %diff_type = tfr.equal %0,%2 -> i1 + + %3 = tfr.constant "hello" -> 
!tfr.attr + %4 = tfr.constant "hello" -> !tfr.attr + %5 = tfr.constant "how are you" -> !tfr.attr + %same_str = tfr.equal %3,%4 -> i1 + %diff_str = tfr.equal %3,%5 -> i1 + return %same_type, %diff_type, %same_str, %diff_str : i1, i1, i1, i1 + +// CANON-NEXT: %true = constant true +// CANON-NEXT: %false = constant false +// CANON-NEXT: return %true, %false, %true, %false : i1, i1, i1, i1 +} + +// ----- + +// CHECK-LABEL: constant_tensor_scalar +func @constant_tensor_scalar(%arg0: i32) -> tensor { + %0 = "tfr.constant_tensor"(%arg0) : (i32) -> tensor + return %0 : tensor +} + +// ----- + +// CHECK-LABEL: constant_tensor_vector +func @constant_tensor_vector(%arg0: vector<1x2xi32>) -> tensor<1x2xi32> { + %0 = "tfr.constant_tensor"(%arg0) : (vector<1x2xi32>) -> tensor<1x2xi32> + return %0 : tensor<1x2xi32> +} + +// ----- + +// CHECK-LABEL: constant_tensor_array +// CANON-LABEL: constant_tensor_array +func @constant_tensor_array() -> !tfr.tensor { + %0 = tfr.constant [1, -1, 3] -> !tfr.attr + %1 = "tfr.constant_tensor"(%0) : (!tfr.attr) -> !tfr.tensor + return %1 : !tfr.tensor + +// CANON-NEXT: "tf.Const"() {value = dense<[1, -1, 3]> : tensor<3xi64>} : () -> tensor<3xi64> +// CANON-NEXT: "tfr.cast"(%0) : (tensor<3xi64>) -> !tfr.tensor +// CANON-NEXT: return +} + +// ----- + +// CHECK-LABEL: constant_tensor_scalar +// CANON-LABEL: constant_tensor_scalar +func @constant_tensor_scalar() -> !tfr.tensor { + %0 = "std.constant"() {value = 42 : i32} : () -> i32 + %1 = "tfr.constant_tensor"(%0) : (i32) -> !tfr.tensor + return %1 : !tfr.tensor + +// CANON-NEXT: "tf.Const"() {value = dense<42> : tensor} : () -> tensor +// CANON-NEXT: "tfr.cast"(%0) : (tensor) -> !tfr.tensor +// CANON-NEXT: return +} + +// ----- + +func @constant_tensor_invalid_0(%arg0: i32) -> tensor { + // expected-error@+1 {{input and output should have the same scalar types.}} + %0 = "tfr.constant_tensor"(%arg0) : (i32) -> tensor + return %0 : tensor +} + +// ----- + +func @constant_tensor_invalid_1(%arg0: vector<1xi32>) -> tensor { + // expected-error@+1 {{output type should be static and ranked}} + %0 = "tfr.constant_tensor"(%arg0) : (vector<1xi32>) -> tensor + return %0 : tensor +} + +// ----- + +func @constant_tensor_invalid_2(%arg0: vector<1xi32>) -> tensor<1xf32> { + // expected-error@+1 {{input and output should have same shape and element type}} + %0 = "tfr.constant_tensor"(%arg0) : (vector<1xi32>) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + +// ----- + +func @constant_tensor_invalid_3(%arg0: vector<1xi32>) -> tensor<1x1xi32> { + // expected-error@+1 {{input and output should have same shape and element type}} + %0 = "tfr.constant_tensor"(%arg0) : (vector<1xi32>) -> tensor<1x1xi32> + return %0 : tensor<1x1xi32> +} + +// ----- + +func @constant_tensor_invalid_4(%arg0: i32) -> tensor<1x1xi32> { + // expected-error@+1 {{input can not be converted to an output tensor}} + %0 = "tfr.constant_tensor"(%arg0) : (i32) -> tensor<1x1xi32> + return %0 : tensor<1x1xi32> +} + +// ----- + +// CHECK-LABEL: get_element +func @get_element(%arg0: !tfr.tensor_list) -> !tfr.tensor { + %cst = "std.constant"() {value = 1 : index} : () -> index + %0 = tfr.get_element %arg0[%cst] : (!tfr.tensor_list, index) -> !tfr.tensor + return %0 : !tfr.tensor +} + +// ----- + +// CHECK-LABEL: build_list +func @build_list(%arg0: !tfr.tensor, %arg1: !tfr.tensor) -> !tfr.tensor_list { + %0 = "tfr.build_list"(%arg0, %arg1) : (!tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + return %0 : !tfr.tensor_list +} + +// ----- + +// CHECK-LABEL: build_const_list +// 
CANON-LABEL: build_const_list +func @build_const_list() -> !tfr.attr { + %0 = "std.constant"() {value = 42 : i32} : () -> i32 + %1 = "std.constant"() {value = 41 : i32} : () -> i32 + %2 = "tfr.build_list"(%0, %1) : (i32, i32) -> !tfr.attr + return %2 : !tfr.attr + +// CANON-NEXT: %[[c:.*]] = tfr.constant [42 : i32, 41 : i32] -> !tfr.attr +// CANON-NEXT: return %[[c]] : !tfr.attr +} + +// ----- + +// CHECK-LABEL: tfr.func +tfr.func @External(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: !tfr.attr {tfr.name = "T"}) + -> (!tfr.tensor, !tfr.tensor_list) + attributes {A, C} + +// ----- + +// CHECK-LABEL: tfr.func +tfr.func @Foo(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32> {tfr.name = "C"}) + -> (!tfr.tensor, !tfr.tensor_list) + attributes {A, C} { + tfr.return %arg0, %arg1 : !tfr.tensor, !tfr.tensor_list +} + +// ----- + +// CHECK-LABEL: tfr.func +tfr.func @Bar(%arg0: !tfr.tensor, + %arg2: i32 {tfr.name = "B"}, + %arg3: vector<1xi32> {tfr.name = "C"}) + -> (!tfr.tensor, !tfr.tensor) + attributes {A} { + tfr.return %arg0, %arg0 : !tfr.tensor, !tfr.tensor +} + +// ----- + +// expected-error@+1 {{Undefined attributes are used: A}} +tfr.func @Foo_undefined_attr(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32> {tfr.name = "C"}) -> + (!tfr.tensor, !tfr.tensor_list) { + tfr.return %arg0, %arg1 : !tfr.tensor, !tfr.tensor_list +} + +// ----- + +// expected-error@+1 {{3 attribute argument doesn't have a tfr.name attribute}} +tfr.func @Foo_unnamed_attr(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32>) -> + (!tfr.tensor, !tfr.tensor_list) { + tfr.return %arg0, %arg1 : !tfr.tensor, !tfr.tensor_list +} + +// ----- + +// expected-error@+1 {{tfr.tensor_list argument should be before non tensor arguments}} +tfr.func @Foo_invalid_arg_order(%arg0: !tfr.tensor, + %arg2: i32 {tfr.name = "A"}, + %arg1: !tfr.tensor_list, + %arg3: vector<1xi32> {tfr.name = "C"}) -> + (!tfr.tensor, !tfr.tensor_list) { + tfr.return %arg0, %arg1 : !tfr.tensor, !tfr.tensor_list +} + +// ----- + +// expected-error@+1 {{tfr.tensor argument should be before tfr.tensor_list argument.}} +tfr.func @Foo_invalid_arg_order0( + %arg1: !tfr.tensor_list, + %arg0: !tfr.tensor, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32> {tfr.name = "C"}) -> + (!tfr.tensor, !tfr.tensor_list) { + tfr.return %arg0, %arg1 : !tfr.tensor, !tfr.tensor_list +} + +// ----- + +// expected-error@+1 {{tfr.tensor result should be before tfr.tensor_list result}} +tfr.func @Foo_invalid_result_order(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32> {tfr.name = "C"}) -> + (!tfr.tensor_list, !tfr.tensor) { + tfr.return %arg1, %arg0 : !tfr.tensor_list, !tfr.tensor +} + +// ----- + +// expected-error@+1 {{More than one tfr.tensor_list argument isn't allowed}} +tfr.func @Foo_multiple_tensor_list_args(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: !tfr.tensor_list, + %arg3: i32 {tfr.name = "A"}, + %arg4: vector<1xi32> {tfr.name = "C"}) -> + (!tfr.tensor, !tfr.tensor_list) { + tfr.return %arg0, %arg1 : !tfr.tensor, !tfr.tensor_list +} + +// ----- + +// expected-error@+1 {{More than one tfr.tensor_list result isn't allowed}} +tfr.func @Foo_multiple_tensor_list_results(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32> {tfr.name = "C"}) -> + (!tfr.tensor_list, !tfr.tensor_list) 
{ + tfr.return %arg1, %arg1 : !tfr.tensor_list, !tfr.tensor_list +} + +// ----- + +// expected-error@+1 {{None tfr.tensor/tfr.tensor_list results aren't allowed as a result}} +tfr.func @Foo_return_attr(%arg0: !tfr.tensor, + %arg1: !tfr.tensor_list, + %arg2: i32 {tfr.name = "A"}, + %arg3: vector<1xi32> {tfr.name = "C"}) -> i32 { + tfr.return %arg2 : i32 +} diff --git a/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir b/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir new file mode 100644 index 00000000000..41d0ee6271d --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/tests/raise_to_tf.mlir @@ -0,0 +1,76 @@ +// RUN: tfr-opt %s -tfr-raise-to-tf -verify-diagnostics -split-input-file | FileCheck %s + +tfr.func @tf__risc_same_(!tfr.tensor) -> !tfr.tensor attributes {T} +tfr.func @tf__risc_concat_(!tfr.tensor_list) -> !tfr.tensor attributes {T, N} +tfr.func @tf__risc_split_(!tfr.tensor, i32 {tfr.name="N"}) -> !tfr.tensor_list attributes {T, N} +tfr.func @tf__risc_cast_(!tfr.tensor, !tfr.attr {tfr.name="K"}) -> !tfr.tensor attributes {T, K} + +// CHECK-LABEL: decompose_tf_same +func @decompose_tf_same(%arg0: tensor<1x2x3x4x!tf.string>) -> tensor<1x2x3x4x!tf.string> { + %0 = "tfr.cast"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> !tfr.tensor + %1 = tfr.call @tf__risc_same(%0) : (!tfr.tensor) -> !tfr.tensor + %2 = "tfr.cast"(%1) : (!tfr.tensor) -> tensor<1x2x3x4x!tf.string> + return %2 : tensor<1x2x3x4x!tf.string> + +// CHECK: %[[id:.*]] = "tf.RiscSame"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> tensor<*x!tf.string> +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[id]]) {shape = #tf.shape<1x2x3x4>} : (tensor<*x!tf.string>) -> tensor<1x2x3x4x!tf.string> +// CHECK: return %[[es]] : tensor<1x2x3x4x!tf.string> +} + +// CHECK-LABEL: decompose_tf_consecutive +func @decompose_tf_consecutive(%arg0: tensor<1x2x3x4x!tf.string>, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tfr.cast"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> !tfr.tensor + %1 = "tfr.cast"(%arg2) : (tensor) -> !tfr.tensor + %2 = tfr.call @tf__risc_same(%0) : (!tfr.tensor) -> !tfr.tensor + %3 = tfr.call @tf__risc_same(%1) : (!tfr.tensor) -> !tfr.tensor + %4 = "tfr.cast"(%3) : (!tfr.tensor) -> tensor + return %4 : tensor + +// CHECK: %[[id0:.*]] = "tf.RiscSame"(%arg0) : (tensor<1x2x3x4x!tf.string>) -> tensor<*x!tf.string> +// CHECK: %[[id2:.*]] = "tf.RiscSame"(%arg2) : (tensor) -> tensor<*xf32> +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[id2]]) {shape = #tf.shape<>} : (tensor<*xf32>) -> tensor +// CHECK: return %[[es]] : tensor +} + +// CHECK-LABEL: decompose_tf_concat_n +func @decompose_tf_concat_n(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor<3xf32> { + %0 = "tfr.cast"(%arg0) : (tensor) -> !tfr.tensor + %1 = "tfr.cast"(%arg1) : (tensor) -> !tfr.tensor + %2 = "tfr.cast"(%arg2) : (tensor) -> !tfr.tensor + %3 = "tfr.build_list"(%0, %1, %2) : (!tfr.tensor, !tfr.tensor, !tfr.tensor) -> !tfr.tensor_list + %concat = tfr.call @tf__risc_concat(%3) : (!tfr.tensor_list) -> !tfr.tensor + %4 = "tfr.cast"(%concat) : (!tfr.tensor) -> tensor<3xf32> + return %4 : tensor<3xf32> + +// CHECK: %[[concat:.*]] = "tf.RiscConcat"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor<*xf32> +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[concat]]) {shape = #tf.shape<3>} : (tensor<*xf32>) -> tensor<3xf32> +// CHECK: return %[[es]] : tensor<3xf32> +} + +// CHECK-LABEL: decompose_tf_split +func @decompose_tf_split(%arg0: tensor<3xf32>) -> (tensor) { + %0 = "tfr.cast"(%arg0) : (tensor<3xf32>) -> !tfr.tensor + %n = std.constant 3: i32 + %split = tfr.call 
@tf__risc_split(%0, %n) : (!tfr.tensor, i32) -> !tfr.tensor_list + %i0 = std.constant 0: index + %s0 = tfr.get_element %split[%i0] : (!tfr.tensor_list, index) -> !tfr.tensor + %4 = "tfr.cast"(%s0) : (!tfr.tensor) -> tensor + return %4 : tensor + +// CHECK: %[[split:.*]]:3 = "tf.RiscSplit"(%arg0) {N = 3 : i32} : (tensor<3xf32>) -> (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[split]]#0) {shape = #tf.shape<>} : (tensor<*xf32>) -> tensor +// CHECK: return %[[es]] : tensor +} + +// CHECK-LABEL: decompose_tf_cast +func @decompose_tf_cast(%arg0: tensor) -> tensor { + %0 = "tfr.cast"(%arg0) : (tensor) -> !tfr.tensor + %t = tfr.constant i32 -> !tfr.attr + %concat = tfr.call @tf__risc_cast(%0, %t) : (!tfr.tensor, !tfr.attr) -> !tfr.tensor + %4 = "tfr.cast"(%concat) : (!tfr.tensor) -> tensor + return %4 : tensor + +// CHECK: %[[tfcast:.*]] = "tf.RiscCast"(%arg0) {K = i32} : (tensor) -> tensor<*xi32> +// CHECK: %[[es:.*]] = "tf.EnsureShape"(%[[tfcast]]) {shape = #tf.shape<>} : (tensor<*xi32>) -> tensor +// CHECK: return %[[es]] : tensor +} diff --git a/tensorflow/compiler/mlir/tfr/utils/utils.cc b/tensorflow/compiler/mlir/tfr/utils/utils.cc new file mode 100644 index 00000000000..6c08b682cb0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/utils/utils.cc @@ -0,0 +1,78 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfr/utils/utils.h" + +#include "llvm/ADT/StringRef.h" +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +std::string GetComposeFuncName(StringRef tf_op_name) { + std::string compose_func_name; + for (int i = 0; i < tf_op_name.size(); ++i) { + if (tf_op_name[i] == '_') { + // The field name must not contain "_"s. "_Arg" and "_RetVal" are special + // op names and we can return empty string to skip the decomposition. + return {}; + } + if (tf_op_name[i] == '.') { + compose_func_name.push_back('_'); + } else if (tf_op_name[i] >= 'A' && tf_op_name[i] <= 'Z') { + compose_func_name.push_back('_'); + compose_func_name.push_back(tf_op_name[i] + 'a' - 'A'); + } else { + compose_func_name.push_back(tf_op_name[i]); + } + } + return compose_func_name; +} + +std::string GetTFOpName(StringRef compose_func_name) { + std::string tf_op_name; + bool after_underscore = false; + for (int i = 0; i < compose_func_name.size(); ++i) { + if (compose_func_name[i] >= 'A' && compose_func_name[i] <= 'Z') { + // The field name must not contain uppercase letters. + return {}; + } + if (after_underscore) { + if (compose_func_name[i] >= 'a' && compose_func_name[i] <= 'z') { + tf_op_name.push_back(compose_func_name[i] + 'A' - 'a'); + after_underscore = false; + } else { + // The character after a "_" must be a lowercase letter. 
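+        // e.g. "tf__foo_1" cannot be mapped back because '1' follows '_'.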
+ return {}; + } + } else if (compose_func_name[i] == '_') { // first time visit '_' + if (i + 1 < compose_func_name.size() && compose_func_name[i + 1] == '_') { + tf_op_name.push_back('.'); + i++; + } + after_underscore = true; + } else { + tf_op_name.push_back(compose_func_name[i]); + } + } + if (after_underscore) { + // Trailing "_". + return {}; + } + return tf_op_name; +} + +} // namespace TFR +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfr/utils/utils.h b/tensorflow/compiler/mlir/tfr/utils/utils.h new file mode 100644 index 00000000000..26c7250d95a --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/utils/utils.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_UTILS_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_UTILS_UTILS_H_ + +#include + +#include "mlir/Support/LLVM.h" // from @llvm-project + +namespace mlir { +namespace TFR { + +// This is a hardcoded rule for mapping a TF op name to the corresponding +// TFR function name. Examples: +// tf.Pack => tf__pack +// tf.ConcatV2 => tf__concat_v2 +// TODO(fengliuai): move to an util file. +std::string GetComposeFuncName(StringRef tf_op_name); + +// This is a hardcoded rule for mapping a TFR function op name to the +// corresponding TF opname. 
Examples: +// tf__pack -> tf.Pack +// tf__concat_v2 => tf.ConcatV2 +std::string GetTFOpName(StringRef compose_func_name); + +} // namespace TFR +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_UTILS_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 3c88318064b..b4cbf765c79 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -1,5 +1,17 @@ -load("//tensorflow:tensorflow.bzl", "tf_cc_binary") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load( + "//tensorflow:tensorflow.bzl", + "get_compatible_with_cloud", + "tf_cc_binary", +) +load( + "//tensorflow/core/platform/default:cuda_build_defs.bzl", + "if_cuda_is_configured", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) package( default_visibility = [":friends"], @@ -9,61 +21,118 @@ package( package_group( name = "friends", includes = ["//third_party/mlir:subpackages"], - packages = ["//tensorflow/compiler/mlir/..."], + packages = [ + "//tensorflow/compiler/mlir/...", + "//tensorflow/core/kernels/mlir_generated/...", + ], ) cc_library( - name = "cubin_creator", - srcs = ["cubin_creator.cc"], - hdrs = ["cubin_creator.h"], - copts = if_cuda(["-DGOOGLE_CUDA=1"]), + name = "kernel_creator", + srcs = ["kernel_creator.cc"], + hdrs = ["kernel_creator.h"], + copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]), deps = [ - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", - "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:LLVMDialect", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:TargetNVVMIR", - "@llvm-project//mlir:Transforms", - "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:all_passes", + "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", + "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", + "//tensorflow/compiler/mlir/hlo:legalize_trigonometric_to_approximation", "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/mlir/hlo:lhlo_fuse_linalg", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_affine", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_gpu", + "//tensorflow/compiler/mlir/hlo:transform_unranked_hlo", # buildcleaner: keep + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", - "//tensorflow/compiler/mlir/hlo:materialize_broadcasts", # buildcleaner: keep - "//tensorflow/compiler/mlir/hlo:unfuse_batch_norm", # buildcleaner: keep "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service/gpu:stream_executor_util", "//tensorflow/compiler/xla/service/gpu:target_constants", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", "//tensorflow/compiler/xla/service/mlir_gpu:kernel_lowering", - "//tensorflow/core:cuda_libdevice_path", + 
"//tensorflow/compiler/xla/service/mlir_gpu:passes", "//tensorflow/core:lib", - ] + if_cuda(["//tensorflow/stream_executor/gpu:asm_compiler"]), + "//tensorflow/core/platform:cuda_libdevice_path", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AffineToStandard", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUToGPURuntimeTransforms", + "@llvm-project//mlir:GPUToNVVMTransforms", + "@llvm-project//mlir:GPUTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LinalgOps", + "@llvm-project//mlir:LinalgTransforms", + "@llvm-project//mlir:NVVMDialect", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToGPUPass", + "@llvm-project//mlir:SCFToStandard", + "@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], ) tf_cc_binary( - name = "tf_to_cubin", - srcs = ["tf_to_cubin.cc"], - visibility = ["//tensorflow/core/kernels/mlir_generated:__pkg__"], + name = "tf_to_gpu_binary", + srcs = ["tf_to_gpu_binary.cc"], + visibility = [ + "//tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary:__pkg__", + "//tensorflow/core/kernels/mlir_generated:__pkg__", + ], deps = [ - ":cubin_creator", + ":kernel_creator", "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Pass", + ], +) + +tf_cc_binary( + name = "tf_to_kernel", + srcs = ["tf_to_kernel.cc"], + visibility = [ + "//tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel:__pkg__", + "//tensorflow/core/kernels/mlir_generated:__pkg__", + ], + deps = [ + ":kernel_creator", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Analysis", + "@llvm-project//llvm:CodeGen", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:Support", + "@llvm-project//llvm:Target", + "@llvm-project//llvm:X86CodeGen", # fixdeps: keep + "@llvm-project//llvm:X86Disassembler", # fixdeps: keep + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:TargetLLVMIR", ], ) tf_cc_binary( name = "kernel-gen-opt", srcs = ["tools/kernel-gen-opt/kernel-gen-opt.cc"], - visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen/tests:__pkg__"], + visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen/tests:__subpackages__"], deps = [ "//tensorflow/compiler/mlir/hlo:all_passes", "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", @@ -90,3 +159,16 @@ cc_library( "@llvm-project//mlir:mlir_runner_utils", ], ) + +cc_library( + name = "tf_cuda_runtime_wrappers", + srcs = ["tf_cuda_runtime_wrappers.cc"], + compatible_with = get_compatible_with_cloud(), + copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]), + deps = [ + "//tensorflow/core/platform/default/build_config:stream_executor_cuda", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:mlir_c_runner_utils", + "@local_config_cuda//cuda:cuda_headers", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc deleted file mode 100644 index 3b6af7f699c..00000000000 --- 
a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===- cubin_creator.cc -----------------------------------------*- C++ -*-===// -// -// This file implements the function to compile a TF kernel function to a cubin. -// -//===----------------------------------------------------------------------===// -#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" - -#include -#include -#include - -#include "absl/memory/memory.h" -#include "absl/strings/escaping.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/None.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/Debug.h" -#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Target/NVVMIR.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" -#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/xla/transforms/passes.h" -#include "tensorflow/compiler/xla/debug_options_flags.h" -#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" -#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" -#include "tensorflow/compiler/xla/service/gpu/target_constants.h" -#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" -#include "tensorflow/core/platform/cuda_libdevice_path.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/path.h" -#if GOOGLE_CUDA -#include "tensorflow/stream_executor/gpu/asm_compiler.h" -#endif - -namespace { -using tensorflow::Status; -using xla::InternalError; -using xla::StatusOr; - -StatusOr GetLibdeviceDir( - const xla::HloModuleConfig& hlo_module_config) { - for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( - hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { - std::string libdevice_dir = - tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); - VLOG(2) << "Looking for libdevice at 
" << libdevice_dir; - if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { - VLOG(2) << "Found libdevice dir " << libdevice_dir; - return libdevice_dir; - } - } - return InternalError( - "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); -} - -struct MaterializeBroadcastsPass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::ConversionTarget conversionTarget(getContext()); - mlir::OwningRewritePatternList conversionPatterns; - - // Consider the mhlo dialect legal for tests. - conversionTarget.addLegalDialect(); - // The conversion uses helpers from the Standard dialect. - conversionTarget.addLegalDialect(); - - mlir::mhlo::SetupMaterializeBroadcastsLegality(&getContext(), - &conversionTarget); - mlir::mhlo::PopulateMaterializeBroadcastsPatterns(&getContext(), - &conversionPatterns); - - if (failed(applyPartialConversion(getFunction(), conversionTarget, - conversionPatterns))) { - return signalPassFailure(); - } - } -}; - -struct UnfuseBatchNormPass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::OwningRewritePatternList patterns; - mlir::mhlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); - mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); - } -}; - -Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { - mlir::PassManager pm(module.getContext()); - auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { - return VLOG_IS_ON(1); - }; - pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, - /*shouldPrintAfterPass=*/enable_if_vlog_is_on, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/false, llvm::dbgs()); - pm.addNestedPass(mlir::mhlo::createLegalizeTFPass(false)); - pm.addNestedPass( - absl::make_unique()); - pm.addNestedPass(absl::make_unique()); - pm.addPass(mlir::mhlo::createLegalizeToLhloPass( - /*results_escape_functions=*/true)); - pm.addNestedPass(mlir::lmhlo::createLhloCopyRemovalPass()); - - if (failed(pm.run(module))) { - return InternalError("Lowering TF to LHLO failed."); - } - return Status::OK(); -} - -struct PropagateTensorFlowABIKnowledge - : public mlir::PassWrapper> { - explicit PropagateTensorFlowABIKnowledge(mlir::FunctionType type, - llvm::ArrayRef same_shape_) - : func_type(type), same_shape(same_shape_) {} - - void runOnOperation() override { - // We know due to tensorflow ABI that the offset is always 0 and that the - // innermost stride is always 1. To make this visible to the compiler, - // we insert constants into the code and replace usages accordingly. - // We do not change the signature so that we keep a somewhat stable ABI - // that is easy to undertand by tools. - // We also know that tensorflow aligns all allocated pointers by 16, so - // we pass this on. Furthermore, we know that arguments never alias. More - // precicely, they may only alias (due to reuse) if the kernel does not - // read from a position it previously has written to. We express this with - // the noalias attribute. - mlir::LLVM::LLVMFuncOp func = getOperation(); - - // This only works if the function is local and we can rewrite it. - if (func.isExternal()) return; - - mlir::OpBuilder b(func.getBody()); - // Steal the LLVM representation of the index type from the third argument. 
- auto index_type = func.getArgument(3).getType(); - mlir::Value one = b.create( - func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); - mlir::Value zero = b.create( - func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); - uint32_t arg_pos = 0; - std::vector positions; - // Collect the agument and return types of the surrounding function. - auto arg_types = llvm::to_vector<4>(llvm::concat( - func_type.getInputs(), func_type.getResults())); - for (mlir::Type arg_type : arg_types) { - if (!arg_type.isa()) { - func.emitError() << "argument of surrounding func is not ranked memref"; - signalPassFailure(); - return; - } - positions.push_back(arg_pos); - // Set alignment and aliasing on the pointers. - func.setArgAttr(arg_pos + 1, "llvm.noalias", b.getBoolAttr(true)); - func.setArgAttr(arg_pos + 1, "llvm.align", b.getIndexAttr(16)); - // Replace the offset with zero. Offset is argument number 3. - func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); - // Forward over base_ptr, aligned_ptr, offset, size and stride arguments. - arg_pos += 3 + arg_type.cast().getRank() * 2; - // Replace the last stride with constant 1. - func.getArgument(arg_pos - 1).replaceAllUsesWith(one); - } - - // If we have knowledge that some arguments have the same shape, we - // can use that here. Simply replace usages of the shape parameters within - // the function body to a single shape parameter. - if (!same_shape.empty()) { - auto first = same_shape.front(); - auto first_offset = positions.at(first); - auto first_type = arg_types[first].cast(); - uint32_t rank = first_type.getRank(); - for (auto same : same_shape.drop_front(1)) { - uint32_t same_offset = positions.at(same); - auto same_type = arg_types[same].cast(); - if (same_type.getRank() != rank) { - func.emitOpError() << "same shape constraints on arguments with " - "non-matching shapes: #" - << first << " and #" << same; - signalPassFailure(); - continue; - } - - for (uint32_t i = 0; i < 2 * rank; ++i) { - // Replace uses for second arg data with first arg. - auto same_arg = func.getArgument(same_offset + 3 + i); - auto first_arg = func.getArgument(first_offset + 3 + i); - same_arg.replaceAllUsesWith(first_arg); - } - } - } - } - - mlir::FunctionType func_type; - llvm::ArrayRef same_shape; -}; - -Status PropagateTensorFlowABIKnowledgeToKernel( - mlir::ModuleOp module, llvm::ArrayRef same_shape) { - // Grab the original signature from the single function. 
- auto func = *module.getBody()->op_begin(); - - mlir::PassManager pm(module.getContext()); - auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { - return VLOG_IS_ON(1); - }; - pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, - /*shouldPrintAfterPass=*/enable_if_vlog_is_on, - /*printModuleScope=*/false, - /*printAfterOnlyOnChange=*/false, llvm::dbgs()); - auto& kernel_pm = pm.nest<::mlir::gpu::GPUModuleOp>(); - kernel_pm.addNestedPass( - absl::make_unique(func.getType(), - same_shape)); - - if (failed(pm.run(module))) { - return InternalError("Static knowledge propagation failed."); - } - return Status::OK(); -} - -} // namespace - -StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( - llvm::StringRef tf_code, std::pair compute_capability, - llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, - llvm::ArrayRef unroll_factors) { - mlir::MLIRContext context; - mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); - mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); - - TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); - { - xla::mlir_gpu::LowerLHLOToGPUOptions options; - options.tile_sizes = tile_sizes; - options.unroll_factors = unroll_factors; - options.collapse_parallel_loops = false; - options.use_approximations = true; - TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerLHLOToGPU(module.get(), options)); - } - TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); - TF_RETURN_IF_ERROR( - PropagateTensorFlowABIKnowledgeToKernel(module.get(), same_shape)); - - mlir::OwningModuleRef kernel_module = - xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); - llvm::LLVMContext llvmContext; - auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module, llvmContext); - if (!llvmModule) { - return InternalError("Could not translate MLIR module to NVVM"); - } - - llvmModule->setModuleIdentifier("acme"); - llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); - - xla::HloModuleConfig config; - config.set_debug_options(xla::GetDebugOptionsFromFlags()); - - auto enable_fusion = [](llvm::TargetMachine* target) { - target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast; - }; - - TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); - TF_ASSIGN_OR_RETURN( - std::string ptx, - xla::gpu::nvptx::CompileToPtx(llvmModule.get(), compute_capability, - config, libdevice_dir, enable_fusion)); - VLOG(1) << ptx; - -#if GOOGLE_CUDA - return tensorflow::se::CompileGpuAsm( - std::get<0>(compute_capability), std::get<1>(compute_capability), - ptx.c_str(), xla::gpu::PtxOptsFromConfig(config)); -#else - return InternalError( - "GOOGLE_CUDA not defined. 
Did you specify --config=cuda ?"); -#endif -} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD index 29939f227db..2630f97f825 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD @@ -1,4 +1,6 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//third_party/mlir:tblgen.bzl", "gentbl") +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") package( default_visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen:friends"], @@ -7,6 +9,7 @@ package( gentbl( name = "tf_framework_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ("-gen-op-decls", "tf_framework_ops.h.inc"), ("-gen-op-defs", "tf_framework_ops.cc.inc"), diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc index 8c02a734f1d..b3d92773be4 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc @@ -77,9 +77,9 @@ LogicalResult Verify(AllocRawOp op) { return success(); } -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" - } // namespace tf_framework } // namespace kernel_gen } // namespace mlir + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h index d2612a38799..aab090cc5e0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h @@ -38,12 +38,12 @@ class OpKernelContextType using Base::Base; }; -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.h.inc" -#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h.inc" - } // namespace tf_framework } // namespace kernel_gen } // namespace mlir +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.h.inc" +#include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h.inc" + #endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_IR_TF_FRAMEWORK_OPS_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td index bc390a5aaa5..e6e29bcbdc2 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td @@ -29,7 +29,7 @@ def TFFramework_Dialect : Dialect { This dialect contains operations and types for that correspond to TensorFlow C++ Framework. 
}]; - let cppNamespace = "kernel_gen::tf_framework"; + let cppNamespace = "::mlir::kernel_gen::tf_framework"; } def TFFramework_OpKernelContextType : DialectType tile_sizes, + llvm::ArrayRef unroll_factors) { + mlir::PassManager pm(module.getContext()); + applyTensorflowAndCLOptions(pm); + + if (gpu_binary_only) { + pm.addPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/false, /*legalize_chlo=*/true)); + pm.addNestedPass( + mlir::kernel_gen::transforms::CreateMaterializeBroadcastsPass()); + pm.addNestedPass( + mlir::kernel_gen::transforms::CreateUnfuseBatchNormPass()); + pm.addPass(mlir::mhlo::createLegalizeToLhloPass( + /*results_escape_functions=*/true)); + // Moving `AllocOp`s and inserting missing `DeallocOp`s + pm.addPass(::mlir::createBufferPlacementPass()); + pm.addNestedPass(mlir::createCopyRemovalPass()); + pm.addPass(mlir::kernel_gen::transforms::CreateShapeToDescriptorsPass()); + } else { + pm.addPass(mlir::mhlo::createLegalizeTFPass( + /*allow_partial_conversion=*/false, /*legalize_chlo=*/false)); + pm.addPass(mlir::mhlo::createChloLegalizeToHloPass()); + pm.addPass(mlir::createTransformUnrankedHloPass()); + pm.addPass(mlir::kernel_gen::transforms::CreateShapeToDescriptorsPass()); + // Clean up the IR created above. In particular, operations on descriptors + // are simplified here. + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::kernel_gen::transforms::CreateBufferizePass()); + pm.addPass(mlir::kernel_gen::transforms::CreateParallelLoopsToSequential()); + } + + // Clean up the IR for further processing. + pm.addPass(mlir::createCanonicalizerPass()); + // We have to anticipate later unrolling in tiling to make sure that we get + // the requested tiling after unrolling. Compute the new tiling here if + // needed. + llvm::SmallVector tiling_for_unrolling; + llvm::SmallVector as_int64; + if (!unroll_factors.empty()) { + tiling_for_unrolling.reserve(tile_sizes.size()); + for (auto pair : llvm::zip(tile_sizes, unroll_factors)) { + tiling_for_unrolling.push_back(std::get<0>(pair) * std::get<1>(pair)); + as_int64.push_back(std::get<1>(pair)); + } + } else { + tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end()); + } + // Transform LHLO operations to LinAlg. + pm.addPass(::mlir::lmhlo::createLegalizeLhloToLinalgPass()); + // Fuse linalg operations. + pm.addPass(::mlir::lmhlo::createLhloFuseLinalgPass( + /*use_parallel_loops=*/true, tiling_for_unrolling)); + // Transform the Linalg operations inside of the loop nest into parallel + // loops. + pm.addPass(::mlir::createConvertLinalgToParallelLoopsPass()); + // Canonicalize the code to simplify index computations. This is needed so + // that loop bounds have the same value. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Fuse the inner-most loops. + pm.addPass(xla::mlir_gpu::createFuseInnerParallelLoopsPass()); + // Run CSE to ensure that loads and stores to the same subview get + // recognized as such. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Forward stores to buffers to loads. + pm.addPass(xla::mlir_gpu::createStoreForwardingPass()); + // Remove now unused temporary buffers. + pm.addPass(xla::mlir_gpu::createDeadTempBufferRemovalPass()); + if (!unroll_factors.empty()) { + pm.addPass(::mlir::createParallelLoopTilingPass(as_int64)); + } + // Some basic cleanup. 
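+ // (The passes added below first run canonicalization and CSE as cleanup,
+ // then greedily map the remaining parallel loops to GPU grid and block
+ // dimensions, outline them into gpu.module kernels, and lower leftover
+ // affine and structured control flow on the host side to plain CFG form.)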
+ pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Greedily map the remaining loop to GPU hardware dimensions. + pm.addPass(xla::mlir_gpu::createMapParallelLoopsPass()); + // Apply the mapping. + pm.addPass(mlir::createParallelLoopToGpuPass()); + + // Embed TF Framework ops. + if (!gpu_binary_only) { + pm.addPass(mlir::kernel_gen::tf_framework::CreateEmbedTFFrameworkPass()); + } + + // Some basic cleanup. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Make loops with min bounds into a conditional plus static bounds. + // Only do this if we unrolled in the first place. + if (!unroll_factors.empty()) { + pm.addNestedPass<::mlir::FuncOp>(mlir::createForLoopSpecializationPass()); + } + // Approximate Tanh using standard operations. + pm.addNestedPass<::mlir::FuncOp>( + ::mlir::mhlo::createLegalizeTrigonometricToApproximationPass()); + // Take launches to launches with kernels. + pm.addPass(::mlir::createGpuKernelOutliningPass()); + + if (gpu_binary_only) { + // Make kernel signature deterministic so that we can call it externally. + pm.addPass(xla::mlir_gpu::createRewriteKernelSignaturePass()); + } + pm.addPass(::mlir::createLowerAffinePass()); + pm.addPass(::mlir::createLowerToCFGPass()); + if (failed(pm.run(module))) { + return InternalError("Lowering to GPU kernels failed."); + } + return Status::OK(); +} + +Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only, + llvm::ArrayRef same_shape, + llvm::StringRef gpu_binary_attr_name, + llvm::ArrayRef architectures, + bool generate_fatbin) { + mlir::PassManager pm(module.getContext()); + applyTensorflowAndCLOptions(pm); + + auto& kernel_pm = pm.nest(); + if (gpu_binary_only) { + // Grab the original signature from the single function. + kernel_pm.addNestedPass( + mlir::kernel_gen::transforms::CreatePropagateTensorFlowABIKnowledgePass( + same_shape)); + } + kernel_pm.addPass(mlir::createStripDebugInfoPass()); + kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass( + gpu_binary_attr_name, architectures, generate_fatbin)); + + if (!gpu_binary_only) { + pm.addPass(mlir::kernel_gen::transforms::CreateTFKernelToLLVMPass()); + pm.addPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createCSEPass()); + } + return failed(pm.run(module)) ? InternalError("Lowering to LLVM IR failed.") + : Status::OK(); +} + +} // namespace + +StatusOr GenerateKernelForTfCode( + mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only, + llvm::ArrayRef architectures, + llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors, bool generate_fatbin) { + mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); + mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); + TF_RETURN_IF_ERROR( + LowerTFtoGPU(module.get(), gpu_binary_only, tile_sizes, unroll_factors)); +#if !defined(TENSORFLOW_USE_ROCM) && !defined(GOOGLE_CUDA) + return InternalError( + "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined." 
+ " Did you specify either --config=rocm or --config=cuda ?"); +#endif + +#if TENSORFLOW_USE_ROCM + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToROCDL(module.get())); +#elif GOOGLE_CUDA + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); +#endif + TF_RETURN_IF_ERROR(LowerGPUToLLVM(module.get(), gpu_binary_only, same_shape, + kGpuBinaryAttrName, architectures, + generate_fatbin)); + return module; +} + +StatusOr ExtractGpuBinary(mlir::ModuleOp module) { + auto gpu_modules = module.getOps(); + if (std::distance(gpu_modules.begin(), gpu_modules.end()) != 1) { + return InternalError("There should be exactly one GPU Module"); + } + mlir::gpu::GPUModuleOp gpu_mod = *gpu_modules.begin(); + auto blob = gpu_mod.getAttrOfType(kGpuBinaryAttrName); + if (blob == nullptr) { + return InternalError("No binary blob found in the module"); + } + return blob.getValue().str(); +} + +} // namespace kernel_gen +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h similarity index 52% rename from tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h rename to tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h index 47626ba9d0d..6767944d539 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h @@ -13,30 +13,40 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -//===- cubin_creator.h ------------------------------------------*- C++ -*-===// +//===- kernel_creator.h -----------------------------------------*- C++ -*-===// // -// This file declares the function to compile a TF kernel function to a cubin. +// This file declares the function to compile a TF kernel function to gpu +// binary (hsaco for AMD, cubin for NVIDIA) or to a gpu binary with host side. // //===----------------------------------------------------------------------===// -#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ -#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ #include -#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project #include "tensorflow/compiler/xla/statusor.h" namespace tensorflow { namespace kernel_gen { -xla::StatusOr> GenerateCubinForTfCode( - llvm::StringRef tf_code, - std::pair compute_capability = {7, 5}, + +// Converts TF code to LLVM/NVVM. If `gpu_binary_only` is true, then the +// conversion stops after gpu_binary blob is generated. If `gpu_binary_only` is +// false, lowers the host side to LLVM Dialect. +xla::StatusOr GenerateKernelForTfCode( + mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only, + llvm::ArrayRef architectures = {"sm_75"}, llvm::ArrayRef tile_sizes = {16, 64}, llvm::ArrayRef same_shape = {}, - llvm::ArrayRef unroll_factors = {}); + llvm::ArrayRef unroll_factors = {}, bool generate_fatbin = true); + +// Extracts gpu_binary from the converted module. 
+xla::StatusOr ExtractGpuBinary(mlir::ModuleOp module); + } // namespace kernel_gen } // namespace tensorflow -#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_KERNEL_CREATOR_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD new file mode 100644 index 00000000000..25568398442 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD @@ -0,0 +1,25 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package(licenses = ["notice"]) + +glob_lit_tests( + data = [ + ":test_utilities", + "@llvm-project//mlir:run_lit.sh", + ], + driver = "//tensorflow/compiler/mlir:run_lit.sh", + test_file_exts = ["mlir"], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir:tf-opt", + "//tensorflow/compiler/mlir/hlo:mlir-hlo-opt", + "//tensorflow/compiler/mlir/tools/kernel_gen:kernel-gen-opt", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/bufferize.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/bufferize.mlir new file mode 100644 index 00000000000..1a278365464 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/bufferize.mlir @@ -0,0 +1,78 @@ +// RUN: kernel-gen-opt %s --bufferize | FileCheck %s + +// CHECK-LABEL: @extract_element +// CHECK-SAME: (%[[ARG:.*]]: memref) -> f32 +func @extract_element(%arg : tensor) -> f32 { + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[RESULT:.*]] = load %[[ARG]][%[[C0]]] + // CHECK: return %[[RESULT]] + %c0 = constant 0 : index + %result = extract_element %arg[%c0] : tensor + return %result : f32 +} + +// CHECK-LABEL: @tensor_load +// CHECK-SAME: (%[[ARG:.*]]: memref) -> memref +func @tensor_load(%arg : memref) -> tensor { + // CHECK: return %[[ARG]] : memref + %result = tensor_load %arg : memref + return %result : tensor +} + +// CHECK-LABEL: @tensor_from_elements +// CHECK-SAME: (%[[A:.*]]: f32) -> memref<3xf32> +func @tensor_from_elements(%a : f32) -> tensor<3xf32> { + // CHECK: %[[B:.*]] = constant 1.2 + // CHECK: %[[C:.*]] = constant 2.3 + // CHECK: %[[MEM:.*]] = alloca() : memref<3xf32> + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: store %[[A]], %[[MEM]][%[[C0]]] : memref<3xf32> + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: store %[[B]], %[[MEM]][%[[C1]]] : memref<3xf32> + // CHECK: %[[C2:.*]] = constant 2 : index + // CHECK: store %[[C]], %[[MEM]][%[[C2]]] : memref<3xf32> + // CHECK: return %[[MEM]] : memref<3xf32> + %b = constant 1.2 : f32 + %c = constant 2.3 : f32 + %result = tensor_from_elements %a, %b, %c : tensor<3xf32> + return %result : tensor<3xf32> +} + +// CHECK-LABEL: @dynamic_tensor_from_elements +// CHECK-SAME: (%[[ARG:.*]]: memref<*xf32>) -> memref +func @dynamic_tensor_from_elements(%arg : tensor<*xf32>) -> tensor { + // CHECK: %[[C3:.*]] = constant 3 : index + // CHECK: %[[MEM:.*]] = alloca(%c3) : memref + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[C1:.*]] = constant 1 : index + // CHECK: scf.parallel (%[[I:.*]]) = (%[[C0]]) to (%[[C3]]) step (%[[C1]]) { + // CHECK: %[[ELEM:.*]] = dim %[[ARG]], %[[I]] : memref<*xf32> + // CHECK: store %[[ELEM]], %[[MEM]][%[[I]]] : memref + // CHECK: scf.yield + // CHECK: } + // CHECK: return %[[MEM]] : memref + %c3 = constant 3 : index + %result 
= dynamic_tensor_from_elements %c3 { + ^bb0(%i : index): + %elem = dim %arg, %i : tensor<*xf32> + yield %elem : index + } : tensor + return %result : tensor +} + +// CHECK-LABEL: @assuming +// CHECK-SAME: (%[[WITNESS:.*]]: !shape.witness, %[[ARG:.*]]: memref) +// CHECK-SAME: -> memref +func @assuming(%witness: !shape.witness, %arg : memref) + -> tensor { + // CHECK-NEXT: %[[ASSUMING_RESULT:.*]] = shape.assuming %[[WITNESS]] + // CHECK-SAME: -> (memref) { + // CHECK-NEXT: shape.assuming_yield %[[ARG]] : memref + // CHECK-NEXT: } + // CHECK-NEXT: return %[[ASSUMING_RESULT]] : memref + %assuming_result = shape.assuming %witness -> (tensor) { + %result = tensor_load %arg : memref + shape.assuming_yield %result : tensor + } + return %assuming_result : tensor +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir new file mode 100644 index 00000000000..bb0f1926cda --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir @@ -0,0 +1,37 @@ +// RUN: kernel-gen-opt %s -embed-tf-framework -split-input-file | FileCheck %s + +// CHECK-LABEL: func @tf_entry( +// CHECK-SAME: [[CTX:%.*]]: !tf_framework.op_kernel_context, +// CHECK-SAME: [[SIZE_0:%.*]]: index, +// CHECK-SAME: [[SIZE_2:%.*]]: index) -> index attributes {tf_entry} { +func @tf_entry(%size_0 : index , %size_2 : index) -> index + attributes {tf_entry} { + %buf = alloc(%size_0, %size_2)[] : memref + dealloc %buf : memref + std.return %size_0 : index +} +// CHECK-NEXT: [[VAL_3:%.*]] = tf_framework.alloc_raw +// CHECK-SAME: ([[CTX]], [[SIZE_0]], [[SIZE_2]]) : memref +// CHECK-NEXT: tf_framework.dealloc_raw([[CTX]], [[VAL_3]]) : memref +// CHECK-NEXT: return [[SIZE_0]] : index + +// ----- + +// CHECK-LABEL: func @non_tf_entry( +// CHECK-SAME: [[SIZE_0:%.*]]: index, [[SIZE_2:%.*]]: index) -> index +func @non_tf_entry(%size_0 : index , %size_2 : index) -> index { + std.return %size_0 : index +} + +// ----- + +// CHECK-LABEL: func @tf_entry( +func @tf_entry(%size : index) attributes {tf_entry} { + %buf = alloc()[%size] : memref<64xf32, affine_map<(d0)[s0] -> (d0 + s0)>> + dealloc %buf : memref<64xf32, affine_map<(d0)[s0] -> (d0 + s0)>> + std.return +} +// CHECK_NOT: alloc_raw +// CHECK: alloc() +// CHECK_NOT: dealloc_raw +// CHECK: dealloc % diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/invalid.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/invalid.mlir new file mode 100644 index 00000000000..1d1b3319515 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/invalid.mlir @@ -0,0 +1,7 @@ +// RUN: kernel-gen-opt %s -split-input-file -verify-diagnostics + +func @alloc_raw(%ctx: !tf_framework.op_kernel_context, %size : index) { + // expected-error @+1 {{`dyn_sizes` count 1 does not match dynamic dimensions}} + %buf = tf_framework.alloc_raw(%ctx, %size) : memref + return +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir new file mode 100644 index 00000000000..fc8e7c97ec8 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir @@ -0,0 +1,25 @@ +// RUN: kernel-gen-opt %s | FileCheck %s +// Verify the printed output can be parsed. +// RUN: kernel-gen-opt %s | kernel-gen-opt | FileCheck %s +// Verify the generic form can be parsed. 
+// RUN: kernel-gen-opt -mlir-print-op-generic %s | kernel-gen-opt | FileCheck %s + +// CHECK-LABEL: func @alloc_raw +func @alloc_raw(%ctx: !tf_framework.op_kernel_context, + %size_0 : index , %size_2 : index) { + %buf_0 = tf_framework.alloc_raw(%ctx) : memref<10xi8> + %buf_1 = tf_framework.alloc_raw(%ctx, %size_0, %size_2) : memref + return +} + +// CHECK-LABEL: func @dealloc_raw +func @dealloc_raw(%ctx: !tf_framework.op_kernel_context, %memref : memref) { + tf_framework.dealloc_raw(%ctx, %memref) : memref + return +} + +// CHECK-LABEL: func @null_context +func @null_context() { + tf_framework.null_context() : !tf_framework.op_kernel_context + return +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/parallel_loops_to_sequential.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/parallel_loops_to_sequential.mlir new file mode 100644 index 00000000000..df059759ecc --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/parallel_loops_to_sequential.mlir @@ -0,0 +1,17 @@ +// RUN: kernel-gen-opt %s --parallel-loops-to-sequential | FileCheck %s + +// CHECK-LABEL: @parallel_loop +func @parallel_loop(%lb_0 : index, %lb_1 : index, + %ub_0 : index, %ub_1 : index, + %s_0 : index, %s_1 : index, + %buf: memref) { + scf.parallel (%i0, %i1) = (%lb_0, %lb_1) to (%ub_0, %ub_1) step (%s_0, %s_1) { + %sum_elem = addi %i0, %i1 : index + store %sum_elem, %buf[%i0, %i1] : memref + } + return +} +// CHECK: scf.for [[I_0:%.*]] = [[LB_0:%.*]] to [[UB_0:%.*]] step [[S_0:%.*]] +// CHECK: scf.for [[I_1:%.*]] = [[LB_1:%.*]] to [[UB_1:%.*]] step [[S_1:%.*]] +// CHECK: [[SUM:%.*]] = addi [[I_0]], [[I_1]] : index +// CHECK: store [[SUM]], {{%.*}}{{\[}}[[I_0]], [[I_1]]] : memref diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tanh.mlir new file mode 100644 index 00000000000..53d02322c55 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tanh.mlir @@ -0,0 +1,20 @@ +// RUN: tf-opt %s --xla-legalize-tf | mlir-hlo-opt --transform-unranked-hlo | kernel-gen-opt -allow-unregistered-dialect --shape-to-descriptors --canonicalize --bufferize | FileCheck %s + +// Test whether all shape computations required for tanh can be lowered to +// the standard dialect, scf and descriptors. We check for a sparse pattern here, +// as each lowering pattern is already tested and we just care for the +// integration. +// TODO: Expand this pattern once things have stabilized. 
+// CHECK-LABEL: @tanh +func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: alloca + // CHECK: scf.parallel + // CHECK-NOT: tensor_load + // CHECK: scf.for + // CHECK-NOT: tensor_from_elements + // CHECK: mhlo.reshape_memref_cast + // CHECK: lmhlo.tanh + // CHECK: mhlo.reshape_memref_cast + %0 = "tf.Tanh"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf-legalize-to-mlhlo.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf-legalize-to-mlhlo.mlir new file mode 100644 index 00000000000..2fc585d9e9d --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf-legalize-to-mlhlo.mlir @@ -0,0 +1,26 @@ +// RUN: tf-opt %s --xla-legalize-tf='legalize-chlo=false' | mlir-hlo-opt --transform-unranked-hlo --chlo-legalize-to-hlo | kernel-gen-opt --shape-to-descriptors --canonicalize --bufferize + +func @acos(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Acos"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @tan(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Tan"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Tanh"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @sin(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Sin"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +func @sinh(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Sinh"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir new file mode 100644 index 00000000000..b943321e95b --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir @@ -0,0 +1,75 @@ +// RUN: kernel-gen-opt %s -tf-kernel-to-llvm -split-input-file | FileCheck %s + +// CHECK: llvm.func @_mlir_ciface_tf_alloc_raw +// CHECK-SAME: (!llvm.ptr, !llvm.i64) -> !llvm.ptr + +// CHECK-LABEL: llvm.func @alloc_raw( +// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr, +// CHECK-SAME: [[SIZE_0:%.*]]: !llvm.i64, +// CHECK-SAME: [[SIZE_2:%.*]]: !llvm.i64) -> [[DESC_TY:!.*]] { +func @alloc_raw(%ctx: !tf_framework.op_kernel_context, + %size_0 : index , %size_2 : index) -> memref { + %buf = tf_framework.alloc_raw(%ctx, %size_0, %size_2) : memref + std.return %buf : memref +} +// Compute number of elements. +// CHECK: [[SIZE_1:%.*]] = llvm.mlir.constant(10 : index) : !llvm.i64 +// CHECK: [[NUM_ELEM_0:%.*]] = llvm.mul [[SIZE_0]], [[SIZE_1]] : !llvm.i64 +// CHECK: [[NUM_ELEM_1:%.*]] = llvm.mul [[NUM_ELEM_0]], [[SIZE_2]] : !llvm.i64 + +// Compute the size of an individual element. +// CHECK: [[NULL:%.*]] = llvm.mlir.null : !llvm.ptr +// CHECK: [[C1:%.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: [[GEP:%.*]] = llvm.getelementptr [[NULL]]{{\[}}[[C1]]] +// CHECK-SAME: (!llvm.ptr, !llvm.i64) -> !llvm.ptr +// CHECK: [[SIZE_OF_FLOAT:%.*]] = llvm.ptrtoint [[GEP]] +// CHECK-SAME: !llvm.ptr to !llvm.i64 + +// Allocate memory. +// CHECK: [[NUM_BYTES:%.*]] = llvm.mul [[NUM_ELEM_1]], [[SIZE_OF_FLOAT]] +// CHECK: [[BYTES_PTR:%.*]] = llvm.call @{{.*}}([[TF_CTX]], [[NUM_BYTES]]) +// CHECK-SAME: (!llvm.ptr, !llvm.i64) -> !llvm.ptr + +// Build memref descriptor. 
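+// The descriptor is an LLVM struct of the form (allocated ptr, aligned ptr,
+// offset, sizes[rank], strides[rank]); the insertvalue chain below fills
+// positions [0] and [1] with the buffer pointer, [2] with a zero offset, and
+// [3, i] / [4, i] with the size and stride of each dimension.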
+// CHECK: [[DESC_0:%.*]] = llvm.mlir.undef : [[DESC_TY]] + +// Set pointers and offset. +// CHECK: [[FLOAT_PTR:%.*]] = llvm.bitcast [[BYTES_PTR]] +// CHECK-SAME: !llvm.ptr to !llvm.ptr +// CHECK: [[DESC_1:%.*]] = llvm.insertvalue [[FLOAT_PTR]], [[DESC_0]][0] +// CHECK: [[DESC_2:%.*]] = llvm.insertvalue [[FLOAT_PTR]], [[DESC_1]][1] +// CHECK: [[C0:%.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: [[DESC_3:%.*]] = llvm.insertvalue [[C0]], [[DESC_2]][2] : [[DESC_TY]] + +// Set sizes and strides. +// CHECK: [[STRIDE_2:%.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: [[DESC_4:%.*]] = llvm.insertvalue [[SIZE_2]], [[DESC_3]][3, 2] +// CHECK: [[DESC_5:%.*]] = llvm.insertvalue [[STRIDE_2]], [[DESC_4]][4, 2] +// CHECK: [[STRIDE_1:%.*]] = llvm.mul [[STRIDE_2]], [[SIZE_2]] : !llvm.i64 +// CHECK: [[DESC_6:%.*]] = llvm.insertvalue [[SIZE_1]], [[DESC_5]][3, 1] +// CHECK: [[DESC_7:%.*]] = llvm.insertvalue [[STRIDE_1]], [[DESC_6]][4, 1] +// CHECK: [[STRIDE_0:%.*]] = llvm.mul [[STRIDE_1]], [[SIZE_1]] : !llvm.i64 +// CHECK: [[DESC_8:%.*]] = llvm.insertvalue [[SIZE_0]], [[DESC_7]][3, 0] +// CHECK: [[DESC_9:%.*]] = llvm.insertvalue [[STRIDE_0]], [[DESC_8]][4, 0] +// CHECK: llvm.return [[DESC_9]] : [[DESC_TY]] + +// ----- + +// CHECK: llvm.func @_mlir_ciface_tf_dealloc_raw(!llvm.ptr, !llvm.ptr) + +// CHECK-LABEL: llvm.func @dealloc_raw( +// CHECK-SAME: [[TF_CTX:%.*]]: !llvm.ptr, +func @dealloc_raw(%ctx: !tf_framework.op_kernel_context, + %memref : memref) { + tf_framework.dealloc_raw(%ctx, %memref) : memref + return +} +// Extract allocated ptr from the memref descriptor. +// CHECK: %{{.*}} = llvm.mlir.undef : [[DESC_TY:!.*]] +// CHECK: [[FLOAT_PTR:%.*]] = llvm.extractvalue %{{.*}}[0] : [[DESC_TY]] +// CHECK-NEXT: [[VOID_PTR:%.*]] = llvm.bitcast [[FLOAT_PTR]] +// CHECK-SAME: !llvm.ptr to !llvm.ptr + +// Deallocate. +// CHECK: llvm.call @_mlir_ciface_tf_dealloc_raw( +// CHECK-SAME: [[TF_CTX]], [[VOID_PTR]]) : (!llvm.ptr, !llvm.ptr) -> () diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/BUILD new file mode 100644 index 00000000000..6aef5c05fe9 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/BUILD @@ -0,0 +1,17 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package(licenses = ["notice"]) + +glob_lit_tests( + data = [ + "//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_gpu_binary", + "@llvm-project//mlir:run_lit.sh", + ], + default_tags = [ + # We need access to the CUDA SDK. 
+ "gpu", + "no_rocm", + ], + driver = "//tensorflow/compiler/mlir:run_lit.sh", + test_file_exts = ["mlir"], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/abs.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/abs.mlir new file mode 100644 index 00000000000..edb023e5fe7 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/abs.mlir @@ -0,0 +1,6 @@ +// RUN: tf_to_gpu_binary --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=sm_70 +func @abs(%arg0: tensor) -> tensor { + %0 = "tf.Abs"(%arg0) { } + : (tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/ceil.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/ceil.mlir new file mode 100644 index 00000000000..25b79c47f4e --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/ceil.mlir @@ -0,0 +1,6 @@ +// RUN: tf_to_gpu_binary --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=sm_70 +func @ceil(%arg0: tensor) -> tensor { + %0 = "tf.Ceil"(%arg0) { } + : (tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir new file mode 100644 index 00000000000..69632f498a9 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir @@ -0,0 +1,5 @@ +// RUN: tf_to_gpu_binary --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=sm_70 +func @tanh(%arg0: tensor) -> tensor { + %0 = "tf.Tanh"(%arg0) : (tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD new file mode 100644 index 00000000000..24e288c246c --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD @@ -0,0 +1,17 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package(licenses = ["notice"]) + +glob_lit_tests( + data = [ + "//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel", + "@llvm-project//mlir:run_lit.sh", + ], + default_tags = [ + # We need access to the CUDA SDK. + "gpu", + "no_rocm", + ], + driver = "//tensorflow/compiler/mlir:run_lit.sh", + test_file_exts = ["mlir"], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir new file mode 100644 index 00000000000..85bea1795a5 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir @@ -0,0 +1,6 @@ +// RUN: tf_to_kernel --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=sm_70,compute_75 + +func @tanh(%arg: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.Tanh"(%arg) : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cc new file mode 100644 index 00000000000..06d613e0599 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cc @@ -0,0 +1,113 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Implements C wrappers around the CUDA library for easy linking in ORC jit. +// Also adds some debugging helpers that are helpful when writing MLIR code to +// run on GPUs. + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/ExecutionEngine/CRunnerUtils.h" // from @llvm-project + +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" + +#define CUDA_REPORT_IF_ERROR(expr) \ + [](CUresult result) { \ + if (!result) \ + return; \ + const char *name = nullptr; \ + cuGetErrorName(result, &name); \ + if (!name) \ + name = ""; \ + llvm::errs() << "'" << #expr << "' failed with '" << name << "'\n"; \ + }(expr) + +extern "C" CUmodule mgpuModuleLoad(void *data) { + CUmodule module = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleLoadData(&module, data)); + return module; +} + +extern "C" CUfunction mgpuModuleGetFunction(CUmodule module, const char *name) { + CUfunction function = nullptr; + CUDA_REPORT_IF_ERROR(cuModuleGetFunction(&function, module, name)); + return function; +} + +// The wrapper uses intptr_t instead of CUDA's unsigned int to match +// the type of MLIR's index type. This avoids the need for casts in the +// generated MLIR code. +extern "C" void mgpuLaunchKernel(CUfunction function, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, + intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, CUstream stream, + void **params, void **extra) { + CUDA_REPORT_IF_ERROR(cuLaunchKernel(function, gridX, gridY, gridZ, blockX, + blockY, blockZ, smem, stream, params, + extra)); +} + +extern "C" CUstream mgpuStreamCreate() { + static CUstream stream = []() { + // TODO(b/170649852): This is neither thread-safe nor handles + // creation/descruction of one stream per context. + CUstream stream = nullptr; + CUDA_REPORT_IF_ERROR(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING)); + return stream; + }(); + return stream; +} + +extern "C" void mgpuStreamSynchronize(CUstream stream) { + CUDA_REPORT_IF_ERROR(cuStreamSynchronize(stream)); +} + +/// Helper functions for writing mlir example code + +// Allows to register byte array with the CUDA runtime. Helpful until we have +// transfer functions implemented. +extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { + CUDA_REPORT_IF_ERROR(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0)); +} + +// Allows to register a MemRef with the CUDA runtime. Helpful until we have +// transfer functions implemented. +extern "C" void +mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType *descriptor, + int64_t elementSizeBytes) { + + llvm::SmallVector denseStrides(rank); + llvm::ArrayRef sizes(descriptor->sizes, rank); + llvm::ArrayRef strides(sizes.end(), rank); + + std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(), + std::multiplies()); + auto sizeBytes = denseStrides.front() * elementSizeBytes; + + // Only densely packed tensors are currently supported. 
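+ // For example, sizes [2, 3, 4] give denseStrides [24, 12, 4] from the
+ // partial_sum above (so sizeBytes covers all 24 elements); the rotate and
+ // the trailing 1 below turn that into the row-major strides [12, 4, 1],
+ // which must match the strides recorded in the descriptor.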
+ std::rotate(denseStrides.begin(), denseStrides.begin() + 1, + denseStrides.end()); + denseStrides.back() = 1; + assert(strides == llvm::makeArrayRef(denseStrides)); + + auto ptr = descriptor->data + descriptor->offset * elementSizeBytes; + mgpuMemHostRegister(ptr, sizeBytes); +} + +#endif diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc similarity index 56% rename from tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc rename to tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc index 96831689600..84c2bf46b55 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -//===- tf_to_cubin.cc -------------------------------------------*- C++ -*-===// +//===- tf_to_gpu_binary.cc --------------------------------------*- C++ -*-===// // -// This file implements the entry point to compile a tf op to a cubin file. +// This file implements the entry point to compile a tf op to a gpu binary // //===----------------------------------------------------------------------===// #include @@ -23,10 +23,44 @@ #include "absl/strings/string_view.h" #include "llvm/Support/CommandLine.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/init_mlir.h" -#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +namespace { + +xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, + std::string architecture, llvm::ArrayRef tile_sizes, + llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + // Read TF code. + std::string tf_code; + TF_RETURN_IF_ERROR( + ReadFileToString(Env::Default(), input_file.str(), &tf_code)); + // Compile. + mlir::MLIRContext context; + TF_ASSIGN_OR_RETURN( + mlir::OwningModuleRef module, + GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true, + architecture, tile_sizes, same_shape, + unroll_factors, /*generate_fatbin=*/false)); + // Extract gpu_binary. + TF_ASSIGN_OR_RETURN(std::string gpu_binary, ExtractGpuBinary(*module)); + + // Write gpu_binary blob. + TF_RETURN_IF_ERROR( + WriteStringToFile(Env::Default(), output_file.str(), gpu_binary)); + return xla::Status::OK(); +} + +} // namespace +} // namespace kernel_gen +} // namespace tensorflow int main(int argc, char** argv) { llvm::cl::opt input_file("input", llvm::cl::desc("input file"), @@ -35,9 +69,9 @@ int main(int argc, char** argv) { llvm::cl::opt output_file( "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"), llvm::cl::init("foo.bin")); - llvm::cl::opt architecture( - "arch", llvm::cl::desc("target architecture (e.g. 50 for sm_50)"), - llvm::cl::init(50)); + llvm::cl::opt architecture( + "arch", llvm::cl::desc("target architecture (e.g. 
sm_50)"), + llvm::cl::init("sm_50")); llvm::cl::list tile_sizes( "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); @@ -51,38 +85,15 @@ int main(int argc, char** argv) { llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); tensorflow::InitMlir y(&argc, &argv); + mlir::registerPassManagerCLOptions(); llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); - std::pair compute_capability(architecture / 10, - architecture % 10); - - std::string tf_code; - auto read_status = tensorflow::ReadFileToString(tensorflow::Env::Default(), - input_file, &tf_code); - if (!read_status.ok()) { - LOG(ERROR) << read_status; - return 1; - } - - auto cubin = tensorflow::kernel_gen::GenerateCubinForTfCode( - tf_code, compute_capability, tile_sizes, same_shape, unroll_factors); - - if (!cubin.ok()) { - LOG(ERROR) << cubin.status(); - return 1; - } - - std::vector cubin_data = cubin.ConsumeValueOrDie(); - - auto status = tensorflow::WriteStringToFile( - tensorflow::Env::Default(), output_file, - absl::string_view{reinterpret_cast(cubin_data.data()), - cubin_data.size()}); - + auto status = + tensorflow::kernel_gen::Run(input_file, output_file, architecture, + tile_sizes, same_shape, unroll_factors); if (!status.ok()) { LOG(ERROR) << status; return 1; } - return 0; } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc new file mode 100644 index 00000000000..87c8e57804b --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc @@ -0,0 +1,162 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//===- tf_to_kernel.cc ------------------------------------------*- C++ -*-===// +// +// This file implements the entry point to compile a tf op to a kernel. 
+// +//===----------------------------------------------------------------------===// +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/LLVMIR.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +namespace { + +static llvm::codegen::RegisterCodeGenFlags CGF; + +std::unique_ptr GetTargetMachine(llvm::Module* module) { + llvm::Triple triple(module->getTargetTriple()); + if (triple.getTriple().empty()) { + triple = llvm::Triple(llvm::sys::getDefaultTargetTriple()); + module->setTargetTriple(triple.getTriple()); + } + + std::string error; + const llvm::Target* target = + llvm::TargetRegistry::lookupTarget("", triple, error); + if (!target) { + return nullptr; + } + + llvm::TargetOptions target_options = + llvm::codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple()); + return std::unique_ptr(target->createTargetMachine( + triple.str(), "generic", "", target_options, llvm::Reloc::Model::PIC_)); +} + +// Compiles the given MLIR module via LLVM into an executable binary format. +xla::StatusOr EmitToBinary(mlir::ModuleOp module) { + // Translate the module. + llvm::LLVMContext llvm_context; + std::unique_ptr llvm_module = + mlir::translateModuleToLLVMIR(module, llvm_context); + + // Set up the output stream. + llvm::SmallString<8> outstr; + llvm::raw_svector_ostream ostream(outstr); + ostream.SetUnbuffered(); + + llvm::legacy::PassManager codegen_passes; + codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(llvm_module->getTargetTriple()))); + + // TODO(b/163818770): Apply optimizations before dumping .a file. + auto target_machine = GetTargetMachine(llvm_module.get()); + llvm_module->setDataLayout(target_machine->createDataLayout()); + if (target_machine->addPassesToEmitFile(codegen_passes, ostream, nullptr, + llvm::CGFT_ObjectFile, false)) { + return xla::InternalError("Failed add passes to emit file"); + } + codegen_passes.run(*llvm_module); + return ostream.str().str(); +} + +xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, + llvm::ArrayRef architectures, + llvm::ArrayRef tile_sizes, + llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + // Read TF code. + std::string tf_code; + TF_RETURN_IF_ERROR( + ReadFileToString(Env::Default(), input_file.str(), &tf_code)); + // Compile. + mlir::MLIRContext context; + TF_ASSIGN_OR_RETURN( + mlir::OwningModuleRef module, + GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/false, + architectures, tile_sizes, same_shape, + unroll_factors)); + // Get binary. + TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module)); + + // Write .a file. 
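+ // `binary` is the relocatable object code emitted by EmitToBinary above and
+ // is written to the output file verbatim.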
+ TF_RETURN_IF_ERROR( + WriteStringToFile(Env::Default(), output_file.str(), binary)); + return xla::Status::OK(); +} + +} // namespace +} // namespace kernel_gen +} // namespace tensorflow + +int main(int argc, char** argv) { + llvm::cl::opt input_file("input", llvm::cl::desc("input file"), + llvm::cl::value_desc("filename"), + llvm::cl::init("foo.mlir")); + llvm::cl::opt output_file( + "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"), + llvm::cl::init("foo.bin")); + llvm::cl::list architectures( + "arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"), + llvm::cl::OneOrMore, llvm::cl::CommaSeparated); + llvm::cl::list tile_sizes( + "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, + llvm::cl::CommaSeparated); + llvm::cl::list unroll_factors( + "unroll_factors", + llvm::cl::desc("factors to unroll by, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); + llvm::cl::list same_shape( + "same_shape", + llvm::cl::desc("arguments with same shape, separated by commas"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); + + tensorflow::InitMlir y(&argc, &argv); + llvm::InitializeNativeTarget(); + llvm::InitializeNativeTargetAsmPrinter(); + mlir::registerPassManagerCLOptions(); + llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); + + auto status = + tensorflow::kernel_gen::Run(input_file, output_file, architectures, + tile_sizes, same_shape, unroll_factors); + if (!status.ok()) { + LOG(ERROR) << status; + return 1; + } + return 0; +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index b0f22b40f5b..b2595d2ad3a 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -1,4 +1,14 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//third_party/mlir:tblgen.bzl", "gentbl") +load( + "//tensorflow/core/platform/default:cuda_build_defs.bzl", + "if_cuda_is_configured", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") package( default_visibility = ["//tensorflow/compiler/mlir/tools/kernel_gen:friends"], @@ -28,6 +38,7 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", @@ -51,6 +62,7 @@ cc_library( gentbl( name = "kernel_gen_passes_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [("-gen-pass-decls -name KernelGen", "kernel_gen_passes.h.inc")], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "passes.td", @@ -62,32 +74,59 @@ cc_library( srcs = [ "bufferize_pass.cc", "embed_tf_framework_pass.cc", + "gpu_kernel_to_blob_pass.cc", + "materialize_broadcasts_pass.cc", + "parallel_loops_to_sequential.cc", + "propagate_tf_abi_knowledge_pass.cc", "shape_to_descriptors_pass.cc", - "tf_framework_legalize_to_llvm_pass.cc", + "tf_kernel_to_llvm_pass.cc", + "unfuse_batch_norm_pass.cc", ], hdrs = ["passes.h"], + copts = if_cuda_is_configured(["-DGOOGLE_CUDA=1"]) + if_rocm_is_configured(["-DTENSORFLOW_USE_ROCM=1"]), deps = [ + "//tensorflow/compiler/mlir/hlo:materialize_broadcasts", # buildcleaner: keep + "//tensorflow/compiler/mlir/hlo:unfuse_batch_norm", # buildcleaner: keep + 
"//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/core/platform:cuda_libdevice_path", + "//tensorflow/core:lib", ":bufferize", ":embed_tf_framework", ":kernel_gen_passes_inc_gen", ":tf_framework_legalize_to_llvm", - "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", - "//tensorflow/compiler/mlir/hlo:lhlo", - "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_llvm", - "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "@llvm-project//llvm:Support", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:GPUToGPURuntimeTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:Shape", - "@llvm-project//mlir:ShapeToSCF", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:TargetROCDLIR", "@llvm-project//mlir:ShapeToStandard", + "@llvm-project//mlir:SCFToStandard", "@llvm-project//mlir:ShapeTransforms", "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", - ], + "@llvm-project//llvm:TransformUtils", + "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", + "//tensorflow/compiler/mlir/hlo:lhlo", + "//tensorflow/compiler/xla/service/gpu:stream_executor_util", + "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_llvm", + "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", + ] + if_cuda_is_configured([ + "//tensorflow/stream_executor/gpu:asm_compiler", + ]) + if_rocm_is_configured([ + "//tensorflow/core/platform:rocm_rocdl_path", + ]), ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc index 3d5c820e6dd..f2b5e14bd30 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc @@ -15,17 +15,22 @@ limitations under the License. // This file implements logic for translating mixed IR to buffer form. 
+#include "mlir/Transforms/Bufferize.h" // from @llvm-project + #include #include #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { @@ -35,10 +40,10 @@ namespace transforms { namespace { class TensorFromElementsOpConverter - : public BufferAssignmentOpConversionPattern { + : public BufferizeOpConversionPattern { public: - using BufferAssignmentOpConversionPattern< - TensorFromElementsOp>::BufferAssignmentOpConversionPattern; + using BufferizeOpConversionPattern< + TensorFromElementsOp>::BufferizeOpConversionPattern; LogicalResult matchAndRewrite( TensorFromElementsOp op, ArrayRef operands, @@ -58,11 +63,63 @@ class TensorFromElementsOpConverter } }; -class TensorLoadOpConversion - : public BufferAssignmentOpConversionPattern { +class DynamicTensorFromElementsOpConverter + : public BufferizeOpConversionPattern { public: - using BufferAssignmentOpConversionPattern< - TensorLoadOp>::BufferAssignmentOpConversionPattern; + using BufferizeOpConversionPattern< + DynamicTensorFromElementsOp>::BufferizeOpConversionPattern; + + LogicalResult matchAndRewrite( + DynamicTensorFromElementsOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + // Allocate memory on stack. + Location loc = op.getLoc(); + DynamicTensorFromElementsOp::Adaptor transformed(operands); + RankedTensorType tensor_ty = op.getType().cast(); + MemRefType memref_type = + MemRefType::get(tensor_ty.getShape(), tensor_ty.getElementType()); + Value result = rewriter.create(loc, memref_type, + transformed.dynamicExtents()); + + // Collect loop bounds. + int64_t rank = tensor_ty.getRank(); + Value zero = rewriter.create(loc, 0); + Value one = rewriter.create(loc, 1); + SmallVector lower_bounds(rank, zero); + SmallVector steps(rank, one); + SmallVector upper_bounds; + int next_dynamic_index = 0; + for (int i = 0; i < rank; i++) { + Value ub = tensor_ty.isDynamicDim(i) + ? transformed.dynamicExtents()[next_dynamic_index++] + : rewriter.create( + loc, memref_type.getDimSize(i)); + upper_bounds.push_back(ub); + } + + // Generate tensor elements. 
+ rewriter.create( + loc, lower_bounds, upper_bounds, steps, + [&](OpBuilder &b, Location loc, ValueRange ivs) { + BlockAndValueMapping mapping; + mapping.map(op.body().getArguments(), ivs); + for (auto &nested_op : op.getBody()->without_terminator()) + b.clone(nested_op, mapping); + auto yield_op = llvm::cast(op.getBody()->getTerminator()); + b.create(loc, mapping.lookup(yield_op.value()), result, ivs); + b.create(loc); + }); + + rewriter.replaceOp(op, {result}); + return success(); + } +}; + +class TensorLoadOpConversion + : public BufferizeOpConversionPattern { + public: + using BufferizeOpConversionPattern< + TensorLoadOp>::BufferizeOpConversionPattern; LogicalResult matchAndRewrite( TensorLoadOp op, ArrayRef operands, @@ -74,17 +131,17 @@ class TensorLoadOpConversion }; class ExtractElementOpConversion - : public BufferAssignmentOpConversionPattern { + : public BufferizeOpConversionPattern { public: - using BufferAssignmentOpConversionPattern< - ExtractElementOp>::BufferAssignmentOpConversionPattern; + using BufferizeOpConversionPattern< + ExtractElementOp>::BufferizeOpConversionPattern; LogicalResult matchAndRewrite( ExtractElementOp op, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { ExtractElementOpAdaptor adaptor(operands); - if (!adaptor.aggregate().getType().isa()) { + if (!adaptor.aggregate().getType().isa()) { return failure(); } @@ -94,15 +151,49 @@ class ExtractElementOpConversion } }; +template +class SimpleOpResultConversion : public BufferizeOpConversionPattern { + public: + using BufferizeOpConversionPattern::BufferizeOpConversionPattern; + using BufferizeOpConversionPattern::converter; + + LogicalResult matchAndRewrite( + OpTy op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + rewriter.replaceOpWithNewOp(op, converter.convertType(op.getType()), + operands); + return success(); + } +}; + +class TensorCastOpConverter + : public BufferizeOpConversionPattern { + public: + using BufferizeOpConversionPattern< + TensorCastOp>::BufferizeOpConversionPattern; + + LogicalResult matchAndRewrite( + TensorCastOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + Value arg = operands.front(); + if (!arg.getType().isa()) return failure(); + + auto result_ty = converter.convertType(op.getType()); + rewriter.replaceOpWithNewOp(op, arg, result_ty); + + return success(); + } +}; + } // namespace void populateStandardBufferizePattern(MLIRContext *context, - BufferAssignmentPlacer *bufferAssignment, - TypeConverter *converter, + BufferizeTypeConverter *converter, OwningRewritePatternList *patterns) { patterns->insert(context, bufferAssignment, - converter); + DynamicTensorFromElementsOpConverter, + SimpleOpResultConversion, TensorLoadOpConversion, + TensorCastOpConverter>(context, *converter); } } // namespace transforms diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc index ef07c801bc4..9a531515012 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc @@ -19,6 +19,8 @@ limitations under the License. 
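The DynamicTensorFromElementsOp conversion above relies on the scf.parallel body-builder idiom: the op is created with bounds plus a callback that receives the induction variables and populates the loop body. A small self-contained sketch of that idiom follows; the helper name and the trivial store body are illustrative only.

#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

// Fills `memref` with `value` over the given bounds using an scf.parallel
// loop nest; the lambda receives one induction variable per dimension.
void FillBuffer(mlir::OpBuilder &b, mlir::Location loc, mlir::Value memref,
                mlir::ValueRange lower, mlir::ValueRange upper,
                mlir::ValueRange steps, mlir::Value value) {
  b.create<mlir::scf::ParallelOp>(
      loc, lower, upper, steps,
      [&](mlir::OpBuilder &nested, mlir::Location nested_loc,
          mlir::ValueRange ivs) {
        nested.create<mlir::StoreOp>(nested_loc, value, memref, ivs);
        nested.create<mlir::scf::YieldOp>(nested_loc);
      });
}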
#include #include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/Shape/Transforms/Passes.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -26,8 +28,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/Bufferize.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" @@ -67,25 +68,25 @@ class UnrankedTensorStoreTestOnlyPattern }; struct BufferizePass : public BufferizePassBase { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnOperation() override { - OwningRewritePatternList patterns; auto& context = getContext(); ConversionTarget target(context); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalDialect(); - target.addLegalOp(); - target.addLegalOp(); + target.addLegalDialect(); + target.addLegalOp(); target.addIllegalDialect(); - target.addIllegalOp(); - target.addIllegalOp(); - target.addIllegalOp(); + target.addIllegalOp(); target.addDynamicallyLegalOp([&](TensorStoreOp op) { return !op.tensor().getType().isa(); }); - BufferAssignmentTypeConverter converter; + BufferizeTypeConverter converter; auto typesAreLegal = [&converter](Operation* op) { return converter.isLegal(op->getOperandTypes()) && converter.isLegal(op->getResultTypes()); @@ -96,26 +97,20 @@ struct BufferizePass : public BufferizePassBase { return converter.isLegal(inputs) && converter.isLegal(results) && converter.isLegal(&op.getBody()); }); - target.addDynamicallyLegalOp(typesAreLegal); - target.addDynamicallyLegalOp(typesAreLegal); + target.addDynamicallyLegalOp( + typesAreLegal); + + OwningRewritePatternList patterns; + mhlo::populateHLOToLHLOConversionPattern(&context, &converter, &patterns); + populateWithBufferizeOpConversionPatterns( + &context, converter, patterns); + populateStandardBufferizePattern(&context, &converter, &patterns); + populateShapeTypeConversionPatterns(&context, converter, patterns); + patterns.insert(&context); auto module = getOperation(); - WalkResult result = module.walk([&](FuncOp func) -> WalkResult { - BufferAssignmentPlacer bufferAssignment(func); - OwningRewritePatternList patterns; - mhlo::populateHLOToLHLOConversionPattern( - func.getContext(), &bufferAssignment, &converter, &patterns); - populateWithBufferAssignmentOpConversionPatterns< - ReturnOp, ReturnOp, lmhlo::CopyOp, - /*allowMemrefFunctionResults=*/true>(&context, &bufferAssignment, - &converter, &patterns); - populateStandardBufferizePattern(func.getContext(), &bufferAssignment, - &converter, &patterns); - patterns.insert(func.getContext()); - - return applyPartialConversion(func, target, patterns); - }); - if (result.wasInterrupted()) { + if (failed(applyPartialConversion(module, target, patterns))) { signalPassFailure(); } } diff --git 
a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc index a0cfcae65d1..6aea4d9c619 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc @@ -36,6 +36,10 @@ static constexpr StringRef kTFEntry = "tf_entry"; // * std.dealloc becomes tf_framework.dealloc_raw. class EmbedTFFrameworkPass : public EmbedTFFrameworkPassBase { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnOperation() override { ModuleOp m = getOperation(); @@ -68,7 +72,7 @@ class EmbedTFFrameworkPass } // namespace -std::unique_ptr > createEmbedTFFrameworkPass() { +std::unique_ptr > CreateEmbedTFFrameworkPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc new file mode 100644 index 00000000000..46bf13b7d20 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -0,0 +1,227 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
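Several of the passes touched by this patch gain a getDependentDialects override (the dialect lists inside registry.insert<...> did not survive the rendering above). The idiom, sketched here with an assumed dialect list, declares up front every dialect whose ops the pass may create:

#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/Module.h"
#include "mlir/Pass/Pass.h"

struct ExamplePass
    : public mlir::PassWrapper<ExamplePass,
                               mlir::OperationPass<mlir::ModuleOp>> {
  // Lets the context load these dialects before (possibly multi-threaded)
  // pass execution begins.
  void getDependentDialects(mlir::DialectRegistry &registry) const override {
    registry.insert<mlir::scf::SCFDialect, mlir::StandardOpsDialect>();
  }
  void runOnOperation() override {}
};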
+==============================================================================*/ + +#include "llvm/Transforms/Utils/Cloning.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Target/NVVMIR.h" // from @llvm-project +#include "mlir/Target/ROCDLIR.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" + +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/gpu/asm_compiler.h" +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm_rocdl_path.h" +#endif + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +using xla::InternalError; + +class GpuKernelToBlobPass + : public GpuKernelToBlobPassBase { + public: + GpuKernelToBlobPass(mlir::StringRef blob_annotation, + llvm::ArrayRef architectures, + bool generate_fatbin) { + blob_annotation_ = blob_annotation.str(); + architectures_ = architectures; + generate_fatbin_ = generate_fatbin; + } + + void runOnOperation() override { + mlir::gpu::GPUModuleOp gpu_module = getOperation(); + auto blob_or = GetGpuBinaryBlob(gpu_module); + if (blob_or.ok()) { + const auto& blob = blob_or.ValueOrDie(); + std::string blob_string(blob.begin(), blob.end()); + gpu_module.setAttr(blob_annotation_, + mlir::StringAttr::get(blob_string, &getContext())); + return; + } + return signalPassFailure(); + } + + xla::StatusOr> GetGpuBinaryBlob( + mlir::gpu::GPUModuleOp gpu_module) { + if (architectures_.empty()) { + return InternalError("Expected at least one GPU architecture."); + } + if (!generate_fatbin_ && architectures_.size() > 1) { + return InternalError( + "Can only generate machine code for more than one architecture as a " + "fatbin."); + } + + llvm::LLVMContext llvmContext; + +#if TENSORFLOW_USE_ROCM + auto llvmModule = mlir::translateModuleToROCDLIR(gpu_module, llvmContext); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to ROCDL IR"); + } + + llvmModule->setModuleIdentifier("acme"); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + // TODO(b/169066682): Support fatbin on ROCm. + if (generate_fatbin_) { + return InternalError("Fatbins are not yet supported for ROCm."); + } + + // Parse ROCm architecture. 
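Both branches of GetGpuBinaryBlob parse the architecture string by stripping a prefix and reading the numeric suffix: the ROCm branch below strips "gfx", while the CUDA branch further down strips "sm_"/"compute_" and splits the number into a compute capability (e.g. "sm_75" gives major 7, minor 5). A hypothetical standalone helper mirroring the CUDA arithmetic, not part of the patch:

#include <cstdint>
#include <utility>
#include "absl/strings/numbers.h"
#include "absl/strings/string_view.h"
#include "absl/strings/strip.h"

// Parses "sm_NN" or "compute_NN" into a {major, minor} compute capability.
bool ParseCudaArch(absl::string_view arch, std::pair<uint32_t, uint32_t> *cc) {
  if (!absl::ConsumePrefix(&arch, "sm_") &&
      !absl::ConsumePrefix(&arch, "compute_"))
    return false;
  uint32_t value = 0;
  if (!absl::SimpleAtoi(arch, &value)) return false;
  *cc = {value / 10, value % 10};  // e.g. 75 -> {7, 5}
  return true;
}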
+ absl::string_view consumable_arch(architectures_.front()); + if (!absl::ConsumePrefix(&consumable_arch, "gfx")) { + return InternalError( + "Could not parse ROCm architecture prefix (expected gfx)"); + } + uint32_t arch; + if (!absl::SimpleAtoi(consumable_arch, &arch)) { + return InternalError("Could not parse ROCm architecture number"); + } + + std::string libdevice_dir = tensorflow::RocdlRoot(); + return xla::gpu::amdgpu::CompileToHsaco(llvmModule.get(), arch, config, + libdevice_dir); + +#elif GOOGLE_CUDA + auto llvmModule = mlir::translateModuleToNVVMIR(gpu_module, llvmContext); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to NVVM"); + } + + llvmModule->setModuleIdentifier("acme"); + llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + auto enable_fusion = [](llvm::TargetMachine* target) { + target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast; + }; + + // Compile and collect requested cubin and PTX images. + std::vector images; + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); + auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config); + for (const std::string& arch_str : architectures_) { + // Parse CUDA architecture. + absl::string_view consumable_arch(arch_str); + bool is_compute_profile; + if (absl::ConsumePrefix(&consumable_arch, "compute_")) { + is_compute_profile = true; + } else if (absl::ConsumePrefix(&consumable_arch, "sm_")) { + is_compute_profile = false; + } else { + return InternalError( + "Could not parse cuda architecture prefix (expected sm_ or " + "compute_)"); + } + uint32_t arch; + if (!absl::SimpleAtoi(consumable_arch, &arch)) { + return InternalError("Could not parse cuda architecture number"); + } + + uint32_t cc_major = arch / 10; + uint32_t cc_minor = arch % 10; + // Module may be changed by CompileToPtx. + auto llvm_module_copy = llvm::CloneModule(*llvmModule); + TF_ASSIGN_OR_RETURN( + std::string ptx, + xla::gpu::nvptx::CompileToPtx(llvm_module_copy.get(), + std::make_pair(cc_major, cc_minor), + config, libdevice_dir, enable_fusion)); + VLOG(1) << ptx; + TF_ASSIGN_OR_RETURN(std::vector gpu_asm, + tensorflow::se::CompileGpuAsm( + cc_major, cc_minor, ptx.c_str(), gpu_asm_opts)); + + if (!generate_fatbin_) { + // Skip fatbin generation and return the first and only GPU machine + // code. This is currently only used for `tf_to_gpu_binary` and will + // eventually disappear. + return gpu_asm; + } + + // Collect cubin (and ptx image if requested). + images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)}); + if (is_compute_profile) { + std::vector ptx_bytes; + std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes)); + images.push_back( + {absl::StrCat("compute_", arch), std::move(ptx_bytes)}); + } + } + + // TODO(b/169870789): Revisit the use of fatbins. + // Bundle cubin and PTX images into a single fatbin. + return tensorflow::se::BundleGpuAsm(images, + gpu_asm_opts.preferred_cuda_dir); +#endif + + return InternalError( + "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined." 
+ " Did you specify either --config=rocm or --config=cuda ?"); + } + + private: + xla::StatusOr GetLibdeviceDir( + const xla::HloModuleConfig& hlo_module_config) { + for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( + hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { + std::string libdevice_dir = + tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); + VLOG(2) << "Looking for libdevice at " << libdevice_dir; + if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << libdevice_dir; + return libdevice_dir; + } + } + return InternalError( + "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); + } +}; + +} // namespace + +std::unique_ptr> CreateGpuKernelToBlobPass( + mlir::StringRef blob_annotation, ArrayRef architectures, + bool generate_fatbin) { + return std::make_unique(blob_annotation, architectures, + generate_fatbin); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc new file mode 100644 index 00000000000..dd3f32e2b3c --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/materialize_broadcasts_pass.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct MaterializeBroadcastsPass + : public MaterializeBroadcastsPassBase { + void runOnFunction() override { + mlir::ConversionTarget conversionTarget(getContext()); + mlir::OwningRewritePatternList conversionPatterns; + + // Consider the mhlo dialect legal for tests. + conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. 
+ conversionTarget.addLegalDialect(); + + mlir::mhlo::SetupMaterializeBroadcastsLegality(&getContext(), + &conversionTarget); + mlir::mhlo::PopulateMaterializeBroadcastsPatterns(&getContext(), + &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) { + return signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr CreateMaterializeBroadcastsPass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc new file mode 100644 index 00000000000..7981dbe5534 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct ParallelLoopsToSequentialPass + : public ParallelLoopsToSequentialBase { + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::populateLoopToStdConversionPatterns(patterns, &getContext()); + + mlir::ConversionTarget target(getContext()); + target.addIllegalOp(); + target.addLegalOp(); + target.markUnknownOpDynamicallyLegal([](mlir::Operation*) { return true; }); + if (failed(applyPartialConversion(getOperation(), target, patterns))) + signalPassFailure(); + } +}; + +} // namespace + +std::unique_ptr CreateParallelLoopsToSequential() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h index e65d8402fb2..5fd4091b2c0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h @@ -18,6 +18,8 @@ limitations under the License. #include +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -25,20 +27,19 @@ namespace mlir { namespace kernel_gen { namespace tf_framework { -// Test pass for applying TF Framework -> LLVM patterns. 
-std::unique_ptr > -createTestTFFrameworkLegalizeToLLVMPass(); - // Pass to replace some of the Standard ops with TF Framework ops. // * adds tf_framework::OpKernelContextType argument to the function // * std.alloc becomes tf_framework.alloc_raw // * std.dealloc becomes tf_framework.dealloc_raw -std::unique_ptr > createEmbedTFFrameworkPass(); +std::unique_ptr > CreateEmbedTFFrameworkPass(); } // namespace tf_framework namespace transforms { +// Pass for applying LLVM legalization patterns. +std::unique_ptr > CreateTFKernelToLLVMPass(); + // Pass to tranform shape computations in shape dialect to standard and scf // using memref descriptors. std::unique_ptr > CreateShapeToDescriptorsPass(); @@ -47,6 +48,25 @@ std::unique_ptr > CreateShapeToDescriptorsPass(); // buffers. std::unique_ptr > CreateBufferizePass(); +// Pass to materialize broadcasts. +std::unique_ptr CreateMaterializeBroadcastsPass(); + +// Pass to convert scf::ParallelOp to scf::ForOp. +std::unique_ptr CreateParallelLoopsToSequential(); + +// Pass to propagate TF ABI knowledge, e.g. offsets, alignment. +std::unique_ptr> +CreatePropagateTensorFlowABIKnowledgePass( + llvm::ArrayRef same_shape = {}); + +// Pass to annotate GPU Module with its PTX. +std::unique_ptr> CreateGpuKernelToBlobPass( + mlir::StringRef blob_annotation = "", + ArrayRef architectures = {}, bool generate_fatbin = true); + +// Pass to unfuse batch norm. +std::unique_ptr CreateUnfuseBatchNormPass(); + } // namespace transforms #define GEN_PASS_REGISTRATION diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td index 6a0e328f212..a8b2506bd1c 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td @@ -13,30 +13,67 @@ See the License for the specific language governing permissions and limitations under the License. 
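To make the relationship between the pass-creation entry points declared in passes.h above concrete, here is a rough sketch of composing some of them in a pass manager. The ordering and nesting choices are assumptions for illustration, not the pipeline the kernel generator actually builds.

#include "mlir/IR/Function.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"

void BuildIllustrativePipeline(mlir::PassManager &pm) {
  namespace kg = mlir::kernel_gen;
  // Function-level cleanups run nested on each function.
  pm.addNestedPass<mlir::FuncOp>(
      kg::transforms::CreateMaterializeBroadcastsPass());
  pm.addNestedPass<mlir::FuncOp>(kg::transforms::CreateUnfuseBatchNormPass());
  // Module-level passes: embed the TF framework, lower shapes, bufferize.
  pm.addPass(kg::tf_framework::CreateEmbedTFFrameworkPass());
  pm.addPass(kg::transforms::CreateShapeToDescriptorsPass());
  pm.addPass(kg::transforms::CreateBufferizePass());
  pm.addNestedPass<mlir::FuncOp>(
      kg::transforms::CreateParallelLoopsToSequential());
}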
==============================================================================*/ -#ifndef TF_FRAMEWORK_PASSES -#define TF_FRAMEWORK_PASSES +#ifndef TF_KERNEL_GEN_PASSES +#define TF_KERNEL_GEN_PASSES include "mlir/Pass/PassBase.td" -def TestTFFrameworkLegalizeToLLVMPass - : Pass<"test-tf-framework-legalize-to-llvm", "ModuleOp"> { - let summary = "Test pass for applying TF Framework -> LLVM patterns."; - let constructor = "tf_framework::createTestTFFrameworkLegalizeToLLVMPass()"; +def TFKernelToLLVMPass : Pass<"tf-kernel-to-llvm", "ModuleOp"> { + let summary = "Pass for applying LLVM legalization patterns."; + let constructor = "transforms::CreateTFKernelToLLVMPass()"; } def EmbedTFFrameworkPass : Pass<"embed-tf-framework", "ModuleOp"> { let summary = "Pass to embed TF Framework for allocation and error reporting"; - let constructor = "tf_framework::createEmbedTFFrameworkPass()"; + let constructor = "tf_framework::CreateEmbedTFFrameworkPass()"; } -def ShapeToDescriptorsPass : Pass<"test-shape-to-descriptors", "ModuleOp"> { +def ShapeToDescriptorsPass : Pass<"shape-to-descriptors", "ModuleOp"> { let summary = "Pass to transform shape computations to descriptors"; let constructor = "transforms::CreateShapeToDescriptorsPass()"; } -def BufferizePass : Pass<"test-bufferize", "ModuleOp"> { +def BufferizePass : Pass<"bufferize", "ModuleOp"> { let summary = "Pass to transform operations on values to buffer based ones"; let constructor = "transforms::CreateBufferizePass()"; } -#endif // TF_FRAMEWORK_PASSES +def MaterializeBroadcastsPass : FunctionPass<"materialize-broadcast"> { + let summary = "Pass to materialize broadcasts"; + let constructor = "transforms::CreateMaterializeBroadcastsPass()"; +} + +def UnfuseBatchNormPass : FunctionPass<"unfuse-batch-norm"> { + let summary = "Pass to unfuse batch norm"; + let constructor = "transforms::CreateUnfuseBatchNormPass()"; +} + +def GpuKernelToBlobPass : Pass<"gpu-kernel-to-blob", "gpu::GPUModuleOp"> { + let summary = "Pass to annotate GPU Module with its PTX"; + let options = [ + Option<"blob_annotation_", "blob-annotation", "std::string", + /*default=*/"", "Blob attribute name">, + ListOption<"architectures_", "arch", "std::string", "GPU architectures">, + Option<"generate_fatbin_", "generate-fatbin", "bool", /*default=*/"true", + "Bundle machine code for the different architectures in one " + "fatbin.">, + ]; + let constructor = "transforms::CreateGpuKernelToBlobPass()"; +} + +def ParallelLoopsToSequential : FunctionPass<"parallel-loops-to-sequential"> { + let summary = "Pass to convert scf::ParallelOp to scf::ForOp"; + let constructor = "transforms::CreateParallelLoopsToSequential()"; +} + +def PropagateTensorFlowABIKnowledgePass + : Pass<"propagate-tf-abi-knowledge", "LLVM::LLVMFuncOp"> { + let summary = "Pass to propagate TF ABI knowledge, e.g. offsets, alignment"; + let options = [ + ListOption<"same_shape_", "same-shape", "uint32_t", + "List of same shape args">, + ]; + let constructor = "transforms::CreatePropagateTensorFlowABIKnowledgePass()"; +} + +#endif // TF_KERNEL_GEN_PASSES diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/propagate_tf_abi_knowledge_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/propagate_tf_abi_knowledge_pass.cc new file mode 100644 index 00000000000..3b568f5f25f --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/propagate_tf_abi_knowledge_pass.cc @@ -0,0 +1,137 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct PropagateTensorFlowABIKnowledgePass + : public PropagateTensorFlowABIKnowledgePassBase< + PropagateTensorFlowABIKnowledgePass> { + explicit PropagateTensorFlowABIKnowledgePass( + llvm::ArrayRef same_shape) { + same_shape_ = same_shape; + } + + void runOnOperation() override { + // We know due to tensorflow ABI that the offset is always 0 and that the + // innermost stride is always 1. To make this visible to the compiler, + // we insert constants into the code and replace usages accordingly. + // We do not change the signature so that we keep a somewhat stable ABI + // that is easy to undertand by tools. + // We also know that tensorflow aligns all allocated pointers by 16, so + // we pass this on. Furthermore, we know that arguments never alias. More + // precicely, they may only alias (due to reuse) if the kernel does not + // read from a position it previously has written to. We express this with + // the noalias attribute. + mlir::LLVM::LLVMFuncOp func = getOperation(); + + // This only works if the function is local and we can rewrite it. + if (func.isExternal()) return; + + auto function_list = + func.getParentOfType().getOps(); + if (function_list.empty()) { + func.emitError() << "No possible kernel function found"; + return signalPassFailure(); + } + auto func_iterator = function_list.begin(); + if (std::next(func_iterator) != function_list.end()) { + func.emitError() << "More than one possible kernel function detected"; + return signalPassFailure(); + } + // Note that this dereference is necessary to prevent a + // stack-use-after-return error. + auto func_type = (*func_iterator).getType(); + + mlir::OpBuilder b(func.getBody()); + // Steal the LLVM representation of the index type from the third argument. + auto index_type = func.getArgument(3).getType(); + mlir::Value one = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); + mlir::Value zero = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); + uint32_t arg_pos = 0; + std::vector positions; + // Collect the agument and return types of the surrounding function. 
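For orientation before the loop that follows: after lowering to the LLVM dialect, each ranked memref argument is expanded into 3 + 2 * rank scalar arguments (allocated pointer, aligned pointer, offset, then one size and one stride per dimension); that is the layout the arg_pos arithmetic below indexes into. A purely illustrative rank-2 view:

#include <cstdint>

// Rank-2 memref descriptor fields, in the order they appear as expanded
// kernel arguments after std-to-LLVM lowering.
struct UnpackedRank2MemRef {
  void *allocated;     // base (allocated) pointer
  void *aligned;       // aligned pointer; TF allocations are 16-byte aligned
  int64_t offset;      // always 0 under the TF ABI; replaced by a constant
  int64_t sizes[2];    // one size per dimension
  int64_t strides[2];  // innermost stride is always 1; replaced by a constant
};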
+ auto arg_types = llvm::to_vector<4>(llvm::concat( + func_type.getInputs(), func_type.getResults())); + for (mlir::Type arg_type : arg_types) { + if (!arg_type.isa()) { + func.emitError() << "argument of surrounding func is not ranked memref"; + return signalPassFailure(); + } + positions.push_back(arg_pos); + // Set alignment and aliasing on the pointers. + func.setArgAttr(arg_pos + 1, "llvm.noalias", b.getBoolAttr(true)); + func.setArgAttr(arg_pos + 1, "llvm.align", b.getIndexAttr(16)); + // Replace the offset with zero. Offset is argument number 3. + func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); + // Forward over base_ptr, aligned_ptr, offset, size and stride arguments. + arg_pos += 3 + arg_type.cast().getRank() * 2; + // Replace the last stride with constant 1. + func.getArgument(arg_pos - 1).replaceAllUsesWith(one); + } + + // If we have knowledge that some arguments have the same shape, we + // can use that here. Simply replace usages of the shape parameters within + // the function body to a single shape parameter. + if (same_shape_.empty()) { + return; + } + auto first = same_shape_.front(); + auto first_offset = positions.at(first); + auto first_type = arg_types[first].cast(); + uint32_t rank = first_type.getRank(); + for (int i = 1, e = same_shape_.size(); i < e; ++i) { + uint32_t same = same_shape_[i]; + uint32_t same_offset = positions.at(same); + auto same_type = arg_types[same].cast(); + if (same_type.getRank() != rank) { + func.emitOpError() << "same shape constraints on arguments with " + "non-matching shapes: #" + << first << " and #" << same; + return signalPassFailure(); + } + + for (uint32_t i = 0; i < 2 * rank; ++i) { + // Replace uses for second arg data with first arg. + auto same_arg = func.getArgument(same_offset + 3 + i); + auto first_arg = func.getArgument(first_offset + 3 + i); + same_arg.replaceAllUsesWith(first_arg); + } + } + } +}; + +} // namespace + +std::unique_ptr> +CreatePropagateTensorFlowABIKnowledgePass(llvm::ArrayRef same_shape) { + return std::make_unique(same_shape); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h index 4efc1e95bc8..f73a14b9be0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h @@ -21,6 +21,7 @@ limitations under the License. namespace mlir { class BufferAssignmentPlacer; +class BufferizeTypeConverter; class LLVMTypeConverter; class MLIRContext; class OwningRewritePatternList; @@ -44,8 +45,7 @@ namespace transforms { /// Collects a set of patterns that bufferize operations from the standard /// dialect. void populateStandardBufferizePattern(MLIRContext *context, - BufferAssignmentPlacer *bufferAssignment, - TypeConverter *converter, + BufferizeTypeConverter *converter, OwningRewritePatternList *patterns); } // namespace transforms } // namespace kernel_gen diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc index 28d3647bb63..f5d01808c1b 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc @@ -16,7 +16,6 @@ limitations under the License. 
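A hedged sketch of instantiating the ABI-knowledge pass above with a same-shape constraint; the pass-manager nesting and the element type of the index list are assumptions. Declaring same_shape = {0, 1} tells the pass that kernel arguments #0 and #1 always agree in shape, so uses of #1's size and stride values are forwarded to #0's.

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"

void AddAbiKnowledge(mlir::PassManager &pm) {
  // Arguments #0 and #1 are declared shape-equal; the pass then rewires
  // uses of #1's sizes/strides to #0's.
  pm.addNestedPass<mlir::LLVM::LLVMFuncOp>(
      mlir::kernel_gen::transforms::CreatePropagateTensorFlowABIKnowledgePass(
          /*same_shape=*/{0, 1}));
}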
// This file combines patterns for lowering shape dialect to standard ops, // structured control flow and descriptors. -#include "mlir/Conversion/ShapeToSCF/ShapeToSCF.h" // from @llvm-project #include "mlir/Conversion/ShapeToStandard/ShapeToStandard.h" // from @llvm-project #include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project @@ -24,7 +23,6 @@ limitations under the License. #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" @@ -38,6 +36,10 @@ namespace { struct ShapeToDescriptorsPass : public ShapeToDescriptorsPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: void runOnOperation() override { MLIRContext &ctx = getContext(); @@ -47,12 +49,15 @@ struct ShapeToDescriptorsPass target.addIllegalDialect(); target.addLegalDialect(); target.addLegalDialect(); + // Don't mark the primary Cstr/Assuming ops as illegal, so they can be + // lowered at a later time to assertions. + target.addLegalOp(); // Setup conversion patterns. OwningRewritePatternList patterns; populateShapeRewritePatterns(&ctx, patterns); populateShapeToStandardConversionPatterns(patterns, &ctx); - populateShapeToSCFConversionPatterns(patterns, &ctx); // Apply conversion. auto module = getOperation(); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index 3ce111ff3ff..431919c2de7 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -171,7 +171,8 @@ class DeallocRawOpConverter : public ConvertToLLVMCallOpPattern { protected: StringRef GetFuncName() const override { return kCInterfaceDealloc; } LLVMType GetFuncType() const override { - return LLVM::LLVMType::getFunctionTy(getVoidType(), getVoidPtrType(), + return LLVM::LLVMType::getFunctionTy(getVoidType(), + {getVoidPtrType(), getVoidPtrType()}, /*isVarArg=*/false); } }; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc similarity index 70% rename from tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc rename to tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc index 42e89433dff..b2fcc424a50 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_kernel_to_llvm_pass.cc @@ -13,11 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" @@ -25,14 +26,17 @@ limitations under the License. namespace mlir { namespace kernel_gen { -namespace tf_framework { +namespace transforms { namespace { #define GEN_PASS_CLASSES #include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" -class TestTFFrameworkToLLVMPass - : public TestTFFrameworkLegalizeToLLVMPassBase { +class TFKernelToLLVMPass : public TFKernelToLLVMPassBase { + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + public: void runOnOperation() override { ModuleOp m = getOperation(); @@ -46,16 +50,19 @@ class TestTFFrameworkToLLVMPass // Populate patterns. OwningRewritePatternList patterns; populateStdToLLVMConversionPatterns(type_converter, patterns); - PopulateTFFrameworkToLLVMConversionPatterns(&type_converter, &patterns); + tf_framework::PopulateTFFrameworkToLLVMConversionPatterns(&type_converter, + &patterns); + populateGpuToLLVMConversionPatterns(type_converter, patterns, "gpu.binary"); lmhlo::PopulateLhloToLLVMConversionPatterns(&type_converter, &patterns); // Set target. ConversionTarget target(getContext()); target.addLegalDialect(); - target.addIllegalDialect(); - target.addLegalOp(); + target + .addIllegalDialect(); + target.addIllegalOp(); - if (failed(applyFullConversion(m, target, patterns))) { + if (failed(applyPartialConversion(m, target, patterns))) { signalPassFailure(); } } @@ -63,11 +70,10 @@ class TestTFFrameworkToLLVMPass } // namespace -std::unique_ptr > -createTestTFFrameworkLegalizeToLLVMPass() { - return std::make_unique(); +std::unique_ptr > CreateTFKernelToLLVMPass() { + return std::make_unique(); } -} // namespace tf_framework +} // namespace transforms } // namespace kernel_gen } // namespace mlir diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc new file mode 100644 index 00000000000..d2773d91b07 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/unfuse_batch_norm_pass.cc @@ -0,0 +1,44 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
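The string "gpu.binary" passed to populateGpuToLLVMConversionPatterns above has to match the attribute name under which GpuKernelToBlobPass stored the compiled device code. A hedged sketch of wiring the two passes together; the pass-manager nesting and the chosen architecture are assumptions.

#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Pass/PassManager.h"
#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h"

void AddGpuCodegenPasses(mlir::PassManager &pm) {
  namespace kgt = mlir::kernel_gen::transforms;
  // Annotate every gpu.module with its compiled blob under "gpu.binary"...
  pm.addNestedPass<mlir::gpu::GPUModuleOp>(kgt::CreateGpuKernelToBlobPass(
      /*blob_annotation=*/"gpu.binary", /*architectures=*/{"sm_70"},
      /*generate_fatbin=*/true));
  // ...which the kernel-to-LLVM lowering then picks up when it rewrites
  // gpu.launch_func into runtime calls.
  pm.addPass(kgt::CreateTFKernelToLLVMPass());
}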
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h" + +namespace mlir { +namespace kernel_gen { +namespace transforms { +namespace { + +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/mlir/tools/kernel_gen/transforms/kernel_gen_passes.h.inc" + +struct UnfuseBatchNormPass + : public UnfuseBatchNormPassBase { + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::mhlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); + mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); + } +}; + +} // namespace + +std::unique_ptr CreateUnfuseBatchNormPass() { + return std::make_unique(); +} + +} // namespace transforms +} // namespace kernel_gen +} // namespace mlir diff --git a/tensorflow/compiler/mlir/utils/array_container_utils.h b/tensorflow/compiler/mlir/utils/array_container_utils.h new file mode 100644 index 00000000000..c1a898185d9 --- /dev/null +++ b/tensorflow/compiler/mlir/utils/array_container_utils.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ + +#include "absl/types/span.h" +#include "llvm/ADT/ArrayRef.h" + +namespace mlir { + +template +inline llvm::ArrayRef SpanToArrayRef(absl::Span span) { + return llvm::ArrayRef(span.data(), span.size()); +} + +template +inline llvm::ArrayRef SpanToArrayRef(absl::Span span) { + return llvm::ArrayRef(span.data(), span.size()); +} + +template +inline llvm::MutableArrayRef SpanToMutableArrayRef(absl::Span span) { + return llvm::MutableArrayRef(span.data(), span.size()); +} + +template +inline absl::Span ArrayRefToSpan(llvm::ArrayRef ref) { + return absl::Span(ref.data(), ref.size()); +} + +template +inline absl::Span MutableArrayRefToSpan(llvm::MutableArrayRef ref) { + return absl::Span(ref.data(), ref.size()); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_ARRAY_CONTAINER_UTILS_H_ diff --git a/tensorflow/compiler/mlir/utils/name_utils.cc b/tensorflow/compiler/mlir/utils/name_utils.cc new file mode 100644 index 00000000000..bc4e80f5aa1 --- /dev/null +++ b/tensorflow/compiler/mlir/utils/name_utils.cc @@ -0,0 +1,99 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
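A small usage sketch for the container bridges defined in array_container_utils.h above; the element type and the surrounding function are illustrative.

#include <cstdint>
#include <vector>
#include "absl/types/span.h"
#include "tensorflow/compiler/mlir/utils/array_container_utils.h"

// Bridges absl::Span-based XLA/TF code and llvm::ArrayRef-based MLIR code
// without copying the underlying buffer.
void RoundTrip(const std::vector<int64_t> &dims) {
  absl::Span<const int64_t> span = absl::MakeConstSpan(dims);
  llvm::ArrayRef<int64_t> ref = mlir::SpanToArrayRef(span);
  absl::Span<const int64_t> back = mlir::ArrayRefToSpan(ref);
  (void)back;  // Both views alias `dims`; no ownership is transferred.
}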
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/utils/name_utils.h" + +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "mlir/IR/Identifier.h" // from @llvm-project + +namespace mlir { + +namespace { +// Checks if a character is legal for a TensorFlow node name, with special +// handling if a character is at the beginning. +bool IsLegalChar(char c, bool first_char) { + if (isalpha(c)) return true; + if (isdigit(c)) return true; + if (c == '.') return true; + if (c == '_') return true; + + // First character of a node name can only be a letter, digit, dot or + // underscore. + if (first_char) return false; + + if (c == '/') return true; + if (c == '-') return true; + + return false; +} +} // anonymous namespace + +void LegalizeNodeName(std::string& name) { + if (name.empty()) return; + + if (!IsLegalChar(name[0], /*first_char=*/true)) name[0] = '.'; + + for (char& c : llvm::drop_begin(name, 1)) + if (!IsLegalChar(c, /*first_char=*/false)) c = '.'; +} + +std::string GetNameFromLoc(Location loc) { + llvm::SmallVector loc_names; + llvm::SmallVector locs; + locs.push_back(loc); + bool names_is_nonempty = false; + + while (!locs.empty()) { + Location curr_loc = locs.pop_back_val(); + + if (auto name_loc = curr_loc.dyn_cast()) { + // Add name in NameLoc. For NameLoc we also account for names due to ops + // in functions where the op's name is first. + auto name = name_loc.getName().strref().split('@').first; + loc_names.push_back(name); + if (!name.empty()) names_is_nonempty = true; + continue; + } else if (auto call_loc = curr_loc.dyn_cast()) { + // Add name if CallSiteLoc's callee has a NameLoc (as should be the + // case if imported with DebugInfo). + if (auto name_loc = call_loc.getCallee().dyn_cast()) { + auto name = name_loc.getName().strref().split('@').first; + loc_names.push_back(name); + if (!name.empty()) names_is_nonempty = true; + continue; + } + } else if (auto fused_loc = curr_loc.dyn_cast()) { + // Push all locations in FusedLoc in reverse order, so locations are + // visited based on order in FusedLoc. + auto reversed_fused_locs = llvm::reverse(fused_loc.getLocations()); + locs.append(reversed_fused_locs.begin(), reversed_fused_locs.end()); + continue; + } + + // Location is not a supported, so an empty StringRef is added. + loc_names.push_back(llvm::StringRef()); + } + + if (names_is_nonempty) + return llvm::join(loc_names.begin(), loc_names.end(), ";"); + + return ""; +} + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/utils/name_utils.h b/tensorflow/compiler/mlir/utils/name_utils.h new file mode 100644 index 00000000000..4b08a41feec --- /dev/null +++ b/tensorflow/compiler/mlir/utils/name_utils.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
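A short usage sketch for the two helpers defined in name_utils.cc above and declared in the header that follows; the wrapper function and its fallback name are assumptions.

#include <string>
#include "mlir/IR/Operation.h"
#include "tensorflow/compiler/mlir/utils/name_utils.h"

// Derives a TensorFlow-legal node name from an op's location metadata.
std::string NodeNameForOp(mlir::Operation *op) {
  std::string name = mlir::GetNameFromLoc(op->getLoc());
  if (name.empty()) name = "unknown";  // assumed fallback
  mlir::LegalizeNodeName(name);        // illegal characters become '.'
  return name;
}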
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Location.h" // from @llvm-project + +namespace mlir { + +// Converts characters in name that are considered illegal in TensorFlow Node +// name to '.'. +void LegalizeNodeName(std::string& name); + +// Creates a TensorFlow node name from a location. +std::string GetNameFromLoc(Location loc); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_NAME_UTILS_H_ diff --git a/tensorflow/compiler/mlir/utils/string_container_utils.h b/tensorflow/compiler/mlir/utils/string_container_utils.h new file mode 100644 index 00000000000..fb2fa06ca4d --- /dev/null +++ b/tensorflow/compiler/mlir/utils/string_container_utils.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ + +#include "absl/strings/string_view.h" +#include "llvm/ADT/StringRef.h" + +namespace mlir { + +inline absl::string_view StringRefToView(llvm::StringRef ref) { + return absl::string_view(ref.data(), ref.size()); +} + +inline llvm::StringRef StringViewToRef(absl::string_view view) { + return llvm::StringRef(view.data(), view.size()); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_UTILS_STRING_CONTAINER_UTILS_H_ diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 32a2ed1c272..1919446a365 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -1,5 +1,7 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//third_party/mlir:tblgen.bzl", "gentbl") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") package( default_visibility = [":friends"], @@ -15,6 +17,7 @@ package_group( "//learning/brain/experimental/mlir/...", "//learning/brain/google/xla/kernels/...", "//learning/brain/google/xla/mlir/...", + "//learning/deepmind/partir/...", "//learning/pathways/data_parallel/tf2xla/...", "//platforms/xla/...", "//tensorflow/compiler/mlir/...", @@ -27,6 +30,7 @@ package_group( gentbl( name = "xla_legalize_tf_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ("-gen-rewriters", "transforms/generated_legalize_tf.inc"), ], @@ -132,6 +136,7 @@ cc_library( ":hlo_module_importer", ":hlo_utils", ":mlir_hlo_to_hlo", + ":translate_cl_options", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:lhlo", "//tensorflow/compiler/xla:debug_options_flags", @@ -237,8 +242,8 @@ cc_library( hdrs = ["mlir_hlo_to_hlo.h"], deps = [ ":type_to_shape", + "//tensorflow/compiler/mlir:name_utils", 
"//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/tensorflow:convert_type", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/tf2xla:common", @@ -323,6 +328,16 @@ cc_library( ], ) +cc_library( + name = "translate_cl_options", + srcs = ["xla_mlir_translate_cl.cc"], + hdrs = ["xla_mlir_translate_cl.h"], + deps = [ + "@llvm-project//llvm:Support", + ], + alwayslink = 1, +) + cc_library( name = "xla_mlir_translate", srcs = ["xla_mlir_translate.cc"], @@ -331,6 +346,7 @@ cc_library( ":hlo_to_mlir_hlo", ":mhlo_to_lhlo_with_xla", ":mlir_hlo_to_hlo", + ":translate_cl_options", "//tensorflow/compiler/jit:xla_cpu_jit", "//tensorflow/compiler/jit:xla_gpu_jit", "//tensorflow/compiler/mlir/hlo", @@ -361,6 +377,7 @@ tf_native_cc_binary( gentbl( name = "operator_writer_inc", + compatible_with = get_compatible_with_cloud(), tbl_outs = [("", "operator_writers.inc")], tblgen = ":operator_writer_gen", td_file = "//tensorflow/compiler/mlir/hlo:include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td", @@ -389,14 +406,12 @@ cc_library( ":xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:chlo_legalize_to_hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/hlo:legalize_control_flow", - "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", "//tensorflow/compiler/mlir/hlo:legalize_to_standard", + "//tensorflow/compiler/mlir/hlo:legalize_trigonometric_to_approximation", "//tensorflow/compiler/mlir/hlo:lhlo", - "//tensorflow/compiler/mlir/hlo:lhlo_copy_removal", "//tensorflow/compiler/mlir/hlo:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_gpu", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index a63fc12c285..253156b44a5 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -46,13 +46,11 @@ limitations under the License. using llvm::APInt; using llvm::makeArrayRef; -using mlir::DenseElementsAttr; using mlir::DenseIntElementsAttr; using mlir::FuncOp; using mlir::NamedAttribute; using mlir::Operation; using mlir::RankedTensorType; -using mlir::ShapedType; using mlir::Type; using mlir::Value; @@ -142,31 +140,42 @@ tensorflow::Status HloFunctionImporter::ImportAsRegion( return ImportInstructions(computation, block); } -tensorflow::Status HloFunctionImporter::ImportInstructions( - const HloComputation& computation, mlir::Block* block) { +StatusOr HloFunctionImporter::ImportInstructionsImpl( + const xla::HloComputation& computation, + const llvm::SmallVectorImpl& arguments, mlir::OpBuilder* builder) { // Setup the input parameters. 
const int num_parameters = computation.num_parameters(); + + if (arguments.size() != num_parameters) + return InvalidArgument("Caller vs callee argument sizes do not match"); + for (int i = 0; i < num_parameters; i++) { auto hlo_parameter = computation.parameter_instruction(i); - instruction_value_map_[hlo_parameter] = block->getArgument(i); + instruction_value_map_[hlo_parameter] = arguments[i]; } - mlir::OpBuilder builder = mlir::OpBuilder::atBlockEnd(block); for (auto instruction : computation.MakeInstructionPostOrder()) { TF_ASSIGN_OR_RETURN(auto new_operation, - ImportInstruction(instruction, &builder)); + ImportInstruction(instruction, builder)); if (new_operation) { instruction_value_map_[instruction] = new_operation->getResult(0); } } + // Setup the return type (HLO only supports a single return value). + return GetMlirValue(computation.root_instruction()); +} + +Status HloFunctionImporter::ImportInstructions( + const HloComputation& computation, mlir::Block* block) { + llvm::SmallVector arguments(block->args_begin(), block->args_end()); + mlir::OpBuilder builder = mlir::OpBuilder::atBlockEnd(block); + TF_ASSIGN_OR_RETURN(Value result, + ImportInstructionsImpl(computation, arguments, &builder)); + // TODO(suderman): Add location tracking details. mlir::Location loc = builder.getUnknownLoc(); - // Setup the return type (HLO only supports a single return value). - TF_ASSIGN_OR_RETURN(auto result, - GetMlirValue(computation.root_instruction())); - // Create terminator op depending on the parent op of this region. if (llvm::isa(block->getParentOp())) { builder.create(loc, result); @@ -176,15 +185,29 @@ tensorflow::Status HloFunctionImporter::ImportInstructions( return tensorflow::Status::OK(); } -StatusOr HloFunctionImporter::ImportInstruction( +StatusOr HloFunctionImporter::ImportInstructions( + const xla::HloComputation& computation, + const llvm::SmallVectorImpl& arguments, mlir::OpBuilder* builder) { + mlir::Block* block = builder->getBlock(); + if (block == nullptr) + return InvalidArgument( + "ImportInstructions requires a valid block in the builder"); + + HloFunctionImporter importer( + block->getParent()->getParentOfType(), {}, builder); + return importer.ImportInstructionsImpl(computation, arguments, builder); +} + +StatusOr HloFunctionImporter::ImportInstructionImpl( HloInstruction* instruction, mlir::OpBuilder* func_builder) { TF_ASSIGN_OR_RETURN(auto operands, GetOperands(instruction)); TF_ASSIGN_OR_RETURN(auto result_type, ConvertShapeToType( instruction->shape(), *builder_)); - llvm::SmallVector attributes = {builder_->getNamedAttr( - "name", builder_->getStringAttr(instruction->name()))}; - mlir::Location loc = func_builder->getUnknownLoc(); + mlir::Location loc = + mlir::NameLoc::get(func_builder->getIdentifier(instruction->name()), + func_builder->getContext()); + llvm::SmallVector attributes; switch (instruction->opcode()) { case HloOpcode::kParameter: { return nullptr; @@ -216,8 +239,8 @@ StatusOr HloFunctionImporter::ImportInstruction( return new_operation; \ } case HloOpcode::kBroadcast: { - // Note that the HLO broadcast is more powerful than the XLA broadcast op. - // BroadcastInDim offers a superset of the HLO op's functionality. + // Note that the HLO broadcast is more powerful than the XLA broadcast + // op. BroadcastInDim offers a superset of the HLO op's functionality. 
attributes.push_back( builder_->getNamedAttr("broadcast_dimensions", ConvertDimensions(instruction->dimensions()))); @@ -419,13 +442,27 @@ StatusOr HloFunctionImporter::ImportInstruction( } case HloOpcode::kSort: { auto sort_instruction = Cast(instruction); + + llvm::SmallVector return_types = {result_type}; + if (mlir::TupleType tuple_ty = result_type.dyn_cast()) { + return_types = llvm::to_vector<6>(tuple_ty.getTypes()); + } + auto sort_op = func_builder->create( - loc, result_type, operands, + loc, return_types, operands, builder_->getI64IntegerAttr(sort_instruction->sort_dimension()), builder_->getBoolAttr(sort_instruction->is_stable())); TF_RETURN_IF_ERROR( ImportAsRegion(*sort_instruction->to_apply(), &sort_op.comparator())); - return sort_op.getOperation(); + + // Check if the output needs to be tupled. + if (return_types.size() == 1 && return_types.front() == result_type) { + return sort_op.getOperation(); + } + + return func_builder + ->create(loc, result_type, sort_op.getResults()) + .getOperation(); } case HloOpcode::kConditional: { llvm::SmallVector rets; @@ -446,7 +483,8 @@ StatusOr HloFunctionImporter::ImportInstruction( return op.getOperation(); } - // Otherwise, it is a indexed conditional and should be mapped to Case op. + // Otherwise, it is a indexed conditional and should be mapped to Case + // op. TF_RETURN_IF_ERROR(GetMlirTypes( {instruction->branch_computation(0)->root_instruction()}, &rets)); @@ -462,8 +500,8 @@ StatusOr HloFunctionImporter::ImportInstruction( return op.getOperation(); } case HloOpcode::kConcatenate: { - // TODO(b/132057942): Support taking an uint64_t instead of an IntegerAttr - // for concatenate dimension. + // TODO(b/132057942): Support taking an uint64_t instead of an + // IntegerAttr for concatenate dimension. return func_builder ->create( loc, result_type, operands, @@ -667,6 +705,7 @@ StatusOr HloFunctionImporter::ImportInstruction( NoAttributeCase(kAnd, AndOp); NoAttributeCase(kAtan2, Atan2Op); NoAttributeCase(kBitcastConvert, BitcastConvertOp); + NoAttributeCase(kCbrt, CbrtOp); NoAttributeCase(kConvert, ConvertOp); NoAttributeCase(kCeil, CeilOp); NoAttributeCase(kClamp, ClampOp); @@ -691,9 +730,9 @@ StatusOr HloFunctionImporter::ImportInstruction( NoAttributeCase(kReal, RealOp); NoAttributeCase(kRemainder, RemOp); NoAttributeCase(kReplicaId, ReplicaIdOp); - // The dimensions attribute is not present on the HLO Reshape instruction. - // If dimensions are non-default, the XLA builder implements it as a - // separate transpose. + // The dimensions attribute is not present on the HLO Reshape + // instruction. If dimensions are non-default, the XLA builder + // implements it as a separate transpose. NoAttributeCase(kReshape, ReshapeOp); NoAttributeCase(kRoundNearestAfz, RoundOp); NoAttributeCase(kRsqrt, RsqrtOp); @@ -708,9 +747,9 @@ StatusOr HloFunctionImporter::ImportInstruction( NoAttributeCase(kTanh, TanhOp); NoAttributeCase(kTuple, TupleOp); NoAttributeCase(kXor, XorOp); - // TODO(b/129422361) Copy needs special handling because it is not defined - // in tensorflow/compiler/xla/client/xla_builder.h. - // See operation semantics in + // TODO(b/129422361) Copy needs special handling because it is not + // defined in tensorflow/compiler/xla/client/xla_builder.h. 
See + // operation semantics in // g3doc/platforms/xla/g3doc/internal/hlo_semantics#copy NoAttributeCase(kCopy, CopyOp); #undef NoAttributeCase @@ -724,6 +763,20 @@ StatusOr HloFunctionImporter::ImportInstruction( &fusion.fused_computation())); return fusion.getOperation(); } + case HloOpcode::kBitcast: + return func_builder + ->create(loc, result_type, operands, + attributes) + .getOperation(); + case HloOpcode::kReducePrecision: { + auto op = func_builder->create( + loc, result_type, operands[0], attributes); + op.exponent_bitsAttr(func_builder->getIntegerAttr( + func_builder->getI32Type(), instruction->exponent_bits())); + op.mantissa_bitsAttr(func_builder->getIntegerAttr( + func_builder->getI32Type(), instruction->mantissa_bits())); + return op.getOperation(); + } case HloOpcode::kAddDependency: // Arbitrary op code that I suspect we will not implement for quite a // while and allows testing handling of unknown ops. Selected because it @@ -742,6 +795,28 @@ StatusOr HloFunctionImporter::ImportInstruction( } } +StatusOr HloFunctionImporter::ImportInstruction( + HloInstruction* instruction, mlir::OpBuilder* func_builder) { + TF_ASSIGN_OR_RETURN(mlir::Operation * op, + ImportInstructionImpl(instruction, func_builder)); + if (op == nullptr) return op; + + // See MlirToHloConversionOptions for more about layouts. + // + // Minor-to-major is a permutation of [0, rank), presenting tensor dimensions + // in physical minor-to-major order. + if (instruction->shape().IsArray() && + instruction->shape().layout() != + LayoutUtil::MakeDescendingLayout( + instruction->shape().dimensions().size())) { + llvm::SmallVector minor_to_major( + instruction->shape().layout().minor_to_major().begin(), + instruction->shape().layout().minor_to_major().end()); + op->setAttr("minor_to_major", builder_->getIndexTensorAttr(minor_to_major)); + } + return op; +} + StatusOr> HloFunctionImporter::GetOperands( HloInstruction* instruction) { llvm::SmallVector operands; diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index e0cc89004cf..4a75b079d76 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -55,6 +55,13 @@ class HloFunctionImporter { static Status ImportAsRegion(const xla::HloComputation& computation, mlir::Region* region, mlir::Builder* builder); + // Imports the given computation to the given place specified by `builder`. + // `arguments` contains values for all parameters. + static StatusOr ImportInstructions( + const xla::HloComputation& computation, + const llvm::SmallVectorImpl& arguments, + mlir::OpBuilder* builder); + private: HloFunctionImporter(mlir::ModuleOp module, std::unordered_map ImportInstructionsImpl( + const xla::HloComputation& computation, + const llvm::SmallVectorImpl& arguments, + mlir::OpBuilder* builder); // Imports an instruction. StatusOr ImportInstruction(xla::HloInstruction* instruction, mlir::OpBuilder* func_builder); + StatusOr ImportInstructionImpl( + HloInstruction* instruction, mlir::OpBuilder* func_builder); // Gets the MLIR operand values from an HLO Instruction. 
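Note on the new static HloFunctionImporter::ImportInstructions overload declared above: it imports a computation's instructions directly at the builder's current insertion point, substituting caller-supplied MLIR values for the computation's parameters instead of materializing a separate region, and it fails with InvalidArgument when the argument count does not match the parameter count. A minimal caller-side sketch, assuming a caller that already holds an xla::HloComputation (`reducer`) and two MLIR values (`lhs`, `rhs`) — these names are illustrative and not part of this change:

  // Inline `reducer` at the builder's insertion point, mapping its two
  // parameters to `lhs` and `rhs`; the returned value is the imported root.
  llvm::SmallVector<mlir::Value, 4> arguments = {lhs, rhs};
  TF_ASSIGN_OR_RETURN(
      mlir::Value root,
      xla::HloFunctionImporter::ImportInstructions(reducer, arguments,
                                                   &builder));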
StatusOr> GetOperands( diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index ac5e01a0abf..daea2d9b8f6 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -135,12 +135,16 @@ StatusOr MlirHloBuilder::CustomCallInternal( const string& call_target_name, absl::Span operands, const Shape& shape, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect) { + bool has_side_effect, + absl::Span>> + output_operand_aliasing) { if (operand_shapes_with_layout.has_value()) return Unimplemented( "CustomCall doesn't support operands shapes with layout"); TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( shape, builder_)); + TF_RET_CHECK(output_operand_aliasing.empty()) + << "MLIR CustomCallOp does not support output_operand_aliasing yet"; auto op = builder_.create( loc_, ty, GetValues(operands), builder_.getStringAttr(call_target_name), /*has_side_effect=*/builder_.getBoolAttr(has_side_effect), @@ -239,11 +243,22 @@ StatusOr MlirHloBuilder::SortInternal(const Shape& shape, int64 dimension, bool is_stable) { TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( shape, builder_)); + llvm::SmallVector sort_types = {ty}; + if (auto tuple_ty = ty.dyn_cast()) { + sort_types = llvm::to_vector<6>(tuple_ty.getTypes()); + } + auto op = builder_.create( - loc_, ty, GetValues(operands), builder_.getI64IntegerAttr(dimension), - builder_.getBoolAttr(is_stable)); + loc_, sort_types, GetValues(operands), + builder_.getI64IntegerAttr(dimension), builder_.getBoolAttr(is_stable)); TF_RETURN_IF_ERROR(ImportComputation(comparator.proto(), &op.comparator())); - return MakeXlaOp(op); + + if (ty.isa()) { + auto tuple = builder_.create(loc_, op.getResults()); + return MakeXlaOp(tuple); + } + + return MakeXlaOp(op.getResult(0)); } StatusOr MlirHloBuilder::WhileInternal(const Shape& shape, diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index 00b7aa4d0b0..59b4bc7b1e0 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -135,7 +135,9 @@ class MlirHloBuilder : public XlaBuilder { const string& call_target_name, absl::Span operands, const Shape& shape, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect) override; + bool has_side_effect, + absl::Span>> + output_operand_aliasing) override; StatusOr ReduceInternal( const Shape& shape, absl::Span all_operands, diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 5398cd70777..ccfcebab60e 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -41,6 +41,7 @@ limitations under the License. #include "mlir/IR/UseDefLists.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/utils/name_utils.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" @@ -105,6 +106,9 @@ static mlir::LogicalResult GetXlaOp( // TODO(hpucha): This should be consolidated into a general place. 
static int ConvertAPInt(llvm::APInt i) { return i.getSExtValue(); } +static uint32_t Convertuint32_t(uint32_t i) { return i; } +static uint64_t Convertuint64_t(uint64_t i) { return i; } + // Convert APFloat to double. static double ConvertAPFloat(llvm::APFloat value) { const auto& semantics = value.getSemantics(); @@ -430,6 +434,27 @@ static xla::FrontendAttributes CreateOpFrontendAttributesFromAttribute( return frontend_attributes; } +// Returns a OpMetadata proto based on the location of the op. If the location +// is unknown, an empty proto is returned. `op_name` are populated with the op +// location (converted). FileLineColLoc locations are populated by taking the +// file name and line number, and populating `source_file` and `source_line` +// respectively. +static xla::OpMetadata CreateOpMetadataFromLocation(mlir::Operation* op) { + xla::OpMetadata metadata; + if (op->getLoc().isa()) return metadata; + + std::string name = mlir::GetNameFromLoc(op->getLoc()); + mlir::LegalizeNodeName(name); + metadata.set_op_name(name); + + if (auto file_line_col_loc = op->getLoc().dyn_cast()) { + metadata.set_source_file(file_line_col_loc.getFilename().str()); + metadata.set_source_line(file_line_col_loc.getLine()); + } + + return metadata; +} + // Checks if all shardings are set. static bool AllOptionalShardingsAreSet( llvm::ArrayRef> shardings) { @@ -474,12 +499,14 @@ class ConvertToHloModule { // single value. explicit ConvertToHloModule( mlir::ModuleOp module, bool use_tuple_args, bool return_tuple, - tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn) + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + MlirToHloConversionOptions options) : module_(module), module_builder_("main"), use_tuple_args_(use_tuple_args), return_tuple_(return_tuple), - shape_representation_fn_(shape_representation_fn) { + shape_representation_fn_(shape_representation_fn), + options_(options) { if (!shape_representation_fn_) shape_representation_fn_ = tensorflow::IdentityShapeRepresentationFn(); } @@ -560,6 +587,8 @@ class ConvertToHloModule { // Unique suffix to give to the name of the next lowered region. size_t region_id_ = 0; + + MlirToHloConversionOptions options_; }; } // namespace @@ -761,7 +790,7 @@ LogicalResult ExportXlaOp(InfeedOp op, OpLoweringContext ctx) { LogicalResult ExportXlaOp(IotaOp op, OpLoweringContext ctx) { auto& value_map = *ctx.values; value_map[op] = xla::Iota(ctx.builder, xla::TypeToShape(op.getType()), - op.iota_dimension().getSExtValue()); + op.iota_dimension()); return success(); } @@ -887,8 +916,8 @@ LogicalResult ExportXlaOp(RngBitGeneratorOp op, OpLoweringContext ctx) { auto result = op.getResult(); auto xla_arg_1 = value_map[*op.getODSOperands(0).begin()]; auto xla_result = xla::RngBitGenerator( - static_cast(op.rng_algorithm().getSExtValue()), - Unwrap(xla_arg_1), xla::TypeToShape(result.getType()).tuple_shapes(1)); + static_cast(op.rng_algorithm()), Unwrap(xla_arg_1), + xla::TypeToShape(result.getType()).tuple_shapes(1)); value_map[result] = xla_result; return mlir::success(); } @@ -983,9 +1012,14 @@ LogicalResult ExportXlaOp(SortOp op, OpLoweringContext ctx) { &comparator))) return failure(); + auto tupled = xla::Sort(GetTuple(op.operands(), ctx), comparator, + op.dimension(), op.is_stable()); + auto& value_map = *ctx.values; - value_map[op] = xla::Sort(GetTuple(op.operands(), ctx), comparator, - op.dimension().getSExtValue(), op.is_stable()); + // MLIR's sort supports multiple returns, untuple all the results of XLA's. 
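  // Illustrative note, not part of this change: when mhlo.sort has several
  // operands, xla::Sort yields a single tuple-shaped value, while the MLIR op
  // now returns each sorted operand as a separate result. The loop below
  // bridges the two by mapping result i of the MLIR op to element i of the
  // XLA tuple. Hand-written for a hypothetical two-operand (keys/values)
  // sort, the same mapping would read:
  //   xla::XlaOp sorted_keys = xla::GetTupleElement(tupled, 0);
  //   xla::XlaOp sorted_values = xla::GetTupleElement(tupled, 1);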
+ for (auto it : llvm::enumerate(op.getResults())) { + value_map[it.value()] = xla::GetTupleElement(tupled, it.index()); + } return success(); } @@ -1034,7 +1068,7 @@ LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) { llvm::SmallVector operands; for (auto operand : op.operands()) operands.push_back(values[operand]); - xla::XlaOp fusion = xla::internal::XlaBuilderBuildFusion( + xla::XlaOp fusion = xla::internal::XlaBuilderFriend::BuildFusion( ctx.builder, operands, absl::string_view(op.fusion_kind()->data(), op.fusion_kind()->size()), fused_computation); @@ -1048,6 +1082,15 @@ LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) { return success(); } +LogicalResult ExportXlaOp(BitcastOp op, OpLoweringContext ctx) { + auto& value_map = *ctx.values; + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + value_map[op] = xla::internal::XlaBuilderFriend::BuildBitcast( + ctx.builder, operand, xla::TypeToShape(op.getType())); + return success(); +} + } // namespace } // namespace mhlo } // namespace mlir @@ -1057,18 +1100,19 @@ LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) { namespace mlir { namespace { -StatusOr CreateLiteralFromAttr(ElementsAttr attr) { +StatusOr CreateArrayLiteralFromAttr(ElementsAttr attr, + xla::Layout layout) { if (attr.isa()) return tensorflow::errors::Unimplemented( "Opaque elements attr not supported"); xla::Shape shape = xla::TypeToShape(attr.getType()); -#define ELEMENTS_ATTR_TO_LITERAL(xla_type, cpp_type) \ - case xla_type: { \ - xla::Array source_data(shape.dimensions()); \ - source_data.SetValues(attr.getValues()); \ - return xla::LiteralUtil::CreateFromArray(source_data); \ +#define ELEMENTS_ATTR_TO_LITERAL(xla_type, cpp_type) \ + case xla_type: { \ + xla::Array source_data(shape.dimensions()); \ + source_data.SetValues(attr.getValues()); \ + return xla::LiteralUtil::CreateFromArrayWithLayout(source_data, layout); \ } switch (shape.element_type()) { @@ -1098,7 +1142,7 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { } xla::Array source_data(shape.dimensions()); source_data.SetValues(values); - return xla::LiteralUtil::CreateFromArray(source_data); + return xla::LiteralUtil::CreateFromArrayWithLayout(source_data, layout); } case xla::PrimitiveType::BF16: { xla::Array source_data(shape.dimensions()); @@ -1115,7 +1159,7 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { } source_data.SetValues(values_double); return xla::LiteralUtil::ConvertF64ToBF16( - xla::LiteralUtil::CreateFromArray(source_data)); + xla::LiteralUtil::CreateFromArrayWithLayout(source_data, layout)); } default: return tensorflow::errors::Internal(absl::StrCat( @@ -1124,13 +1168,46 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { #undef ELEMENTS_ATTR_TO_LITERAL } +xla::Layout ExtractLayout(mlir::Operation* op, int rank) { + if (auto attr = + op->getAttrOfType("minor_to_major")) { + llvm::SmallVector minor_to_major; + minor_to_major.reserve(attr.size()); + for (const llvm::APInt& i : attr) { + minor_to_major.push_back(i.getZExtValue()); + } + return xla::LayoutUtil::MakeLayout(minor_to_major); + } + return xla::LayoutUtil::MakeDescendingLayout(rank); +} + LogicalResult ConvertToHloModule::Lower( mlir::Operation* inst, bool is_entry_function, llvm::ArrayRef> ret_shardings, xla::XlaBuilder* builder, ConvertToHloModule::ValueLoweringMap* value_lowering, xla::XlaComputation* result) { + // See MlirToHloConversionOptions for more about layouts. 
+ auto propagate_layouts = [this](mlir::Operation* inst, xla::XlaOp xla_op) { + if (options_.propagate_layouts) { + auto* shape = xla::internal::XlaBuilderFriend::GetInstruction(xla_op) + ->mutable_shape(); + if (shape->tuple_shapes().empty()) + *shape->mutable_layout() = + ExtractLayout(inst, shape->dimensions().size()).ToProto(); + } + }; + if (succeeded(ExportXlaOperator(inst, {value_lowering, this, builder}))) { + if (inst->getNumResults() == 1) { + auto iter = value_lowering->find(inst->getResult(0)); + if (iter == value_lowering->end()) { + inst->emitOpError( + "inst has a result, but it's not found in value_lowering"); + return failure(); + } + propagate_layouts(inst, iter->second); + } return success(); } @@ -1156,16 +1233,19 @@ LogicalResult ConvertToHloModule::Lower( if (failed(GetXlaOp(operand, value_map, &xla_operand, op))) return failure(); value_map[op.getResult()] = xla_operand; + propagate_layouts(inst, xla_operand); return success(); } - // TODO(jpienaar): This doesn't support layouts yet. if (matchPattern(inst, m_Constant(&const_attr))) { - auto literal_or = CreateLiteralFromAttr(const_attr); + xla::Layout layout; + layout = ExtractLayout(inst, const_attr.getType().getRank()); + auto literal_or = CreateArrayLiteralFromAttr(const_attr, layout); if (!literal_or.ok()) return inst->emitError(literal_or.status().ToString()); - value_map[inst->getResult(0)] = - xla::ConstantLiteral(builder, literal_or.ValueOrDie()); + auto constant = xla::ConstantLiteral(builder, literal_or.ValueOrDie()); + value_map[inst->getResult(0)] = constant; + return success(); } @@ -1618,22 +1698,24 @@ LogicalResult AddDynamicParameterBindings(mlir::ModuleOp module, } // namespace Status ConvertRegionToComputation(mlir::Region* region, - xla::XlaComputation* func) { + xla::XlaComputation* func, + MlirToHloConversionOptions options) { mlir::ModuleOp module; - ConvertToHloModule converter(module, true, true, {}); + ConvertToHloModule converter(module, true, true, {}, options); if (failed(converter.LowerRegionAsComputation(region, func))) return tensorflow::errors::Internal( "failed to convert region to computation"); return Status::OK(); } -Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto, - bool use_tuple_args, bool return_tuple, - const tensorflow::XlaHelpers::ShapeRepresentationFn - shape_representation_fn) { +Status ConvertMlirHloToHlo( + mlir::ModuleOp module, xla::HloProto* hlo_proto, bool use_tuple_args, + bool return_tuple, + const tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + MlirToHloConversionOptions options) { mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); ConvertToHloModule converter(module, use_tuple_args, return_tuple, - shape_representation_fn); + shape_representation_fn, options); if (failed(converter.Run())) return diag_handler.ConsumeStatus(); auto hlo_module = converter.ConsumeMainProto(); hlo_proto->mutable_hlo_module()->Swap(&hlo_module); diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h index 6f2b5a6db95..4ca3e586128 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h @@ -25,6 +25,18 @@ limitations under the License. namespace mlir { +struct MlirToHloConversionOptions { + // Best-effort propagation of the layouts. These layouts serve as performance + // hints to the backend. 
+ // + // Note that non-array shapes are not carrying layouts, and users have to + // figure out the proper layouts of them through context. This is one of the + // reasons why the attribute-based solution is temporary. + // + // TODO(timshen): Investigate the necessity of having layouts in MHLO. + bool propagate_layouts = false; +}; + // Converts a MLIR module in HLO dialect into a HloModuleProto. If // use_tuple_args is set, then the entry computations's arguments are converted // to a tuple and passed as a single parameter. @@ -32,15 +44,19 @@ namespace mlir { // are converted to a tuple even when there is only a single return value. // Multiple return values are always converted to a tuple and returned as a // single value. +// +// TODO(timshen): move other options into `options`. Status ConvertMlirHloToHlo(mlir::ModuleOp module, ::xla::HloProto* hlo_proto, bool use_tuple_args, bool return_tuple, const tensorflow::XlaHelpers::ShapeRepresentationFn - shape_representation_fn = nullptr); + shape_representation_fn = nullptr, + MlirToHloConversionOptions options = {}); // Converts a region to a computation. It returns a standalone module that // contains the converted region as the entry computation. Status ConvertRegionToComputation(mlir::Region* region, - ::xla::XlaComputation* func); + ::xla::XlaComputation* func, + MlirToHloConversionOptions options = {}); // Creates XlaOp equivalent of a given MLIR operation using the operand info // from `value_lowering` map. diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index 407a7d3da38..801c04496f0 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -165,6 +165,11 @@ static bool OperatorWritersMain(raw_ostream& os, RecordKeeper& records) { "frontend_attributes(lowering_context.builder, " "CreateOpFrontendAttributesFromAttribute(op));\n\n"; + // Create a scoped object to assign op metadata to generated XLA ops. + os << " xla::XlaScopedOpMetadataAssignment " + "op_metadata(lowering_context.builder, " + "CreateOpMetadataFromLocation(op));\n\n"; + // Retrieve all the definitions derived from HLO_Op and sort by record name. for (const auto* def : records.getAllDerivedDefinitions("HLO_Op")) { // Skip operations that have a custom exporter. 
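To make the layout plumbing above concrete: a minor-to-major list names tensor dimensions from fastest- to slowest-varying in memory, so for an f32[3,2] array {1,0} is the default descending (row-major) layout and {0,1} is column-major. ExtractLayout only finds a "minor_to_major" attribute when the importer recorded a non-default layout, and otherwise falls back to xla::LayoutUtil::MakeDescendingLayout(rank); the fusion_layouts.hlotxt test added below exercises exactly this for f32[3,2]{0,1}. A minimal caller-side sketch of opting in, assuming only the signatures introduced in this change (the `module` and `hlo_proto` variables are illustrative):

  // Opt in to best-effort layout propagation when lowering MHLO to HLO.
  mlir::MlirToHloConversionOptions options;
  options.propagate_layouts = true;
  TF_RETURN_IF_ERROR(mlir::ConvertMlirHloToHlo(
      module, &hlo_proto, /*use_tuple_args=*/false, /*return_tuple=*/false,
      /*shape_representation_fn=*/nullptr, options));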
diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index 2631e2b6757..754b14f4b13 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") load("//tensorflow:tensorflow.bzl", "tf_cc_test") diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/fusion_layouts.hlotxt b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/fusion_layouts.hlotxt new file mode 100644 index 00000000000..781e203510b --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/fusion_layouts.hlotxt @@ -0,0 +1,16 @@ +// RUN: tf-mlir-translate -hlo-text-to-lhlo -optimize-xla-hlo=false %s | FileCheck %s + +HloModule TestModule + +// CHECK: func @TestComputation + +FusedComputation { + // CHECK: tensor_load %arg0 {minor_to_major = dense<[0, 1]> : tensor<2xindex>} + x = f32[3, 2]{0,1} parameter(0) + ROOT y = f32[3, 2]{0,1} add(x, x) +} + +ENTRY TestComputation { + x = f32[3, 2]{0,1} parameter(0) + ROOT y = f32[3, 2]{0,1} fusion(x), kind=kLoop, calls=FusedComputation +} diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir index 2e1b63b0db7..e7312e2114c 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/ops.mlir @@ -316,12 +316,61 @@ func @main(%value0: tensor<2x2xf32>) -> tensor<2x2xf32> { // CHECK: %[[VIEW0:.*]] = std.view %[[ARG2]]{{.*}} : memref<100xi8> to memref<5x5xi32> // CHECK: %[[VIEW1:.*]] = std.view %[[ARG3]]{{.*}} : memref<100xi8> to memref<5x5xf32> // CHECK: "lmhlo.sort"(%[[ARG0]], %[[ARG1]], %[[VIEW0]], %[[VIEW1]]) -func @main(%key: tensor<5x5xi32>, %value: tensor<5x5xf32>) -> tuple, tensor<5x5xf32>> { - %res = "mhlo.sort"(%key, %value) ({ +func @main(%key: tensor<5x5xi32>, %value: tensor<5x5xf32>) -> (tensor<5x5xi32>, tensor<5x5xf32>) { + %res:2 = "mhlo.sort"(%key, %value) ({ ^bb0(%a: tensor, %b: tensor, %c: tensor, %d: tensor): %ret = "mhlo.compare"(%c, %d) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%ret) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true}: (tensor<5x5xi32>, tensor<5x5xf32>) -> tuple, tensor<5x5xf32>> + }) {dimension = 1 : i64, is_stable = true}: (tensor<5x5xi32>, tensor<5x5xf32>) -> (tensor<5x5xi32>, tensor<5x5xf32>) - return %res : tuple, tensor<5x5xf32>> + return %res#0, %res#1 : tensor<5x5xi32>, tensor<5x5xf32> +} + +// ----- + +// CHECK-LABEL: func @main +// CHECK-SAME: %[[ARG0:.*]]: memref {{.*}}lmhlo.params = 0 +// CHECK-SAME: %[[ARG1:.*]]: memref {{.*}}lmhlo.params = 1 +// CHECK-SAME: %[[ARG2:.*]]: memref<4xi8> +// CHECK: "lmhlo.fusion"() ( { +// CHECK: %[[VAR0:.*]] = tensor_load %[[ARG0]] : memref +// CHECK: %[[VAR1:.*]] = tensor_load %[[ARG1]] : memref +// CHECK: %[[VAR2:.*]] = mhlo.add %[[VAR0]], %[[VAR1]] : tensor +// CHECK: tensor_store %[[VAR2]], %[[MEMREF:.*]] : memref +// CHECK: "lmhlo.terminator"() : () -> () +// CHECK: }) : () -> () +func @main(%arg0: tensor, %arg1: tensor) -> tensor { + %result = "mhlo.fusion"(%arg0, %arg1) ( { + ^bb0(%arg2: tensor, %arg3: tensor): + %result = "mhlo.add"(%arg2, %arg3): (tensor, tensor) -> tensor + "mhlo.return"(%result) : (tensor) -> () + }) { fusion_kind = "kLoop" } : (tensor, tensor) -> tensor + + return %result : tensor +} + +// ----- + +// CHECK-LABEL: func @main 
+// CHECK: "lmhlo.fusion"() ( { +// CHECK: %[[VAL0:.*]] = tensor_load %{{.*}} : memref +// CHECK: %[[VAL1:.*]] = tensor_load %{{.*}} : memref +// CHECK: %[[VAL2:.*]] = tensor_load %{{.*}} : memref +// CHECK: tensor_store %[[VAL0]], %{{.*}} : memref +// CHECK: tensor_store %[[VAL1]], %{{.*}} : memref +// CHECK: tensor_store %[[VAL2]], %{{.*}} : memref +// CHECK: "lmhlo.terminator"() : () -> () +// CHECK: }) : () -> () +func @main(%arg0: tuple>, tensor>, %arg1: tuple>) -> tuple, tensor, tensor> { + %result = "mhlo.fusion"(%arg0, %arg1) ( { + ^bb0(%arg2: tuple>, tensor>, %arg3: tuple>): + %0 = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} : (tuple>, tensor>) -> tuple> + %1 = "mhlo.get_tuple_element"(%0) {index = 0 : i32} : (tuple>) -> tensor + %2 = "mhlo.get_tuple_element"(%arg2) {index = 1 : i32} : (tuple>, tensor>) -> tensor + %3 = "mhlo.get_tuple_element"(%arg3) {index = 0 : i32} : (tuple>) -> tensor + %4 = "mhlo.tuple"(%1, %2, %3) : (tensor, tensor, tensor) -> tuple, tensor, tensor> + "mhlo.return"(%4) : (tuple, tensor, tensor>) -> () + }) { fusion_kind = "kLoop" } : (tuple>, tensor>, tuple>) -> tuple, tensor, tensor> + + return %result : tuple, tensor, tensor> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index cffb15022b0..5a07d9303f0 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -60,20 +60,20 @@ func @batchmatmulv2_dynamic(%arg0: tensor, %arg1: tensor) return %0 : tensor } -func @batchmatmulv2_adj_real(%arg0: tensor<5x2xf32>, %arg1: tensor<2x4xf32>) -> tensor<5x4xf32> { +func @batchmatmulv2_adj_real(%arg0: tensor<2x5xf32>, %arg1: tensor<4x2xf32>) -> tensor<5x4xf32> { // CHECK-LABEL: func @batchmatmulv2_adj_real // CHECK: "mhlo.dot_general"({{.*}}, {{.*}}) {dot_dimension_numbers = { // CHECK-SAME: lhs_batching_dimensions = dense<> : tensor<0xi64>, // CHECK-SAME: lhs_contracting_dimensions = dense<0> : tensor<1xi64>, // CHECK-SAME: rhs_batching_dimensions = dense<> : tensor<0xi64>, // CHECK-SAME: rhs_contracting_dimensions = dense<1> : tensor<1xi64>}} - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xf32>, tensor<2x4xf32>) -> tensor<5x4xf32> + %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<2x5xf32>, tensor<4x2xf32>) -> tensor<5x4xf32> return %0 : tensor<5x4xf32> } -func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { +func @batchmatmulv2_adj_complex(%arg0: tensor<2x5xcomplex>, %arg1: tensor<4x2xcomplex>) -> tensor<5x4xcomplex> { // CHECK-LABEL: func @batchmatmulv2_adj_complex( -// CHECK-SAME: [[LHS:%.*]]: tensor<5x2xcomplex>, [[RHS:%.*]]: tensor<2x4xcomplex>) -> tensor<5x4xcomplex> { +// CHECK-SAME: [[LHS:%.*]]: tensor<2x5xcomplex>, [[RHS:%.*]]: tensor<4x2xcomplex>) -> tensor<5x4xcomplex> { // CHECK: [[LHSRE:%.*]] = "mhlo.real"([[LHS]]) // CHECK: [[LHSIM:%.*]] = "mhlo.imag"([[LHS]]) // CHECK: [[LHSIMNEG:%.*]] = "mhlo.negate"([[LHSIM]]) @@ -84,6 +84,6 @@ func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2 // CHECK: [[RHSCONJ:%.*]] = "mhlo.complex"([[RHSRE]], [[RHSIMNEG]]) // CHECK: shape.shape_of [[LHSCONJ]] // CHECK: shape.shape_of [[RHSCONJ]] - %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> + %0 = 
"tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<2x5xcomplex>, tensor<4x2xcomplex>) -> tensor<5x4xcomplex> return %0 : tensor<5x4xcomplex> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir index 5f3e40f923f..7f37dbb0479 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-binary-elementwise.mlir @@ -2,6 +2,7 @@ // (unlike the rest), since this is the primary use case for such ops and // verification of shapes and broadcasts is desired. // RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=true" -canonicalize %s | FileCheck %s +// RUN: tf-opt "-xla-legalize-tf=allow-partial-conversion legalize-chlo=false" %s | FileCheck --check-prefix CHLO %s //===----------------------------------------------------------------------===// // Binary op legalizations. @@ -58,6 +59,15 @@ func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor } +// CHECK-LABEL: func @broadcast_add_unranked +// CHLO-LABEL: func @broadcast_add_unranked +func @broadcast_add_unranked(%arg0: tensor<1xi32>, %arg1: tensor<*xi32>) -> tensor<*xi32> { + // CHECK: tf.Add + // CHLO: chlo.broadcast_add %arg0, %arg1 + %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<*xi32>) -> tensor<*xi32> + return %0: tensor<*xi32> +} + // CHECK-LABEL: func @div func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = mhlo.divide %arg0, %arg0 : tensor<2xi32> @@ -139,9 +149,9 @@ func @broadcast_shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<2x4xui8 } // CHECK-LABEL: func @and -func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { +func @and(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { // CHECK-NEXT: mhlo.and - %0 = "tf.LogicalAnd"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -153,9 +163,9 @@ func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { } // CHECK-LABEL: func @or -func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { +func @or(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { // CHECK-NEXT: mhlo.or - %0 = "tf.LogicalOr"(%arg0, %arg0) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> + %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor<2xi1>, tensor<2xi1>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -187,9 +197,9 @@ func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { //===----------------------------------------------------------------------===// // CHECK-LABEL: func @equal -func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} - %0 = "tf.Equal"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} + %0 = "tf.Equal"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -255,9 +265,9 @@ func @equal_unranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi1> } // CHECK-LABEL: func @notequal -func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @notequal(%arg0: tensor<2xi32>, %arg1: 
tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} + %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -268,9 +278,9 @@ func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { //===----------------------------------------------------------------------===// // CHECK-LABEL: func @greater -func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -300,29 +310,29 @@ func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor) -> tensor<*xi1> { +func @greater_uranked(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*xi1> { // CHECK: "tf.Greater" - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + %0 = "tf.Greater"(%arg0, %arg1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> return %0: tensor<*xi1> } // CHECK-LABEL: func @greater_equal -func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @greater_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GE"} + %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } // CHECK-LABEL: func @less -func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @less(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LT"} + %0 = "tf.Less"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } // CHECK-LABEL: func @less_equal -func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @less_equal(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LE"} + %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir index 93eac3821b2..767e0be8d6a 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir @@ -1,27 +1,27 @@ // RUN: tf-opt -xla-legalize-tf-control-flow %s | FileCheck %s // CHECK-LABEL: @if -func @if(%arg0: tensor, %arg1: tensor) -> (tensor) -attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { - // CHECK: [[VAL0:%.+]] = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor) +func @if(%arg0: 
tensor, %arg1: tensor) -> (tensor) { + // CHECK: [[VAL0:%.+]] = "mhlo.compare"([[ARG0]], [[ARG1]]) {comparison_direction = "GT"} : (tensor, tensor) -> tensor %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor - // CHECK: [[VAL1:%.+]] = "mhlo.tuple"(%arg0, %arg1) + // CHECK: [[VAL1:%.+]] = "mhlo.tuple"([[ARG0]], [[ARG1]]) // CHECK: [[VAL2:%.+]] = "mhlo.if"([[VAL0]], [[VAL1]], [[VAL1]]) ( { - // CHECK: ^bb0(%arg2: tuple, tensor>): - // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} - // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"(%arg2) {index = 1 : i32} + // CHECK: ^bb0([[THEN_ARG:%.+]]: tuple, tensor>): + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[THEN_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[THEN_ARG]]) {index = 1 : i32} // CHECK: [[VAL6:%.+]] = call @cond_true([[VAL4]], [[VAL5]]) // CHECK: [[VAL7:%.+]] = "mhlo.tuple"([[VAL6]]) // CHECK: "mhlo.return"([[VAL7]]) : (tuple>) -> () // CHECK: }, { - // CHECK: ^bb0(%arg2: tuple, tensor>) - // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"(%arg2) {index = 0 : i32} - // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"(%arg2) {index = 1 : i32} + // CHECK: ^bb0([[ELSE_ARG:%.+]]: tuple, tensor>) + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[ELSE_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[ELSE_ARG]]) {index = 1 : i32} // CHECK: [[VAL6:%.+]] = call @cond_false([[VAL4]], [[VAL5]]) // CHECK: [[VAL7:%.+]] = "mhlo.tuple"([[VAL6]]) - // CHECK: "mhlo.return"([[VAL7]]) : (tuple>) -> () + // CHECK: "mhlo.return"([[VAL7]]) : (tuple>) -> () // CHECK: }) - %1 = "tf.If"(%0, %arg0, %arg1) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _lower_using_switch_merge = true, _output_shapes = ["tfshape$"], device = "", else_branch = @cond_false, is_stateless = true, name = "cond", output_shapes = [#tf.shape<>], then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor + %1 = "tf.If"(%0, %arg0, %arg1) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor, tensor) -> tensor // CHECK: [[VAL3:%.+]] = "mhlo.get_tuple_element"([[VAL2]]) {index = 0 : i32} // CHECK: return [[VAL3]] @@ -41,6 +41,38 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { } +// CHECK-LABEL: @ifRegion +// CHECK-SAME: ([[ARG0:%.+]]: tensor, [[ARG1:%.+]]: tensor) +func @ifRegion(%arg0: tensor, %arg1: tensor) -> (tensor) { + // CHECK: [[VAL0:%.+]] = "mhlo.compare"([[ARG0]], [[ARG1]]) {comparison_direction = "GT"} + %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor + // CHECK: [[VAL1:%.+]] = "mhlo.tuple"([[ARG0]]) + // CHECK: [[VAL2:%.+]] = "mhlo.tuple"([[ARG1]]) + // CHECK: [[VAL3:%.+]] = "mhlo.if"([[VAL0]], [[VAL1]], [[VAL2]]) ( { + %1 = "tf.IfRegion"(%0) ( { + // CHECK: ^{{[a-z0-9]+}}([[TRUE_ARG:%.+]]: tuple>): + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[TRUE_ARG]]) {index = 0 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.log"([[VAL5]]) + %2 = "mhlo.log"(%arg0) : (tensor) -> tensor + // CHECK: [[VAL7:%.+]] = "mhlo.tuple"([[VAL6]]) + // CHECK: "mhlo.return"([[VAL7]]) + "tf.Yield"(%2) : (tensor) -> () + }, { + // CHECK: ^{{[a-z0-9]+}}([[FALSE_ARG:%.+]]: tuple>): + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[FALSE_ARG]]) {index = 0 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.exponential"([[VAL5]]) + %2 = "mhlo.exponential"(%arg1) : (tensor) -> tensor + // CHECK: [[VAL7:%.+]] = 
"mhlo.tuple"([[VAL6]]) + // CHECK: "mhlo.return"([[VAL7]]) + "tf.Yield"(%2) : (tensor) -> () + // CHECK: }) : (tensor, tuple>, tuple>) -> tuple> + }) {is_stateless = true} : (tensor) -> tensor + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 0 : i32} + // CHECK: return [[VAL4]] + return %1 : tensor +} + + // CHECK-LABEL: func @case // CHECK-SAME: %[[BRANCH_INDEX:.*]]: tensor, %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> (tensor, tensor) func @case(%index: tensor, %arg0: tensor, %arg1: tensor) -> (tensor, tensor) { @@ -85,26 +117,62 @@ func @floor(%arg0: tensor, %arg1: tensor) -> (tensor, tensor } +// CHECK-LABEL: func @caseRegion +// CHECK-SAME: ([[BRANCH_INDEX:%.+]]: tensor, [[ARG0:.+]]: tensor, [[ARG1:%.+]]: tensor) +func @caseRegion(%index: tensor, %arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + // CHECK: [[VAL0:%.+]] = "mhlo.tuple"([[ARG1]]) + // CHECK: [[VAL1:%.+]] = "mhlo.tuple"([[ARG0]], [[ARG1]]) + // CHECK: [[VAL2:%.+]] = "mhlo.tuple"([[ARG0]], [[ARG1]]) + // CHECK: [[VAL3:%.+]]:2 = "mhlo.case"([[BRANCH_INDEX]], [[VAL0]], [[VAL1]], [[VAL2]]) ( { + %0:2 = "tf.CaseRegion"(%index) ( { + // CHECK: ^{{[a-z0-9]+}}([[BRANCH0_ARG:%.+]]: tuple>): + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[BRANCH0_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.exponential"([[VAL4]]) + %1 = "mhlo.exponential"(%arg1) : (tensor) -> tensor + // CHECK: "mhlo.return"([[VAL5]], [[VAL4]]) + "tf.Yield"(%1, %arg1) : (tensor, tensor) -> () + }, { + // CHECK: ^{{[a-z0-9]+}}([[BRANCH1_ARG:%.+]]: tuple, tensor>): + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[BRANCH1_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[BRANCH1_ARG]]) {index = 1 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.log"([[VAL4]]) + %1 = "mhlo.log"(%arg0) : (tensor) -> tensor + // CHECK: "mhlo.return"([[VAL6]], [[VAL5]]) + "tf.Yield"(%1, %arg1) : (tensor, tensor) -> () + }, { + // CHECK: ^{{[a-z0-9]+}}([[BRANCH2_ARG:%.+]]: tuple, tensor>): + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[BRANCH2_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[BRANCH2_ARG]]) {index = 1 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.floor"([[VAL4]]) + %1 = "mhlo.floor"(%arg0) : (tensor) -> tensor + // CHECK: "mhlo.return"([[VAL6]], [[VAL5]]) + "tf.Yield"(%1, %arg1) : (tensor, tensor) -> () + // CHECK: }) : (tensor, tuple>, tuple, tensor>, tuple, tensor>) -> (tensor, tensor) + }) {is_stateless = true} : (tensor) -> (tensor, tensor) + // CHECK: return [[VAL3]]#0, [[VAL3]]#1 : tensor, tensor + return %0#0, %0#1 : tensor, tensor +} + + // CHECK-LABEL: func @while -func @while(%arg0: tensor {tf_saved_model.index_path = [0]}) -> (tensor {tf_saved_model.index_path = []}) -attributes {tf._input_shapes = ["tfshape$"]} { +func @while() -> tensor { // CHECK: [[VAL0:%.+]] = mhlo.constant dense<0> // CHECK: [[VAL1:%.+]] = mhlo.constant dense<-1> %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<-1> : tensor // CHECK: [[VAL2:%.+]] = "mhlo.tuple"([[VAL0]], [[VAL1]], [[VAL0]]) // CHECK: [[VAL3:%.+]] = "mhlo.while"([[VAL2]]) ( { - // CHECK: ^bb0(%arg1: tuple, tensor, tensor>): - // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"(%arg1) {index = 0 : i32} - // CHECK: [[VAL8:%.+]] = "mhlo.get_tuple_element"(%arg1) {index = 1 : i32} - // CHECK: [[VAL9:%.+]] = "mhlo.get_tuple_element"(%arg1) {index = 2 : i32} + // CHECK: ^bb0([[COND_ARG:%.+]]: tuple, tensor, tensor>): + // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 0 : i32} + // CHECK: 
[[VAL8:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 1 : i32} + // CHECK: [[VAL9:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 2 : i32} // CHECK: [[VAL10:%.+]] = call @while_cond([[VAL7]], [[VAL8]], [[VAL9]]) // CHECK: "mhlo.return"([[VAL10]]) // CHECK: }, { - // CHECK: ^bb0(%arg1: tuple, tensor, tensor>): - // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"(%arg1) {index = 0 : i32} - // CHECK: [[VAL8:%.+]] = "mhlo.get_tuple_element"(%arg1) {index = 1 : i32} - // CHECK: [[VAL9:%.+]] = "mhlo.get_tuple_element"(%arg1) {index = 2 : i32} + // CHECK: ^bb0([[BODY_ARG:%.+]]: tuple, tensor, tensor>): + // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 0 : i32} + // CHECK: [[VAL8:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 1 : i32} + // CHECK: [[VAL9:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 2 : i32} // CHECK: [[VAL10:%.+]]:3 = call @while_body([[VAL7]], [[VAL8]], [[VAL9]]) // CHECK: [[VAL11:%.+]] = "mhlo.tuple"([[VAL10]]#0, [[VAL10]]#1, [[VAL10]]#2) // CHECK: "mhlo.return"([[VAL11]]) @@ -113,19 +181,134 @@ attributes {tf._input_shapes = ["tfshape$"]} { // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 1 : i32} // CHECK: [[VAL6:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 2 : i32} // CHECK: return [[VAL6]] - %2:3 = "tf.While"(%0, %1, %0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_INT32", "tfdtype$DT_INT32"], _lower_using_switch_merge = true, _num_original_outputs = 3 : i64, _output_shapes = ["tfshape$", "tfshape$", "tfshape$"], body = @while_body, cond = @while_cond, device = "", is_stateless = true, name = "while", output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>], parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) + %2:3 = "tf.While"(%0, %1, %0) {body = @while_body, cond = @while_cond, is_stateless = true, parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) return %2#2 : tensor } -func @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor -attributes {tf._input_shapes = ["tfshape$", "tfshape$", "tfshape$"]} { +func @while_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { %0 = mhlo.constant dense<10> : tensor %1 = "mhlo.compare"(%arg2, %0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor return %1 : tensor } -func @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor, tensor, tensor) -attributes {tf._input_shapes = ["tfshape$", "tfshape$", "tfshape$"]} { +func @while_body(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> (tensor, tensor, tensor) { %0 = mhlo.constant dense<1> : tensor %1 = mhlo.add %arg2, %0 : tensor %2 = mhlo.add %arg0, %0 : tensor return %2, %arg1, %1 : tensor, tensor, tensor } + + +// CHECK-LABEL: func @whileRegion +func @whileRegion() -> tensor { + // CHECK: [[VAL0:%.+]] = mhlo.constant dense<0> + %0 = mhlo.constant dense<0> : tensor + // CHECK: [[VAL1:%.+]] = mhlo.constant dense<-1> + %1 = mhlo.constant dense<-1> : tensor + // CHECK: [[VAL2:%.+]] = "mhlo.tuple"([[VAL0]], [[VAL1]], [[VAL0]]) + // CHECK: [[VAL3:%.+]] = "mhlo.while"([[VAL2]]) ( { + %2:3 = "tf.WhileRegion"(%0, %1, %0) ( { + // CHECK: ^bb0([[COND_ARG:%.+]]: tuple, tensor, tensor>): + ^cond(%carg0: tensor, %carg1: tensor, %carg2: tensor): + // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 0 : i32} + // CHECK: [[VAL8:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 1 : i32} + // CHECK: [[VAL9:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 2 : i32} + // CHECK: 
[[VAL10:%.+]] = mhlo.constant dense<10> + %3 = mhlo.constant dense<10> : tensor + // CHECK: [[VAL11:%.+]] = "mhlo.compare"([[VAL9]], [[VAL10]]) {comparison_direction = "LT"} + %4 = "mhlo.compare"(%carg2, %3) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + // CHECK: "mhlo.return"([[VAL11]]) + "tf.Yield"(%4) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_ARG:%.+]]: tuple, tensor, tensor>): + ^body(%barg0: tensor, %barg1: tensor, %barg2: tensor): + // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 0 : i32} + // CHECK: [[VAL8:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 1 : i32} + // CHECK: [[VAL9:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 2 : i32} + // CHECK: [[VAL10:%.+]] = mhlo.constant dense<1> + %5 = mhlo.constant dense<1> : tensor + // CHECK: [[VAL11:%.+]] = mhlo.add [[VAL9]], [[VAL10]] + %6 = mhlo.add %barg2, %5 : tensor + // CHECK: [[VAL12:%.+]] = mhlo.add [[VAL7]], [[VAL10]] + %7 = mhlo.add %barg0, %5 : tensor + // CHECK: [[VAL13:%.+]] = "mhlo.tuple"([[VAL12]], [[VAL8]], [[VAL11]]) + // CHECK: "mhlo.return"([[VAL13]]) + "tf.Yield"(%7, %barg1, %6) : (tensor, tensor, tensor) -> () + }) {is_stateless = true, parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor) + // CHECK: }) : (tuple, tensor, tensor>) -> tuple, tensor, tensor> + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 1 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 2 : i32} + // CHECK: return [[VAL6]] + return %2#2 : tensor +} + + +// CHECK-LABEL: func @whileRegionImplicitInputs +// CHECK-SAME: ([[ARG0:%.+]]: tensor) +func @whileRegionImplicitInputs(%arg0: tensor) -> tensor { + // CHECK: [[VAL0:%.+]] = mhlo.constant dense<0> + %0 = mhlo.constant dense<0> : tensor + // CHECK: [[VAL1:%.+]] = mhlo.constant dense<-1> + %1 = mhlo.constant dense<-1> : tensor + // CHECK: [[VAL2:%.+]] = "mhlo.tuple"([[ARG0]], [[VAL0]], [[VAL1]]) + // CHECK: [[VAL3:%.+]] = "mhlo.while"([[VAL2]]) ( { + %2 = "tf.WhileRegion"(%arg0) ( { + // CHECK: ^bb0([[COND_ARG:%.+]]: tuple, tensor, tensor>): + ^cond(%carg0: tensor): + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 0 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 1 : i32} + // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 2 : i32} + // CHECK: [[VAL8:%.+]] = "mhlo.compare"([[VAL5]], [[VAL6]]) {comparison_direction = "LT"} + %3 = "mhlo.compare"(%carg0, %0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + // CHECK: "mhlo.return"([[VAL8]]) + "tf.Yield"(%3) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_ARG:%.+]]: tuple, tensor, tensor>): + ^body(%barg0: tensor): + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 0 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 1 : i32} + // CHECK: [[VAL7:%.+]] = "mhlo.get_tuple_element"([[BODY_ARG]]) {index = 2 : i32} + // CHECK: [[VAL8:%.+]] = mhlo.add [[VAL5]], [[VAL7]] + %3 = mhlo.add %barg0, %1 : tensor + // CHECK: [[VAL9:%.+]] = mhlo.add [[VAL5]], [[VAL8]] + %4 = mhlo.add %barg0, %3 : tensor + // CHECK: [[VAL10:%.+]] = "mhlo.tuple"([[VAL9]], [[VAL6]], [[VAL7]]) + // CHECK: "mhlo.return"([[VAL10]]) + "tf.Yield"(%4) : (tensor) -> () + }) {is_stateless = true, parallel_iterations = 10 : i64} : (tensor) -> tensor + // CHECK: }) : (tuple, tensor, tensor>) -> tuple, tensor, tensor> + // CHECK: 
[[VAL4:%.+]] = "mhlo.get_tuple_element"([[VAL3]]) {index = 0 : i32} + // CHECK: return [[VAL4]] + return %2 : tensor +} + + +// CHECK-LABEL: func @whileRegionMultipleImplicitInputs +func @whileRegionMultipleImplicitInputs() { + // CHECK: [[VAL0:%.+]] = mhlo.constant dense<0> + %0 = mhlo.constant dense<0> : tensor + // CHECK: [[VAL1:%.+]] = mhlo.constant dense<-1> + %1 = mhlo.constant dense<-1> : tensor + // CHECK: [[VAL2:%.+]] = "mhlo.tuple"([[VAL0]], [[VAL1]]) + // CHECK: [[VAL3:%.+]] = "mhlo.while"([[VAL2]]) ( { + "tf.WhileRegion"() ( { + // CHECK: ^bb0([[COND_ARG:%.+]]: tuple, tensor>): + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 1 : i32} + // CHECK: [[VAL6:%.+]] = "mhlo.compare"([[VAL4]], [[VAL5]]) {comparison_direction = "LT"} + %2 = "mhlo.compare"(%0, %1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + // CHECK: "mhlo.return"([[VAL6]]) + "tf.Yield"(%2) : (tensor) -> () + }, { + // CHECK: ^bb0([[BODY_ARG:%.+]]: tuple, tensor>): + // CHECK: [[VAL4:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 0 : i32} + // CHECK: [[VAL5:%.+]] = "mhlo.get_tuple_element"([[COND_ARG]]) {index = 1 : i32} + // CHECK: [[VAL6:%.+]] = mhlo.add [[VAL4]], [[VAL5]] + %2 = mhlo.add %0, %1 : tensor + // CHECK: [[VAL7:%.+]] = "mhlo.tuple"([[VAL4]], [[VAL5]]) + // CHECK: "mhlo.return"([[VAL7]]) + "tf.Yield"() : () -> () + }) {is_stateless = true, parallel_iterations = 10 : i64} : () -> () + // CHECK: }) : (tuple, tensor>) -> tuple, tensor> + // CHECK: return + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index df4f0303a84..a21a78cf7f4 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -125,9 +125,9 @@ func @constant(%arg0: tensor<2xf32>) -> tensor<2xf32> { } // CHECK-LABEL: func @greater -func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> +func @greater(%arg0: tensor<2xi32>, %arg1: tensor<2xi32>) -> tensor<2xi1> { + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } @@ -220,13 +220,6 @@ func @sparse_to_dense(%arg0: tensor<3x2xi32>, %arg1: tensor<3xf32>, %arg2: tenso return %0 : tensor<3x3xf32> } -// CHECK-LABEL: fft -func @fft(%arg0: tensor<3x5x8xcomplex>) -> tensor<3x5x8xcomplex> { - // CHECK: "mhlo.fft"(%arg0) - %0 = "tf.FFT"(%arg0) : (tensor<3x5x8xcomplex>) -> tensor<3x5x8xcomplex> - return %0 : tensor<3x5x8xcomplex> -} - // CHECK-LABEL: reverse_sequence func @reverse_sequence(%arg0: tensor<4x2x3x1x1xi32>, %arg1: tensor<3xi32>) -> tensor<4x2x3x1x1xi32> { // CHECK-NOT: tf.ReverseSequence diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 56d4236c0a0..23137eff774 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -342,7 +342,7 @@ func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor< } // CHECK-LABEL: fusedBatchNormGradV3_Training -func @fusedBatchNormGradV3_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: 
tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { +func @fusedBatchNormGradV3_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<0xf32>, tensor<*xf32>) { // CHECK-NEXT: %[[grad:.*]] = "mhlo.convert"(%arg0) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[act:.*]] = "mhlo.convert"(%arg1) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> // CHECK-NEXT: %[[training:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> tuple, tensor<8xf32>, tensor<8xf32>> @@ -350,10 +350,11 @@ func @fusedBatchNormGradV3_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x // CHECK-NEXT: %[[scale_backprop:.*]] = "mhlo.get_tuple_element"(%[[training]]) {index = 1 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> // CHECK-NEXT: %[[offset_backprop:.*]] = "mhlo.get_tuple_element"(%[[training]]) {index = 2 : i32} : (tuple, tensor<8xf32>, tensor<8xf32>>) -> tensor<8xf32> // CHECK-NEXT: %[[x_backprop:.*]] = "mhlo.convert"(%[[tact]]) : (tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> - // CHECK-NEXT: return %[[x_backprop]] : tensor<8x8x8x8xf32> + // CHECK: return %[[x_backprop]] + // CHECK-SAME: tensor<8x8x8x8xf32> - %0:5 = "tf.FusedBatchNormGradV3"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) - return %0#0 : tensor<8x8x8x8xf32> + %0:5 = "tf.FusedBatchNormGradV3"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<0xf32>, tensor<*xf32>) + return %0#0, %0#3, %0#4 : tensor<8x8x8x8xf32>, tensor<0xf32>, tensor<*xf32> } // CHECK-LABEL: fusedBatchNormGradV3_noTraining_mixed_precision @@ -439,6 +440,17 @@ func @fusedBatchNormGradV3_Training_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // Bias op legalizations. 
//===----------------------------------------------------------------------===// +// CHECK-LABEL: func @biasAdd_default +func @biasAdd_default(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { + // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK: %[[ARG0_EXTENTS:.+]] = shape.to_extent_tensor %[[ARG0_SHAPE]] + // CHECK: %[[ARG1_BCAST:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, %[[ARG0_EXTENTS]]) + // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: %[[RESULT:.+]] = mhlo.add %arg0, %[[ARG1_BCAST]] + %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + return %0 : tensor<1x32x10x32xi32> +} + // CHECK-LABEL: func @biasAdd_NHWC func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { // CHECK: %[[ARG0_SHAPE:.+]] = shape.shape_of %arg0 @@ -472,6 +484,57 @@ func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tenso return %0 : tensor } + +//===----------------------------------------------------------------------===// +// ClipByValue +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @clip +func @clip(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { + // CHECK: [[VAL:%.+]] = "mhlo.clamp"(%arg1, %arg0, %arg2) + + %0 = "tf.ClipByValue"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + // CHECK: return [[VAL]] + return %0 : tensor +} + +// CHECK-LABEL: @clip_dynamic +func @clip_dynamic(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { + // CHECK-DAG: [[CLAMP:%.+]] = "mhlo.clamp"(%arg1, %arg0, %arg2) + %0 = "tf.ClipByValue"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + + // CHECK: return [[CLAMP]] + return %0 : tensor +} + +// CHECK-LABEL: @clip_static_broadcast +func @clip_static_broadcast(%arg0 : tensor<5xf32>, %arg1 : tensor, %arg2 : tensor) -> tensor<5xf32> { + // CHECK-DAG: [[SHP:%.+]] = mhlo.constant dense<5> + // CHECK-DAG: [[SHPIDX:%.+]] = tensor_cast [[SHP]] + // CHECK-DAG: [[BROADCAST_MIN:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[BROADCAST_MAX:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[CLAMP:%.+]] = "mhlo.clamp"([[BROADCAST_MIN]], %arg0, [[BROADCAST_MAX]]) + %0 = "tf.ClipByValue"(%arg0, %arg1, %arg2) : (tensor<5xf32>, tensor, tensor) -> tensor<5xf32> + + // CHECK: return [[CLAMP]] + return %0 : tensor<5xf32> +} + + +// CHECK-LABEL: @clip_dynamic_broadcast +func @clip_dynamic_broadcast(%arg0 : tensor, %arg1 : tensor, %arg2 : tensor) -> tensor { + // CHECK-DAG: [[SHP:%.+]] = shape.shape_of %arg0 + // CHECK-DAG: [[EXT:%.+]] = shape.to_extent_tensor [[SHP]] + // CHECK-DAG: [[SHPIDX:%.+]] = index_cast %1 + // CHECK-DAG: [[BROADCAST_MIN:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg1, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[BROADCAST_MAX:%.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg2, [[SHPIDX]]) {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[CLAMP:%.+]] = "mhlo.clamp"([[BROADCAST_MIN]], %arg0, [[BROADCAST_MAX]]) + %0 = "tf.ClipByValue"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + + // CHECK: return [[CLAMP]] + return %0 : tensor +} + //===----------------------------------------------------------------------===// // DiagPart 
//===----------------------------------------------------------------------===// @@ -1269,6 +1332,15 @@ func @maxpool_3d_same_padding(%arg0: tensor<2x8x13x25x7xf32>) -> tensor<2x8x4x7x return %0 : tensor<2x8x4x7x7xf32> } +// CHECK-LABEL: maxpool_explicit_padding +func @maxpool_explicit_padding(%arg0: tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> { + // CHECK: tf.MaxPool + // TODO(b/165938852): need to support explicit padding in max_pool. + + %0 = "tf.MaxPool"(%arg0) {data_format = "NHWC", ksize = [1, 2, 2, 1], padding = "EXPLICIT", strides = [1, 4, 4, 1]} : (tensor<2x12x20x7xi32>) -> tensor<2x3x5x7xi32> + return %0 : tensor<2x3x5x7xi32> +} + //===----------------------------------------------------------------------===// // MaxPoolGrad op legalizations. //===----------------------------------------------------------------------===// @@ -1755,6 +1827,20 @@ func @simple_logsoftmax(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { // Fast Fourier Transform op legalization. //===----------------------------------------------------------------------===// +// CHECK-LABEL: func @fft_1D +func @fft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { + // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = "FFT"} : (tensor<8xcomplex> + %0 = "tf.FFT"(%arg0) : (tensor<8xcomplex>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + +// CHECK-LABEL: func @ifft_1D +func @ifft_1D(%arg0: tensor<8xcomplex>) -> tensor<8xcomplex> { + // CHECK: "mhlo.fft"(%arg0) {fft_length = dense<8> : tensor<1xi64>, fft_type = "IFFT"} : (tensor<8xcomplex> + %0 = "tf.IFFT"(%arg0) : (tensor<8xcomplex>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + // CHECK-LABEL: func @rfft_1D func @rfft_1D(%arg0: tensor<8xf32>) -> tensor<8xcomplex> { %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) @@ -1763,6 +1849,48 @@ func @rfft_1D(%arg0: tensor<8xf32>) -> tensor<8xcomplex> { return %0 : tensor<8xcomplex> } +// CHECK-LABEL: func @rfft_1D_padded +func @rfft_1D_padded(%arg0: tensor<7xf32>) -> tensor<8xcomplex> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: %[[PADDED:.*]] = "mhlo.pad"(%arg0, %2) {edge_padding_high = dense<1> : tensor<1xi64>, edge_padding_low = dense<0> : tensor<1xi64>, interior_padding = dense<0> : tensor<1xi64>} : (tensor<7xf32>, tensor) -> tensor<8xf32> + // CHECK: "mhlo.fft"(%[[PADDED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = "RFFT"} : (tensor<8xf32> + %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<7xf32>, tensor<1xi32>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + +// CHECK-LABEL: func @rfft_1D_sliced +func @rfft_1D_sliced(%arg0: tensor<2x9xf32>) -> tensor<2x8xcomplex> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<[2, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<2x9xf32>) -> tensor<2x8xf32> + // CHECK: "mhlo.fft"(%[[SLICED]]) {fft_length = dense<8> : tensor<1xi64>, fft_type = "RFFT"} : (tensor<2x8xf32> + %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor<2x9xf32>, tensor<1xi32>) -> tensor<2x8xcomplex> + return %0 : tensor<2x8xcomplex> +} + +// CHECK-LABEL: func @irfft_1D +func @irfft_1D(%arg0: tensor<8xcomplex>) -> tensor<5xf32> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: %[[SLICED:.*]] = "mhlo.slice"(%arg0) {limit_indices = dense<5> : 
tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<8xcomplex>) -> tensor<5xcomplex> + // CHECK: "mhlo.fft"(%[[SLICED]]) {fft_length = dense<5> : tensor<1xi64>, fft_type = "IRFFT"} : (tensor<5xcomplex> + %0 = "tf.IRFFT"(%arg0, %fftlength) : (tensor<8xcomplex>, tensor<1xi32>) -> tensor<5xf32> + return %0 : tensor<5xf32> +} + +// CHECK-LABEL: fft_1D_dynamic +func @fft_1D_dynamic(%arg0: tensor>) -> tensor<8xcomplex> { + // CHECK: "tf.FFT" + %0 = "tf.FFT"(%arg0) : (tensor>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + +// CHECK-LABEL: rfft_1D_dynamic +func @rfft_1D_dynamic(%arg0: tensor) -> tensor<8xcomplex> { + %fftlength = "tf.Const"() {value = dense<[8]> : tensor<1xi32>} : () -> (tensor<1xi32>) + // CHECK: "tf.RFFT" + %0 = "tf.RFFT"(%arg0, %fftlength) : (tensor, tensor<1xi32>) -> tensor<8xcomplex> + return %0 : tensor<8xcomplex> +} + //===----------------------------------------------------------------------===// // Shape op legalization. //===----------------------------------------------------------------------===// @@ -1881,7 +2009,7 @@ func @abs_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: @acos // CHLO-LABEL: @acos func @acos(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: "chlo.acos"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + // CHECK: chlo.acos %arg0 : tensor<2xf32> // CHLO: %[[VAL_1:.*]] = "mhlo.compare"({{.*}}) {comparison_direction = "NE"} // CHLO: %[[VAL_5:.*]] = mhlo.multiply %arg0, %arg0 // CHLO: %[[VAL_4:.*]] = mhlo.constant dense<1.000000e+00> @@ -1902,24 +2030,41 @@ func @acos(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-LABEL: @acos_dynamic // CHLO-LABEL: @acos_dynamic func @acos_dynamic(%arg0: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: "chlo.acos"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> -// CHLO: %[[VAL_1:.*]] = "mhlo.compare"({{.*}}) {comparison_direction = "NE"} -// CHLO: %[[VAL_5:.*]] = mhlo.multiply %arg0, %arg0 -// CHLO: %[[VAL_4:.*]] = "chlo.constant_like"(%arg0) {value = 1.000000e+00 : f32} -// CHLO: %[[VAL_6:.*]] = mhlo.subtract %[[VAL_4]], %[[VAL_5]] -// CHLO: %[[VAL_7:.*]] = "mhlo.sqrt"(%[[VAL_6]]) -// CHLO: %[[VAL_8:.*]] = "chlo.constant_like"(%arg0) {value = 1.000000e+00 : f32} -// CHLO: %[[VAL_9:.*]] = mhlo.add %[[VAL_8]], %arg0 -// CHLO: %[[VAL_10:.*]] = mhlo.atan2 %[[VAL_7]], %[[VAL_9]] -// CHLO: %[[VAL_3:.*]] = "chlo.constant_like"(%arg0) {value = 2.000000e+00 : f32} -// CHLO: %[[VAL_11:.*]] = mhlo.multiply %[[VAL_3]], %[[VAL_10]] -// CHLO: %[[VAL_12:.*]] = "chlo.constant_like"(%arg0) {value = 3.14159274 : f32} -// CHLO: %[[VAL_13:.*]] = "mhlo.select"(%[[VAL_1]], %[[VAL_11]], %[[VAL_12]]) -// CHLO: return %[[VAL_13]] + // CHECK: chlo.acos %arg0 : tensor<*xf32> + // `tf.Acos` is lowered to `chlo.constant_like` operations which can only be + // lowered further on ranked tensors. Unranked CHLO must be transformed to + // ranked code before further lowering. 
+ // CHLO: "tf.Acos" %0 = "tf.Acos"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> return %0 : tensor<*xf32> } +// CHECK-LABEL: @tan +// CHECK-SAME: (%[[ARG:.*]]: tensor<2xf32>) -> tensor<2xf32> +// CHLO-LABEL: @tan +// CHLO-SAME: (%[[ARG:.*]]: tensor<2xf32>) -> tensor<2xf32> +func @tan(%arg : tensor<2xf32>) -> tensor<2xf32> { + // CHECK: chlo.tan %[[ARG]] : tensor<2xf32> + // CHLO: %[[SINE:.*]] = "mhlo.sine"(%[[ARG]]) + // CHLO %[[COSINE:.*]] = "mhlo.cosine"(%[[ARG]]) + // CHLO %[[RESULT:.*]] = "mhlo.divide"(%[[SINE]], %[[COSINE]]) + %result = "tf.Tan"(%arg) : (tensor<2xf32>) -> tensor<2xf32> + return %result : tensor<2xf32> +} + +// CHECK-LABEL: @tan_unranked +// CHECK-SAME: (%[[ARG:.*]]: tensor<*xf32>) -> tensor<*xf32> +// CHLO-LABEL: @tan_unranked +// CHLO-SAME: (%[[ARG:.*]]: tensor<*xf32>) -> tensor<*xf32> +func @tan_unranked(%arg : tensor<*xf32>) -> tensor<*xf32> { + // CHECK: chlo.tan %[[ARG]] : tensor<*xf32> + // CHLO: %[[SINE:.*]] = "mhlo.sine"(%[[ARG]]) + // CHLO %[[COSINE:.*]] = "mhlo.cosine"(%[[ARG]]) + // CHLO %[[RESULT:.*]] = "mhlo.divide"(%[[SINE]], %[[COSINE]]) + %result = "tf.Tan"(%arg) : (tensor<*xf32>) -> tensor<*xf32> + return %result : tensor<*xf32> +} + // CHECK-LABEL: func @cast_dynamic_i2f func @cast_dynamic_i2f(%arg0: tensor) -> tensor { // CHECK: "mhlo.convert"(%arg0) : (tensor) -> tensor @@ -2032,6 +2177,13 @@ func @floor_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } +// CHECK-LABEL: func @invert_op_unranked +func @invert_op_unranked(%arg0: tensor<*xi32>) -> tensor<*xi32> { + // CHECK: "mhlo.not"(%arg0) : (tensor<*xi32>) -> tensor<*xi32> + %0 = "tf.Invert"(%arg0) : (tensor<*xi32>) -> tensor<*xi32> + return %0 : tensor<*xi32> +} + // CHECK-LABEL: @is_finite func @is_finite(%arg0: tensor<2xf32>) -> tensor<2xi1> { // CHECK: "mhlo.is_finite"(%arg0) : (tensor<2xf32>) -> tensor<2xi1> @@ -2316,10 +2468,10 @@ func @reshape(%arg0: tensor<2xf32>, %arg1: tensor<2xi32>) -> tensor<2x1xf32> { } // CHECK-LABEL: reshape_dynamic -func @reshape_dynamic(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor<1x1xf32> { - // CHECK: "mhlo.reshape" - %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<2xi32>) -> tensor<1x1xf32> - return %0 : tensor<1x1xf32> +func @reshape_dynamic(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor { + // CHECK: "mhlo.dynamic_reshape" + %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<2xi32>) -> tensor + return %0 : tensor } // CHECK-LABEL: reshape_unranked @@ -2350,6 +2502,25 @@ func @expand_dims(%arg0: tensor<2xf32>, %axis: tensor) -> tensor<1x2xf32> { return %0 : tensor<1x2xf32> } +// CHECK-LABEL: expand_dims_dynamic +func @expand_dims_dynamic(%arg0: tensor) -> tensor { + %axis = "tf.Const"() {value = dense<1> : tensor} : () -> (tensor) + + // CHECK-DAG: [[SHAPEOF:%.+]] = shape.shape_of %arg0 + // CHECK-DAG: [[CST0:%.+]] = constant 0 + // CHECK-DAG: [[CST1:%.+]] = constant 1 + // CHECK-DAG: [[GETEXTENT0:%.+]] = shape.get_extent [[SHAPEOF]], [[CST0]] + // CHECK-DAG: [[CST1_0:%.+]] = constant 1 + // CHECK-DAG: [[GETEXTENT1:%.+]] = shape.get_extent [[SHAPEOF]], [[CST1_0]] + // CHECK-DAG: [[FROMEXTENTS:%.+]] = shape.from_extents [[GETEXTENT0]], [[CST1]], [[GETEXTENT1]] + // CHECK-DAG: [[TOEXTENTS:%.+]] = shape.to_extent_tensor [[FROMEXTENTS]] + // CHECK-DAG: [[RESHAPE:%.+]] = "mhlo.dynamic_reshape"(%arg0, [[TOEXTENTS]]) + %0 = "tf.ExpandDims"(%arg0, %axis) : (tensor, tensor) -> tensor + + // CHECK: return [[RESHAPE]] + return %0 : tensor +} + // CHECK-LABEL: func @sign // CHECK-SAME: [[ARG:%arg.*]]: tensor<1x2x3x4xf32> func @sign(%arg0: 
tensor<1x2x3x4xf32>) -> tensor<1x2x3x4xf32> { @@ -2942,6 +3113,15 @@ func @max(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { return %0 : tensor<4x1xf16> } +// CHECK-LABEL: func @max_qint +// Regression test to ensure we don't crash getting the initial value for +// tf.Max when using quantized integer types. +func @max_qint(%arg0: tensor<4x8x!tf.qint8>) -> tensor<4x1x!tf.qint8> { + %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64> + %0 = "tf.Max"(%arg0, %dimension) { keep_dims = true }: (tensor<4x8x!tf.qint8>, tensor<1xi64>) -> tensor<4x1x!tf.qint8> + return %0 : tensor<4x1x!tf.qint8> +} + // CHECK-LABEL: func @max_dynamic func @max_dynamic(%arg0: tensor<4x?xf16>) -> tensor<4x1xf16> { // CHECK: %[[CAST:.*]] = "mhlo.convert"(%arg0) : (tensor<4x?xf16>) -> tensor<4x?xf16> @@ -2976,6 +3156,15 @@ func @min(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { return %0 : tensor<4x1xf16> } +// CHECK-LABEL: func @min_qint +// Regression test to ensure we don't crash getting the initial value for +// tf.Min when using quantized integer types. +func @min_qint(%arg0: tensor<4x8x!tf.qint8>) -> tensor<4x1x!tf.qint8> { + %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64> + %0 = "tf.Min"(%arg0, %dimension) { keep_dims = true }: (tensor<4x8x!tf.qint8>, tensor<1xi64>) -> tensor<4x1x!tf.qint8> + return %0 : tensor<4x1x!tf.qint8> +} + // CHECK-LABEL: func @prod func @prod(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { // CHECK: %[[CAST:.*]] = "mhlo.convert"(%arg0) : (tensor<4x8xf16>) -> tensor<4x8xf32> @@ -2993,6 +3182,15 @@ func @prod(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { return %0 : tensor<4x1xf16> } +// CHECK-LABEL: func @prod_qint +// Regression test to ensure we don't crash getting the initial value for +// tf.Prod when using quantized integer types. 
+func @prod_qint(%arg0: tensor<4x8x!tf.qint8>) -> tensor<4x1x!tf.qint8> { + %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64> + %0 = "tf.Prod"(%arg0, %dimension) { keep_dims = true }: (tensor<4x8x!tf.qint8>, tensor<1xi64>) -> tensor<4x1x!tf.qint8> + return %0 : tensor<4x1x!tf.qint8> +} + // CHECK-LABEL: @all func @all(%input: tensor<4x8xi1>) -> tensor<4xi1> { %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> @@ -3685,15 +3883,13 @@ func @topk_v2(%input: tensor<16x16xf32>) -> (tensor<16x8xf32>, tensor<16x8xi32>) %k = "tf.Const"() {value = dense<8> : tensor} : () -> tensor // CHECK: %[[IOTA:.*]] = "mhlo.iota"() {iota_dimension = 1 : i64} - // CHECK-NEXT: %[[SORT:.*]] = "mhlo.sort"(%[[INPUT]], %[[IOTA]]) ( { + // CHECK-NEXT: %[[SORT:.*]]:2 = "mhlo.sort"(%[[INPUT]], %[[IOTA]]) ( { // CHECK-NEXT: ^{{.*}}(%[[LHS:.*]]: tensor, %[[RHS:.*]]: tensor, %{{.*}}: tensor, %{{.*}}: tensor): // CHECK-NEXT: %[[CMP:.*]] = "mhlo.compare"(%[[LHS]], %[[RHS]]) {comparison_direction = "GT"} // CHECK-NEXT: "mhlo.return"(%[[CMP]]) - // CHECK-NEXT: }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> - // CHECK-NEXT: %[[TUPL0:.*]] = "mhlo.get_tuple_element"(%[[SORT]]) {index = 0 : i32} - // CHECK-NEXT: %[[TUPL1:.*]] = "mhlo.get_tuple_element"(%[[SORT]]) {index = 1 : i32} - // CHECK-NEXT: %[[VAL:.*]] = "mhlo.slice"(%[[TUPL0]]) {limit_indices = dense<[16, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} - // CHECK-NEXT: %[[IDX:.*]] = "mhlo.slice"(%[[TUPL1]]) {limit_indices = dense<[16, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + // CHECK-NEXT: }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) + // CHECK-NEXT: %[[VAL:.*]] = "mhlo.slice"(%[[SORT]]#0) {limit_indices = dense<[16, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + // CHECK-NEXT: %[[IDX:.*]] = "mhlo.slice"(%[[SORT]]#1) {limit_indices = dense<[16, 8]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} // CHECK-NEXT: return %[[VAL]], %[[IDX]] %0:2 = "tf.TopKV2"(%input, %k): (tensor<16x16xf32>, tensor) -> (tensor<16x8xf32>, tensor<16x8xi32>) return %0#0, %0#1: tensor<16x8xf32>, tensor<16x8xi32> @@ -4060,12 +4256,11 @@ func @random_shuffle_1D_16(%input: tensor<16xf32>) -> tensor<16xf32> { // CHECK: [[LOWER:%.*]] = mhlo.constant dense<0> : tensor // CHECK: [[UPPER:%.*]] = mhlo.constant dense<-1> : tensor // CHECK: [[RNG:%.*]] = "mhlo.rng_uniform"([[LOWER]], [[UPPER]], [[SHAPE]]) - // CHECK: [[SORT:%.*]] = "mhlo.sort"([[RNG]], [[INPUT]]) ( { + // CHECK: [[SORT:%.*]]:2 = "mhlo.sort"([[RNG]], [[INPUT]]) ( { // CHECK: ^{{.*}}([[ARG1:%.*]]: tensor, [[ARG2:%.*]]: tensor, {{.*}}: tensor, {{.*}}: tensor): // CHECK: "mhlo.compare"([[ARG1]], [[ARG2]]) {comparison_direction = "LT"} - // CHECK: }) {dimension = -1 : i64, is_stable = true} : (tensor<16xi32>, tensor<16xf32>) -> tuple, tensor<16xf32>> - // CHECK: [[RES:%.*]] = "mhlo.get_tuple_element"([[SORT]]) {index = 1 : i32} - // CHECK: return [[RES]] + // CHECK: }) {dimension = -1 : i64, is_stable = true} : (tensor<16xi32>, tensor<16xf32>) -> (tensor<16xi32>, tensor<16xf32>) + // CHECK: return [[SORT]]#1 %0 = "tf.RandomShuffle"(%input) : (tensor<16xf32>) -> (tensor<16xf32>) return %0: tensor<16xf32> } @@ -4074,10 +4269,8 @@ func 
@random_shuffle_1D_16(%input: tensor<16xf32>) -> tensor<16xf32> { func @random_shuffle_1D_10240(%input: tensor<10240xf32>) -> tensor<10240xf32> { // CHECK: mhlo.rng_uniform // CHECK: mhlo.sort - // CHECK: mhlo.get_tuple_element // CHECK: mhlo.rng_uniform // CHECK: mhlo.sort - // CHECK: mhlo.get_tuple_element %0 = "tf.RandomShuffle"(%input) : (tensor<10240xf32>) -> (tensor<10240xf32>) return %0: tensor<10240xf32> } @@ -4859,3 +5052,20 @@ func @xla_gather_i32(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> %0 = "tf.XlaGather"(%arg0, %arg1, %cst) {dimension_numbers = "\0A\01\01\12\01\00\1A\01\00 \01", indices_are_sorted = true} : (tensor<200x100x300xf32>, tensor<10x2xi32>, tensor<3xi32>) -> tensor<10x1x300xf32> return %0 : tensor<10x1x300xf32> } + + +// CHECK: func @stridedslice_with_i32 +func @stridedslice_with_i32(%arg0: tensor) -> tensor<4xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "const_0_arg", outputs = "identity_0_retval_RetVal"}} { +// CHECK-NOT: tf.StridedSlice +// CHECK: [[DYNSLICE:%.*]] = "mhlo.dynamic-slice +// CHECK: [[RESHAPE:%.*]] = "mhlo.reshape"([[DYNSLICE]]) +// CHECK: return [[RESHAPE]] + %0 = "tf.Const"() {value = dense<[[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00]]> : tensor<2x4xf32>} : () -> tensor<2x4xf32> + %1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %3 = "tf.AddV2"(%arg0, %1) {_xla_inferred_shapes = [#tf.shape<>], device = ""} : (tensor, tensor) -> tensor + %4 = "tf.Pack"(%3) {_xla_inferred_shapes = [#tf.shape<1>], axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %5 = "tf.Pack"(%arg0) {_xla_inferred_shapes = [#tf.shape<1>], axis = 0 : i64, device = ""} : (tensor) -> tensor<1xi32> + %6 = "tf.StridedSlice"(%0, %5, %4, %2) {_xla_inferred_shapes = [#tf.shape<4>], begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2x4xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xf32> + return %6 : tensor<4xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/BUILD b/tensorflow/compiler/mlir/xla/tests/translate/BUILD index c4e747c90f3..7dc66edd9e1 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/translate/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") package(licenses = ["notice"]) diff --git a/tensorflow/compiler/mlir/xla/tests/translate/case.mlir b/tensorflow/compiler/mlir/xla/tests/translate/case.mlir index 1032bb723c5..cea0599adb0 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/case.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/case.mlir @@ -1,10 +1,10 @@ // RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FILECHECK_OPTS="" FileCheck %s func @main() -> tensor { - %cst = constant {name = "constant"} dense<1> : tensor - %cst_0 = constant {name = "constant.1"} dense<5.600000e+01> : tensor - %cst_1 = constant {name = "constant.2"} dense<1.200000e+01> : tensor - %cst_2 = constant {name = "constant.3"} dense<1.300000e+01> : tensor + %cst = constant dense<1> : tensor + %cst_0 = constant dense<5.600000e+01> : tensor + %cst_1 = constant dense<1.200000e+01> : tensor + %cst_2 = constant dense<1.300000e+01> : tensor %0 = "mhlo.case"(%cst, %cst_0, %cst_1, %cst_2) ( { ^bb0(%arg0: 
tensor): %1 = "mhlo.negate"(%arg0) : (tensor) -> tensor @@ -17,7 +17,7 @@ func @main() -> tensor { ^bb0(%arg0: tensor): %1 = "mhlo.floor"(%arg0) : (tensor) -> tensor "mhlo.return"(%1) : (tensor) -> () - }) {name = "conditional"} : (tensor, tensor, tensor, tensor) -> tensor + }) : (tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -48,23 +48,23 @@ func @main() -> tensor { // ----- func @main() -> (tensor, tensor) { - %cst = constant {name = "constant"} dense<1> : tensor - %cst_0 = constant {name = "constant.1"} dense<5.600000e+01> : tensor - %cst_1 = constant {name = "constant.2"} dense<1.200000e+01> : tensor - %cst_2 = constant {name = "constant.3"} dense<1.300000e+01> : tensor + %cst = constant dense<1> : tensor + %cst_0 = constant dense<5.600000e+01> : tensor + %cst_1 = constant dense<1.200000e+01> : tensor + %cst_2 = constant dense<1.300000e+01> : tensor %0:2 = "mhlo.case"(%cst, %cst_0, %cst_1, %cst_2) ( { ^bb0(%arg0: tensor): - %1 = "mhlo.negate"(%arg0) {name = "negate"} : (tensor) -> tensor + %1 = "mhlo.negate"(%arg0) : (tensor) -> tensor "mhlo.return"(%1, %1) : (tensor, tensor) -> () }, { ^bb0(%arg0: tensor): - %1 = "mhlo.copy"(%arg0) {name = "copy"} : (tensor) -> tensor + %1 = "mhlo.copy"(%arg0) : (tensor) -> tensor "mhlo.return"(%1, %1) : (tensor, tensor) -> () }, { ^bb0(%arg0: tensor): - %1 = "mhlo.floor"(%arg0) {name = "floor"} : (tensor) -> tensor + %1 = "mhlo.floor"(%arg0) : (tensor) -> tensor "mhlo.return"(%1, %1) : (tensor, tensor) -> () - }) {name = "conditional"} : (tensor, tensor, tensor, tensor) -> (tensor, tensor) + }) : (tensor, tensor, tensor, tensor) -> (tensor, tensor) return %0#0, %0#1 : tensor, tensor } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt index 62f0d7a59e4..1fa7367763e 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/case_conditional.hlotxt @@ -26,21 +26,21 @@ ENTRY %indexed_conditional () -> f32[] { } // CHECK-LABEL: func @main() -> tensor -// CHECK: %[[INDEX:.*]] = constant {name = "constant"} dense<1> : tensor -// CHECK: %[[OPERAND_1:.*]] = constant {name = "{{.*}}"} dense<5.600000e+01> : tensor -// CHECK: %[[OPERAND_2:.*]] = constant {name = "{{.*}}"} dense<1.200000e+01> : tensor -// CHECK: %[[OPERAND_3:.*]] = constant {name = "{{.*}}"} dense<1.300000e+01> : tensor +// CHECK: %[[INDEX:.*]] = constant dense<1> : tensor +// CHECK: %[[OPERAND_1:.*]] = constant dense<5.600000e+01> : tensor +// CHECK: %[[OPERAND_2:.*]] = constant dense<1.200000e+01> : tensor +// CHECK: %[[OPERAND_3:.*]] = constant dense<1.300000e+01> : tensor // CHECK: %[[RESULT:.*]] = "mhlo.case"(%[[INDEX]], %[[OPERAND_1]], %[[OPERAND_2]], %[[OPERAND_3]]) ( { // CHECK: ^bb0(%[[ARG_1:.*]]: tensor): -// CHECK: %[[RES_1:.*]] = "mhlo.negate"(%[[ARG_1]]) {name = "{{.*}}"} : (tensor) -> tensor +// CHECK: %[[RES_1:.*]] = "mhlo.negate"(%[[ARG_1]]) : (tensor) -> tensor // CHECK: "mhlo.return"(%[[RES_1]]) : (tensor) -> () // CHECK: }, { // CHECK: ^bb0(%[[ARG_2:.*]]: tensor): -// CHECK: %[[RES_2:.*]] = "mhlo.copy"(%[[ARG_2]]) {name = "{{.*}}"} : (tensor) -> tensor +// CHECK: %[[RES_2:.*]] = "mhlo.copy"(%[[ARG_2]]) : (tensor) -> tensor // CHECK: "mhlo.return"(%[[RES_2]]) : (tensor) -> () // CHECK: }, { // CHECK: ^bb0(%[[ARG_3:.*]]: tensor): -// CHECK: %[[RES_3:.*]] = "mhlo.floor"(%[[ARG_3]]) {name = "{{.*}}"} : (tensor) -> tensor +// CHECK: %[[RES_3:.*]] = "mhlo.floor"(%[[ARG_3]]) : (tensor) -> 
tensor // CHECK: "mhlo.return"(%[[RES_3]]) : (tensor) -> () -// CHECK: }) {name = "{{.*}}"} : (tensor, tensor, tensor, tensor) -> tensor +// CHECK: }) : (tensor, tensor, tensor, tensor) -> tensor // CHECK: return %[[RESULT]] : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 316eda4c4aa..c078191d170 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -362,7 +362,9 @@ func @main(%arg0: tensor<2x3xf32>, %arg1: tensor<5x5xf32>) -> tensor<1x2x3xf32> // CHECK: [[VAL_1:%.*]] = f32[2,3] parameter(0) // CHECK: [[VAL_2:%.*]] = f32[5,5] parameter(1) // CHECK: ROOT -// CHECK-SAME: f32[1,2,3] custom-call(f32[2,3] [[VAL_1]], f32[5,5] [[VAL_2]]), custom_call_target="foo", backend_config="bar" +// CHECK-SAME: f32[1,2,3] custom-call(f32[2,3] [[VAL_1]], f32[5,5] [[VAL_2]]) +// CHECK-SAME: custom_call_target="foo" +// CHECK-SAME: backend_config="bar" // ----- @@ -437,7 +439,7 @@ func @main(%arg0: tensor<200x100x300xf32>, %arg1: tensor<10x2xi32>) -> tensor<10 // CHECK-SAME: index_vector_dim=1 // CHECK-SAME: slice_sizes={1,1,300} // CHECK-SAME: indices_are_sorted=true - %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = {collapsed_slice_dims = dense<[0, 1]> : tensor<2xi64>, index_vector_dim = 1 : i64, offset_dims = dense<1> : tensor<1xi64>, start_index_map = dense<[0, 1]> : tensor<2xi64>}, indices_are_sorted = true, name = "gather", slice_sizes = dense<[1, 1, 300]> : tensor<3xi64>} : (tensor<200x100x300xf32>, tensor<10x2xi32>) -> tensor<10x300xf32> + %0 = "mhlo.gather"(%arg0, %arg1) {dimension_numbers = {collapsed_slice_dims = dense<[0, 1]> : tensor<2xi64>, index_vector_dim = 1 : i64, offset_dims = dense<1> : tensor<1xi64>, start_index_map = dense<[0, 1]> : tensor<2xi64>}, indices_are_sorted = true, slice_sizes = dense<[1, 1, 300]> : tensor<3xi64>} : (tensor<200x100x300xf32>, tensor<10x2xi32>) -> tensor<10x300xf32> return %0 : tensor<10x300xf32> } @@ -500,7 +502,7 @@ func @main() -> tensor<1x10xf32> { func @main(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { %0 = "mhlo.map"(%arg0, %arg1) ( { ^bb0(%arg2: tensor, %arg3: tensor): // no predecessors - %1 = mhlo.add %arg2, %arg3 {name = "add"} : tensor + %1 = mhlo.add %arg2, %arg3 : tensor "mhlo.return"(%1) : (tensor) -> () }) {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %0 : tensor<4xf32> @@ -737,7 +739,7 @@ func @main(%arg0: tensor, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> // CHECK: %[[ARG2:.*]] = s32[2,3] parameter(2) // CHECK: ROOT %[[RES:.*]] = s32[2,3] select(pred[2,3] %[[COND]], s32[2,3] %[[ARG1]], s32[2,3] %[[ARG2]]) - %0 = "mhlo.select"(%arg0, %arg1, %arg2) {name = "select.4"} : (tensor, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %0 = "mhlo.select"(%arg0, %arg1, %arg2) : (tensor, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> return %0 : tensor<2x3xi32> } @@ -946,19 +948,20 @@ func @main(%arg0: tensor<4xi1>, %arg1: tensor<4xi1>) -> tensor<4xi1> { // CHECK: HloModule func @main(%input0: tensor<16x16xf32>, %input1: tensor<16x16xi32>) { - %0 = "mhlo.sort"(%input0, %input1) ( { + %0:2 = "mhlo.sort"(%input0, %input1) ( { ^bb0(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor): %7 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor "mhlo.return"(%7) : (tensor) -> () - }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, 
tensor<16x16xi32>) -> tuple, tensor<16x16xi32>> + }) {dimension = 1 : i64, is_stable = true} : (tensor<16x16xf32>, tensor<16x16xi32>) -> (tensor<16x16xf32>, tensor<16x16xi32>) return } // CHECK: %[[SORT_CMP:.*]] ([[ARG0:.*]]: f32[], [[ARG1:.*]]: f32[], {{.*}}: s32[], {{.*}}: s32[]) -> pred[] { // CHECK: ROOT %compare.8 = pred[] compare(f32[] %[[ARG0]], f32[] %[[ARG1]]), direction=GT -// CHECK: ENTRY %{{.*}} ([[MAIN_ARG0:.*]]: f32[16,16], [[MAIN_ARG1:.*]]: s32[16,16]) -> (f32[16,16], s32[16,16]) { -// CHECK: ROOT %{{.*}} = (f32[16,16], s32[16,16]) sort(f32[16,16] %[[MAIN_ARG0]], s32[16,16] %[[MAIN_ARG1]]), dimensions={1}, is_stable=true, to_apply=%[[SORT_CMP]] +// CHECK: [[SORT:%.+]] = (f32[16,16], s32[16,16]) sort(f32[16,16] %Arg_0.1, s32[16,16] %Arg_1.2), dimensions={1}, is_stable=true, to_apply=%[[SORT_CMP]] +// CHECK: [[GET0:%.+]] = f32[16,16] get-tuple-element((f32[16,16], s32[16,16]) [[SORT]]), index=0 +// CHECK: ROOT [[GET1:%.+]] = s32[16,16] get-tuple-element((f32[16,16], s32[16,16]) [[SORT]]), index=1 // ----- @@ -1099,3 +1102,33 @@ func @main(%arg: tensor<3xui64>) -> tuple, tensor<2x2xui32>> { %0 = "mhlo.rng_bit_generator"(%arg) {rng_algorithm = 2 : i32} : (tensor<3xui64>) -> tuple, tensor<2x2xui32>> return %0 : tuple, tensor<2x2xui32>> } + +// ----- + +// CHECK: HloModule +func @main(%arg: tensor<3x4xf32>) -> tensor<3x4xf32> { +// CHECK: %[[ARG0:.*]] = f32[3,4] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[3,4] cbrt(f32[3,4] %[[ARG0]]) + %0 = "mhlo.cbrt"(%arg) : (tensor<3x4xf32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> +} + +// ----- + +// CHECK: HloModule +func @main(%arg: tensor<3x4xf32>) -> tensor<3x4xf32> { +// CHECK: %[[ARG0:.*]] = f32[3,4] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[3,4] reduce-precision(f32[3,4] %[[ARG0]]), exponent_bits=8, mantissa_bits=10 + %0 = "mhlo.reduce_precision"(%arg) {exponent_bits = 8 : i32, mantissa_bits = 10 : i32} : (tensor<3x4xf32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> +} + +// ----- + +// CHECK: HloModule +func @main(%arg: tensor<3x4xf32>) -> tensor<3x4x1xf32> { +// CHECK: %[[ARG0:.*]] = f32[3,4] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[3,4,1] bitcast(f32[3,4] %[[ARG0]]) + %0 = "mhlo.bitcast"(%arg) : (tensor<3x4xf32>) -> tensor<3x4x1xf32> + return %0 : tensor<3x4x1xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt index 86adcf0710f..4cc70be0965 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/fully_connected_reference_model.hlotxt @@ -9,95 +9,95 @@ ENTRY %tfcompile.48 { %arg0.1 = f32[1,300] parameter(0) %arg1.2 = f32[1,300,3,1] parameter(1) - // CHECK-NEXT: %0 = "mhlo.reshape"(%arg0) {name = "reshape.3"} : (tensor<1x300xf32>) -> tensor<1x300xf32> + // CHECK-NEXT: %0 = "mhlo.reshape"(%arg0) : (tensor<1x300xf32>) -> tensor<1x300xf32> %reshape.3 = f32[1,300] reshape(%arg0.1) - // CHECK-NEXT: %1 = "mhlo.transpose"(%0) {name = "transpose.27", permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x300xf32>) -> tensor<300x1xf32> + // CHECK-NEXT: %1 = "mhlo.transpose"(%0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<1x300xf32>) -> tensor<300x1xf32> %transpose.27 = f32[300,1] transpose(%reshape.3), dimensions={1,0} - // CHECK-NEXT: %2 = "mhlo.reshape"(%1) {name = "reshape.28"} : (tensor<300x1xf32>) -> tensor<300x1x1xf32> + // CHECK-NEXT: %2 = "mhlo.reshape"(%1) 
: (tensor<300x1xf32>) -> tensor<300x1x1xf32> %reshape.28 = f32[300,1,1] reshape(%transpose.27) - // CHECK-NEXT: %3 = "mhlo.reshape"(%2) {name = "reshape.29"} : (tensor<300x1x1xf32>) -> tensor<300x1xf32> + // CHECK-NEXT: %3 = "mhlo.reshape"(%2) : (tensor<300x1x1xf32>) -> tensor<300x1xf32> %reshape.29 = f32[300,1] reshape(%reshape.28) - // CHECK-NEXT: %4 = "mhlo.broadcast_in_dim"(%3) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>, name = "broadcast.30"} : (tensor<300x1xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %4 = "mhlo.broadcast_in_dim"(%3) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<300x1xf32>) -> tensor<300x1x5xf32> %broadcast.30 = f32[300,1,5] broadcast(%reshape.29), dimensions={0,1} - // CHECK-NEXT: %cst = constant {name = "constant.8"} dense<1.000000e+00> : tensor + // CHECK-NEXT: %cst = constant dense<1.000000e+00> : tensor %constant.8 = f32[] constant(1) - // CHECK-NEXT: %5 = "mhlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<> : tensor<0xi64>, name = "broadcast.9"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %5 = "mhlo.broadcast_in_dim"(%cst) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<300x1x5xf32> %broadcast.9 = f32[300,1,5] broadcast(%constant.8), dimensions={} - // CHECK-NEXT: %6 = mhlo.multiply %4, %5 {name = "multiply.31"} : tensor<300x1x5xf32> + // CHECK-NEXT: %6 = mhlo.multiply %4, %5 : tensor<300x1x5xf32> %multiply.31 = f32[300,1,5] multiply(%broadcast.30, %broadcast.9) - // CHECK-NEXT: %cst_0 = constant {name = "constant.32"} dense<0.000000e+00> : tensor + // CHECK-NEXT: %cst_0 = constant dense<0.000000e+00> : tensor %constant.32 = f32[] constant(0) - // CHECK-NEXT: %7 = "mhlo.broadcast_in_dim"(%cst_0) {broadcast_dimensions = dense<> : tensor<0xi64>, name = "broadcast.33"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %7 = "mhlo.broadcast_in_dim"(%cst_0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<300x1x5xf32> %broadcast.33 = f32[300,1,5] broadcast(%constant.32), dimensions={} - // CHECK-NEXT: %8 = "mhlo.compare"(%6, %7) {comparison_direction = "GT", name = "compare.34"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xi1> + // CHECK-NEXT: %8 = "mhlo.compare"(%6, %7) {comparison_direction = "GT"} : (tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xi1> %compare.34 = pred[300,1,5] compare(%multiply.31, %broadcast.33), direction=GT - // CHECK-NEXT: %cst_1 = constant {name = "constant.10"} dense<0.000000e+00> : tensor + // CHECK-NEXT: %cst_1 = constant dense<0.000000e+00> : tensor %constant.10 = f32[] constant(0) - // CHECK-NEXT: %9 = "mhlo.broadcast_in_dim"(%cst_1) {broadcast_dimensions = dense<> : tensor<0xi64>, name = "broadcast.11"} : (tensor) -> tensor<300x1x5xf32> + // CHECK-NEXT: %9 = "mhlo.broadcast_in_dim"(%cst_1) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<300x1x5xf32> %broadcast.11 = f32[300,1,5] broadcast(%constant.10), dimensions={} - // CHECK-NEXT: %cst_2 = constant {name = "constant.40"} dense<0.000000e+00> : tensor + // CHECK-NEXT: %cst_2 = constant dense<0.000000e+00> : tensor %constant.40 = f32[] constant(0) - // CHECK-NEXT: %10 = "mhlo.broadcast_in_dim"(%cst_2) {broadcast_dimensions = dense<> : tensor<0xi64>, name = "broadcast.41"} : (tensor) -> tensor<300x5xf32> + // CHECK-NEXT: %10 = "mhlo.broadcast_in_dim"(%cst_2) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor) -> tensor<300x5xf32> %broadcast.41 = f32[300,5] broadcast(%constant.40), dimensions={} - // CHECK-NEXT: %11 = "mhlo.copy"(%arg1) 
{name = "copy.1"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> + // CHECK-NEXT: %11 = "mhlo.copy"(%arg1) : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> %copy.1 = f32[1,300,3,1] copy(%arg1.2) - // CHECK-NEXT: %12 = "mhlo.reshape"(%11) {name = "reshape.4"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> + // CHECK-NEXT: %12 = "mhlo.reshape"(%11) : (tensor<1x300x3x1xf32>) -> tensor<1x300x3x1xf32> %reshape.4 = f32[1,300,3,1] reshape(%copy.1) - // CHECK-NEXT: %13 = "mhlo.reshape"(%12) {name = "reshape.24"} : (tensor<1x300x3x1xf32>) -> tensor<1x300x3xf32> + // CHECK-NEXT: %13 = "mhlo.reshape"(%12) : (tensor<1x300x3x1xf32>) -> tensor<1x300x3xf32> %reshape.24 = f32[1,300,3] reshape(%reshape.4) - // CHECK-NEXT: %14 = "mhlo.transpose"(%13) {name = "transpose.25", permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x300x3xf32>) -> tensor<300x1x3xf32> + // CHECK-NEXT: %14 = "mhlo.transpose"(%13) {permutation = dense<[1, 0, 2]> : tensor<3xi64>} : (tensor<1x300x3xf32>) -> tensor<300x1x3xf32> %transpose.25 = f32[300,1,3] transpose(%reshape.24), dimensions={1,0,2} - // CHECK-NEXT: %15 = "mhlo.reshape"(%14) {name = "reshape.26"} : (tensor<300x1x3xf32>) -> tensor<300x3xf32> + // CHECK-NEXT: %15 = "mhlo.reshape"(%14) : (tensor<300x1x3xf32>) -> tensor<300x3xf32> %reshape.26 = f32[300,3] reshape(%transpose.25) - // CHECK-NEXT: %cst_3 = constant {name = "constant.35"} dense<{{\[\[}}-1.060230e-01, 1.215050e-01, 8.002390e-01, -7.688850e-01, 0.0966112986], [6.890140e-01, -4.070560e-01, -0.797852993, 3.789250e-03, -2.088810e-01], [-6.085290e-01, 2.766170e-02, 2.685570e-01, 5.774010e-01, -4.284370e-01]]> : tensor<3x5xf32> + // CHECK-NEXT: %cst_3 = constant dense<{{\[\[}}-1.060230e-01, 1.215050e-01, 8.002390e-01, -7.688850e-01, 0.0966112986], [6.890140e-01, -4.070560e-01, -0.797852993, 3.789250e-03, -2.088810e-01], [-6.085290e-01, 2.766170e-02, 2.685570e-01, 5.774010e-01, -4.284370e-01]]> : tensor<3x5xf32> %constant.35 = f32[3,5] constant({ { -0.106023, 0.121505, 0.800239, -0.768885, 0.0966113 }, { 0.689014, -0.407056, -0.797853, 0.00378925, -0.208881 }, { -0.608529, 0.0276617, 0.268557, 0.577401, -0.428437 } }) // TODO(b/129709049) consider making this default precision config implied. 
- // CHECK-NEXT: %16 = "mhlo.dot"(%15, %cst_3) {name = "dot.36", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<300x3xf32>, tensor<3x5xf32>) -> tensor<300x5xf32> + // CHECK-NEXT: %16 = "mhlo.dot"(%15, %cst_3) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<300x3xf32>, tensor<3x5xf32>) -> tensor<300x5xf32> %dot.36 = f32[300,5] dot(%reshape.26, %constant.35), lhs_contracting_dims={1}, rhs_contracting_dims={0} - // CHECK-NEXT: %cst_4 = constant {name = "constant.37"} dense<0.000000e+00> : tensor<5xf32> + // CHECK-NEXT: %cst_4 = constant dense<0.000000e+00> : tensor<5xf32> %constant.37 = f32[5]{0} constant({0, 0, 0, 0, 0}) - // CHECK-NEXT: %17 = "mhlo.broadcast_in_dim"(%cst_4) {broadcast_dimensions = dense<1> : tensor<1xi64>, name = "broadcast.38"} : (tensor<5xf32>) -> tensor<300x5xf32> + // CHECK-NEXT: %17 = "mhlo.broadcast_in_dim"(%cst_4) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<5xf32>) -> tensor<300x5xf32> %broadcast.38 = f32[300,5] broadcast(%constant.37), dimensions={1} - // CHECK-NEXT: %18 = mhlo.add %16, %17 {name = "add.39"} : tensor<300x5xf32> + // CHECK-NEXT: %18 = mhlo.add %16, %17 : tensor<300x5xf32> %add.39 = f32[300,5] add(%dot.36, %broadcast.38) - // CHECK-NEXT: %19 = mhlo.maximum %10, %18 {name = "maximum.42"} : tensor<300x5xf32> + // CHECK-NEXT: %19 = mhlo.maximum %10, %18 : tensor<300x5xf32> %maximum.42 = f32[300,5] maximum(%broadcast.41, %add.39) - // CHECK-NEXT: %20 = "mhlo.reshape"(%19) {name = "reshape.44"} : (tensor<300x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %20 = "mhlo.reshape"(%19) : (tensor<300x5xf32>) -> tensor<300x1x5xf32> %reshape.44 = f32[300,1,5] reshape(%maximum.42) - // CHECK-NEXT: %21 = "mhlo.select"(%8, %9, %20) {name = "select.45"} : (tensor<300x1x5xi1>, tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %21 = "mhlo.select"(%8, %9, %20) : (tensor<300x1x5xi1>, tensor<300x1x5xf32>, tensor<300x1x5xf32>) -> tensor<300x1x5xf32> %select.45 = f32[300,1,5] select(%compare.34, %broadcast.11, %reshape.44) - // CHECK-NEXT: %22 = "mhlo.reshape"(%21) {name = "reshape.46"} : (tensor<300x1x5xf32>) -> tensor<300x1x5xf32> + // CHECK-NEXT: %22 = "mhlo.reshape"(%21) : (tensor<300x1x5xf32>) -> tensor<300x1x5xf32> %reshape.46 = f32[300,1,5] reshape(%select.45) - // CHECK-NEXT: %23 = "mhlo.tuple"(%22) {name = "tuple.47"} : (tensor<300x1x5xf32>) -> tuple> + // CHECK-NEXT: %23 = "mhlo.tuple"(%22) : (tensor<300x1x5xf32>) -> tuple> // CHECK-NEXT: return %23 : tuple> ROOT %tuple.47 = (f32[300,1,5]) tuple(%reshape.46) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 4d4e0213da8..cce49b16c6c 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -13,12 +13,12 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { %Arg_0.1 = f32[4]{0} parameter(0) %Arg_1.2 = f32[4]{0} parameter(1) - // CHECK-NEXT: mhlo.add %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> - %add.3 = f32[4]{0} add(f32[4]{0} %Arg_0.1, f32[4]{0} %Arg_1.2) + // CHECK-NEXT: mhlo.add %arg0, %arg1 : tensor<4xf32> + %add.42 = f32[4]{0} add(f32[4]{0} %Arg_0.1, f32[4]{0} %Arg_1.2) // TODO(b/129709049) consider making this default precision config inferred. 
- // CHECK-NEXT: "mhlo.dot"(%0, %arg1) {name = "{{.*}}", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor - ROOT %dot.4 = f32[] dot(f32[4]{0} %add.3, f32[4]{0} %Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={0} + // CHECK-NEXT: "mhlo.dot"(%0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor + ROOT %dot.4 = f32[] dot(f32[4]{0} %add.42, f32[4]{0} %Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={0} } // CHECK-LABEL: func @test_after_all @@ -26,7 +26,7 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] { %test_after_all (token0: token[], token1: token[] ) -> token[] { token0 = token[] parameter(0) token1 = token[] parameter(1) - // CHECK-NEXT: "mhlo.after_all"([[VAL_0]], [[VAL_1]]) {name = "{{.*}}"} : (!mhlo.token, !mhlo.token) -> !mhlo.token + // CHECK-NEXT: "mhlo.after_all"([[VAL_0]], [[VAL_1]]) : (!mhlo.token, !mhlo.token) -> !mhlo.token ROOT after-all = token[] after-all(token0, token1) } @@ -75,10 +75,10 @@ add { %test_broadcast_in_dim { %Arg_0.1 = f32[1, 2] parameter(0) - // CHECK-NEXT: "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>, name = "{{.*}}"} : (tensor<1x2xf32>) -> tensor<1x2x3xf32> + // CHECK-NEXT: "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x2xf32>) -> tensor<1x2x3xf32> %broadcast.2 = f32[1,2,3] broadcast(%Arg_0.1), dimensions={0,1} - // CHECK-NEXT: "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>, name = "{{.*}}"} : (tensor<1x2xf32>) -> tensor<3x1x2xf32> + // CHECK-NEXT: "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xf32>) -> tensor<3x1x2xf32> ROOT broadcast.4 = f32[3,1,2] broadcast(%Arg_0.1), dimensions={1, 2} } @@ -113,7 +113,7 @@ add { // CHECK-SAME: ([[ARG:%.*]]: tensor<1x291x291xf32>) -> tensor<1x291x291xf32> %test_cholesky (a: f32[1,291,291]) -> f32[1,291,291] { %a = f32[1,291,291] parameter(0) - // CHECK-NEXT: "mhlo.cholesky"([[ARG]]) {lower = true, name = {{.*}}} : (tensor<1x291x291xf32>) -> tensor<1x291x291xf32> + // CHECK-NEXT: "mhlo.cholesky"([[ARG]]) {lower = true} : (tensor<1x291x291xf32>) -> tensor<1x291x291xf32> ROOT %out = f32[1,291,291] cholesky(f32[1,291,291] %a), lower=true } @@ -124,16 +124,16 @@ add { %Arg_1.2 = f32[4] parameter(1) %Arg_2.3 = f32[] parameter(2) - // CHECK-NEXT: "mhlo.clamp"(%arg0, %arg1, %arg2) {name = "{{.*}}"} : (tensor, tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK-NEXT: "mhlo.clamp"(%arg0, %arg1, %arg2) : (tensor, tensor<4xf32>, tensor) -> tensor<4xf32> ROOT %clamp.3 = f32[4] clamp(f32[] %Arg_0.1, f32[4] %Arg_1.2, f32[] %Arg_2.3) } // CHECK-LABEL: func @test_collective_permute // CHECK-SAME: ([[ARG:%.*]]: tensor<128x32xf32>) -> tensor<128x32xf32> %test_collective_permute (input: f32[128,32]) -> f32[128,32] { - %input = f32[128,32]{0,1} parameter(0) - // CHECK-NEXT: "mhlo.collective_permute"([[ARG]]) {name = {{.*}}, source_target_pairs = dense<{{\[\[}}0, 1], [1, 2], [2, 3]]> : tensor<3x2xi64>} : (tensor<128x32xf32>) -> tensor<128x32xf32> - ROOT root = f32[128,32]{0,1} collective-permute(%input), source_target_pairs={{0,1},{1,2},{2,3}} + %input = f32[128,32]{1,0} parameter(0) + // CHECK-NEXT: "mhlo.collective_permute"([[ARG]]) {source_target_pairs = dense<{{\[\[}}0, 1], [1, 2], [2, 3]]> : tensor<3x2xi64>} : (tensor<128x32xf32>) -> tensor<128x32xf32> + ROOT root = f32[128,32]{1,0} collective-permute(%input), source_target_pairs={{0,1},{1,2},{2,3}} } @@ -143,14 
+143,14 @@ add { %Arg_1.2 = f32[3] parameter(1) %Arg_2.3 = f32[3] parameter(2) - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "EQ", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> %compare.4 = pred[3] compare(Arg_0.1, Arg_1.2), direction=EQ - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LE", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg1) {comparison_direction = "LE"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> %compare.5 = pred[3] compare(Arg_0.1, Arg_1.2), direction=LE // Requires broadcast of compatible tensors. - // CHECK-NEXT: "mhlo.compare"(%arg0, %arg2) {comparison_direction = "GT", name = "{{.*}}"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> + // CHECK-NEXT: "mhlo.compare"(%arg0, %arg2) {comparison_direction = "GT"} : (tensor<3xf32>, tensor<3xf32>) -> tensor<3xi1> ROOT %compare.6 = pred[3] compare(Arg_0.1, Arg_2.3), direction=GT } @@ -159,7 +159,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: "mhlo.complex"(%arg0, %arg1) {name = "{{.*}}"} : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xcomplex> + // CHECK-NEXT: "mhlo.complex"(%arg0, %arg1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xcomplex> ROOT %complex.3 = c64[4] complex(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -176,12 +176,12 @@ add { %test_constant { // Scalar/0D tensor constant - // CHECK-NEXT: %cst = constant {name = "{{.*}}"} dense<1> : tensor + // CHECK-NEXT: %cst = constant dense<1> : tensor %constant.0 = s64[] constant(1) // Note that double brackets "[[" have to be escaped as they denote variables // in FileCheck. The only way to do so is to drop into regex with "{{" - // CHECK-NEXT: constant {name = "{{.*}}"} dense<{{\[\[\[\[}}1.000000e+00]], {{\[\[}}2.000000e+00]]], {{\[\[\[}}3.000000e+00]], {{\[\[}}4.000000e+00]]]]> : tensor<2x2x1x1xf32> + // CHECK-NEXT: constant dense<{{\[\[\[\[}}1.000000e+00]], {{\[\[}}2.000000e+00]]], {{\[\[\[}}3.000000e+00]], {{\[\[}}4.000000e+00]]]]> : tensor<2x2x1x1xf32> %constant.1 = f32[2,2,1,1]{3,2,1,0} constant({{{{1.0}},{{2.0}}},{{{3.0}},{{4.0}}}}), metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} // CHECK: dense<[1, 2, 4, 8]> : tensor<4xui64> @@ -206,15 +206,15 @@ add { %test_conv { %arg0.1 = f32[256,32,32,6]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} - // CHECK-NEXT: %0 = "mhlo.copy"(%arg0) {name = "{{.*}}"} : (tensor<256x32x32x6xf32>) -> tensor<256x32x32x6xf32> + // CHECK-NEXT: %0 = "mhlo.copy"(%arg0) {minor_to_major = dense<[2, 1, 3, 0]> : tensor<4xindex>} : (tensor<256x32x32x6xf32>) -> tensor<256x32x32x6xf32> %copy.1 = f32[256,32,32,6]{2,1,3,0} copy(%arg0.1), metadata={op_name="HLO_Args"} - // CHECK-NEXT: %1 = "mhlo.reshape"(%0) {name = "{{.*}}"} : (tensor<256x32x32x6xf32>) -> tensor<256x32x32x6xf32> + // CHECK-NEXT: %1 = "mhlo.reshape"(%0) {minor_to_major = dense<[2, 1, 3, 0]> : tensor<4xindex>} : (tensor<256x32x32x6xf32>) -> tensor<256x32x32x6xf32> %reshape.2 = f32[256,32,32,6]{2,1,3,0} reshape(%copy.1) // Note that double brackets "[[" have to be escaped as they denote variables // in FileCheck. 
The only way to do so is to drop into regex with "{{" - // CHECK-NEXT: %cst = constant {name = "{{.*}}"} dense<{{\[\[\[\[}}5.000000e-01]], {{\[\[}}-6.000000e-01]]], {{\[\[\[}}3.000000e-01]], {{\[\[}}-1.000000e-01]]]]> : tensor<2x2x1x1xf32> + // CHECK-NEXT: %cst = constant dense<{{\[\[\[\[}}5.000000e-01]], {{\[\[}}-6.000000e-01]]], {{\[\[\[}}3.000000e-01]], {{\[\[}}-1.000000e-01]]]]> : tensor<2x2x1x1xf32> %constant.3 = f32[2,2,1,1]{3,2,1,0} constant({{{{0.5}}, {{-0.6}}}, {{{0.3}}, {{-0.1}}}}), metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} // CHECK-NEXT: %2 = "mhlo.convolution"(%1, %cst) { @@ -241,10 +241,10 @@ add { %convolution.4 = f32[16,30,30,256]{2,1,3,0} convolution(%reshape.2, %constant.3), window={size=3x3 stride=4x5 pad=44_45x60_60 rhs_dilate=2x3}, dim_labels=b01f_01io->f01b, metadata={op_type="Conv2D" op_name="embedded_inference/conv_model/conv_0/Conv2D"} - // CHECK-NEXT: %3 = "mhlo.reshape"(%2) {name = "{{.*}}"} : (tensor<16x30x30x256xf32>) -> tensor<256x30x30x16xf32> + // CHECK-NEXT: %3 = "mhlo.reshape"(%2) : (tensor<16x30x30x256xf32>) -> tensor<256x30x30x16xf32> %reshape.5 = f32[256,30,30,16]{3,2,1,0} reshape(%convolution.4), metadata={op_name="HLO_Retvals"} - // CHECK-NEXT: "mhlo.tuple"(%3) {name = "{{.*}}"} : (tensor<256x30x30x16xf32>) -> tuple> + // CHECK-NEXT: "mhlo.tuple"(%3) : (tensor<256x30x30x16xf32>) -> tuple> ROOT %tuple.6 = (f32[256,30,30,16]{3,2,1,0}) tuple(%reshape.5), metadata={op_name="HLO_Retvals"} } @@ -263,10 +263,10 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = "mhlo.convert"(%arg0) {name = "{{.*}}"} : (tensor<4xf32>) -> tensor<4xf64> + // CHECK-NEXT: %0 = "mhlo.convert"(%arg0) : (tensor<4xf32>) -> tensor<4xf64> %convert.3 = f64[4] convert(f32[4] %Arg_0.1) - // CHECK-NEXT: %1 = "mhlo.convert"(%arg1) {name = "{{.*}}"} : (tensor<4xf32>) -> tensor<4xf64> + // CHECK-NEXT: %1 = "mhlo.convert"(%arg1) : (tensor<4xf32>) -> tensor<4xf64> %convert.4 = f64[4] convert(f32[4] %Arg_1.2) // CHECK-NEXT: mhlo.add %0, %1 @@ -277,7 +277,7 @@ add { %test_cosine (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} - // CHECK-NEXT: "mhlo.cosine"(%arg0) {name = "{{.*}}"} : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> + // CHECK-NEXT: "mhlo.cosine"(%arg0) : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> ROOT %cosine.3 = f32[1,16,16,3]{3,2,1,0} cosine(f32[1,16,16,3]{3,2,1,0} %arg0.1) } @@ -286,7 +286,7 @@ add { %test_custom_call (arg1: f32[2,3], arg2: f32[5,5]) -> f32[1,2,3] { %arg1 = f32[2,3] parameter(0) %arg2 = f32[5,5] parameter(1) -// CHECK: "mhlo.custom_call"([[ARG_0]], [[ARG_1]]) {backend_config = "bar", call_target_name = "foo", has_side_effect = true, name = {{.*}}} : (tensor<2x3xf32>, tensor<5x5xf32>) -> tensor<1x2x3xf32> +// CHECK: "mhlo.custom_call"([[ARG_0]], [[ARG_1]]) {backend_config = "bar", call_target_name = "foo", has_side_effect = true, minor_to_major = {{.*}}} : (tensor<2x3xf32>, tensor<5x5xf32>) -> tensor<1x2x3xf32> ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(f32[2,3] %arg1, f32[5,5] %arg2), custom_call_target="foo", backend_config="bar", custom_call_has_side_effect=true } @@ -295,7 +295,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: mhlo.divide %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> + // CHECK-NEXT: mhlo.divide %arg0, %arg1 : tensor<4xf32> ROOT %divide.3 = f32[4] divide(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -304,17 +304,17 @@ add { %Arg_0.1 = f32[1, 
4] parameter(0) %Arg_1.2 = f32[4, 1] parameter(1) - // CHECK-NEXT: %0 = "mhlo.dot"(%arg0, %arg1) {name = "{{.*}}", precision_config = ["HIGH", "HIGHEST"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["HIGH", "HIGHEST"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor dot.3 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}, operand_precision={high,highest} - // CHECK-NEXT: %1 = "mhlo.dot"(%arg0, %arg1) {name = "{{.*}}", precision_config = ["HIGHEST", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %1 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["HIGHEST", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor dot.4 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}, operand_precision={highest,default} - // CHECK-NEXT: %2 = "mhlo.dot"(%arg0, %arg1) {name = "{{.*}}", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: %2 = "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor %dot.5 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0}, operand_precision={default,default} // TODO(b/129709049) consider making this default precision config inferred. - // CHECK-NEXT: "mhlo.dot"(%arg0, %arg1) {name = "{{.*}}", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor + // CHECK-NEXT: "mhlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x4xf32>, tensor<4x1xf32>) -> tensor ROOT %dot.6 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={1}, rhs_contracting_dims={0} } @@ -325,17 +325,17 @@ add { %Arg_0.1 = f32[4, 1] parameter(0) %Arg_1.2 = f32[1, 4] parameter(1) - // CHECK-NEXT: [[R0:%.+]] = "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "{{.*}}", precision_config = ["HIGH", "HIGHEST"]} + // CHECK-NEXT: [[R0:%.+]] = "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGH", "HIGHEST"]} dot.3 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1}, operand_precision={high,highest} - // CHECK-NEXT: [[R1:%.+]] = "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "{{.*}}", precision_config = ["HIGHEST", "DEFAULT"]} + // CHECK-NEXT: [[R1:%.+]] = "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["HIGHEST", "DEFAULT"]} dot.4 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1}, operand_precision={highest,default} - // CHECK-NEXT: [[R2:%.+]] 
= "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "{{.*}}", precision_config = ["DEFAULT", "DEFAULT"]} + // CHECK-NEXT: [[R2:%.+]] = "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["DEFAULT", "DEFAULT"]} %dot.5 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1}, operand_precision={default,default} // TODO(b/129709049) consider making this default precision config inferred. - // CHECK-NEXT: "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, name = "{{.*}}", precision_config = ["DEFAULT", "DEFAULT"]} + // CHECK-NEXT: "mhlo.dot_general"([[ARG0]], [[ARG1]]) {dot_dimension_numbers = {lhs_batching_dimensions = dense<> : tensor<0xi64>, lhs_contracting_dimensions = dense<0> : tensor<1xi64>, rhs_batching_dimensions = dense<> : tensor<0xi64>, rhs_contracting_dimensions = dense<1> : tensor<1xi64>}, precision_config = ["DEFAULT", "DEFAULT"]} ROOT %dot.6 = f32[] dot(Arg_0.1, Arg_1.2), lhs_contracting_dims={0}, rhs_contracting_dims={1} } @@ -376,7 +376,7 @@ add { %test_exponential (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) - // CHECK-NEXT: "mhlo.exponential"(%arg0) {name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: "mhlo.exponential"(%arg0) : (tensor<16xf32>) -> tensor<16xf32> ROOT %exp.2 = f32[16] exponential(f32[16] %arg0.1) } @@ -384,7 +384,7 @@ add { %test_expm1 (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) - // CHECK: "mhlo.exponential_minus_one"(%arg0) {name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK: "mhlo.exponential_minus_one"(%arg0) : (tensor<16xf32>) -> tensor<16xf32> ROOT %expm1.2 = f32[16] exponential-minus-one(f32[16] %arg0.1) } @@ -400,7 +400,7 @@ add { %test_floor (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) - // CHECK-NEXT: "mhlo.floor"([[A0]]) {name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: "mhlo.floor"([[A0]]) : (tensor<16xf32>) -> tensor<16xf32> ROOT %floor.2 = f32[16] floor(f32[16] %arg0.1) } @@ -430,7 +430,7 @@ add { // CHECK-SAME: ([[ARG:%.*]]: tensor<4x2xf32>) %test_get_dimension_size (Arg_0.1: f32[4,2]) -> s32[] { %Arg_0.1 = f32[4,2] parameter(0) - // CHECK-NEXT: "mhlo.get_dimension_size"([[ARG]]) {dimension = 1 : i32, name = "{{.*}}"} : (tensor<4x2xf32>) -> tensor + // CHECK-NEXT: "mhlo.get_dimension_size"([[ARG]]) {dimension = 1 : i32} : (tensor<4x2xf32>) -> tensor ROOT %get-dimension-size.2 = s32[] get-dimension-size(f32[4,2] %Arg_0.1), dimensions={1} } @@ -438,7 +438,7 @@ add { %test_imag (Arg_0.1: c64[4]) -> f32[4] { %Arg_0.1 = c64[4] parameter(0) - // CHECK-NEXT: "mhlo.imag"(%arg0) {name = "{{.*}}"} : (tensor<4xcomplex>) -> tensor<4xf32> + // CHECK-NEXT: "mhlo.imag"(%arg0) : (tensor<4xcomplex>) -> tensor<4xf32> ROOT %imag.3 = f32[4] imag(c64[4] %Arg_0.1) } @@ -468,7 +468,7 @@ add { %test_log (arg0.1: f32[16]) -> f32[16] { %arg0.1 
= f32[16] parameter(0) - // CHECK-NEXT: "mhlo.log"(%arg0) {name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: "mhlo.log"(%arg0) : (tensor<16xf32>) -> tensor<16xf32> ROOT %log.2 = f32[16] log(f32[16] %arg0.1) } @@ -476,7 +476,7 @@ add { %test_log1p (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) - // CHECK: "mhlo.log_plus_one"(%arg0) {name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK: "mhlo.log_plus_one"(%arg0) : (tensor<16xf32>) -> tensor<16xf32> ROOT %log1p.2 = f32[16] log-plus-one(f32[16] %arg0.1) } @@ -507,7 +507,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: mhlo.maximum %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> + // CHECK-NEXT: mhlo.maximum %arg0, %arg1 : tensor<4xf32> ROOT %maximum.3 = f32[4] maximum(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -516,7 +516,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: mhlo.minimum %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> + // CHECK-NEXT: mhlo.minimum %arg0, %arg1 : tensor<4xf32> ROOT %minimum.3 = f32[4] minimum(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -525,7 +525,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: %0 = mhlo.multiply %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> + // CHECK-NEXT: %0 = mhlo.multiply %arg0, %arg1 : tensor<4xf32> ROOT %multiply.3 = f32[4] multiply(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -533,7 +533,7 @@ add { %test_negate (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) - // CHECK-NEXT: "mhlo.negate"(%arg0) {name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK-NEXT: "mhlo.negate"(%arg0) : (tensor<16xf32>) -> tensor<16xf32> ROOT %negate.2 = f32[16] negate(f32[16] %arg0.1) } @@ -541,7 +541,7 @@ add { %test_not (arg0.1: pred[16]) -> pred[16] { %arg0.1 = pred[16] parameter(0) - // CHECK: "mhlo.not"(%arg0) {name = "{{.*}}"} : (tensor<16xi1>) -> tensor<16xi1> + // CHECK: "mhlo.not"(%arg0) : (tensor<16xi1>) -> tensor<16xi1> ROOT %not.2 = pred[16] not(pred[16] %arg0.1) } @@ -595,7 +595,7 @@ add { %test_popcnt (arg0.1: s32[16]) -> s32[16] { %arg0.1 = s32[16] parameter(0) - // CHECK: "mhlo.popcnt"(%arg0) {name = "{{.*}}"} : (tensor<16xi32>) -> tensor<16xi32> + // CHECK: "mhlo.popcnt"(%arg0) : (tensor<16xi32>) -> tensor<16xi32> ROOT %popcnt.2 = s32[16] popcnt(s32[16] %arg0.1) } @@ -604,7 +604,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: mhlo.power %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> + // CHECK-NEXT: mhlo.power %arg0, %arg1 : tensor<4xf32> ROOT %power.3 = f32[4] power(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -632,7 +632,7 @@ add { %test_real (Arg_0.1: c64[4]) -> f32[4] { %Arg_0.1 = c64[4] parameter(0) - // CHECK-NEXT: "mhlo.real"(%arg0) {name = "{{.*}}"} : (tensor<4xcomplex>) -> tensor<4xf32> + // CHECK-NEXT: "mhlo.real"(%arg0) : (tensor<4xcomplex>) -> tensor<4xf32> ROOT %real.3 = f32[4] real(c64[4] %Arg_0.1) } @@ -687,7 +687,7 @@ add { // CHECK: {dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>, tensor) -> tensor %reduce.4 = f32[] reduce(%reduce.2, %Arg_2.3), dimensions={0}, to_apply=%reduce_helper.3 - // CHECK: %4 = mhlo.subtract [[VAL2]], [[VAL4]] {name = "{{.*}}"} : tensor + // CHECK: %4 = mhlo.subtract [[VAL2]], [[VAL4]] : tensor %sub.5 = f32[] subtract(%reduce.3, %reduce.4) ROOT %tuple.6 = ((f32[], f32[]), f32[]) tuple(%reduce.1, %sub.5) @@ -741,7 +741,7 @@ add { %test_rsqrt (arg0.1: f32[16]) -> f32[16] { %arg0.1 = f32[16] parameter(0) - // CHECK: "mhlo.rsqrt"([[ARG0]]) 
{name = "{{.*}}"} : (tensor<16xf32>) -> tensor<16xf32> + // CHECK: "mhlo.rsqrt"([[ARG0]]) : (tensor<16xf32>) -> tensor<16xf32> ROOT %rsqrt.2 = f32[16] rsqrt(f32[16] %arg0.1) } @@ -788,7 +788,7 @@ add { %Arg_1.2 = s32[2,3] parameter(1) %Arg_2.3 = s32[2,3] parameter(2) - // CHECK: "mhlo.select"(%arg0, %arg1, %arg2) {name = "{{.*}}"} : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + // CHECK: "mhlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> ROOT %select.4 = s32[2,3] select(pred[2,3] %Arg_0.1, s32[2,3] %Arg_1.2, s32[2,3] %Arg_2.3) } @@ -835,7 +835,7 @@ add { %test_set_dimension_size (Arg_0.1: f32[4,4], Arg_1.2: s32[]) -> f32[4,<=4] { %Arg_0.1 = f32[4,4] parameter(0) %Arg_1.2 = s32[] parameter(1) - // CHECK-NEXT: "mhlo.set_dimension_size"([[ARG]], [[SIZE]]) {dimension = 1 : i32, name = "{{.*}}"} : (tensor<4x4xf32>, tensor) -> tensor<4x4xf32> + // CHECK-NEXT: "mhlo.set_dimension_size"([[ARG]], [[SIZE]]) {dimension = 1 : i32} : (tensor<4x4xf32>, tensor) -> tensor<4x4xf32> ROOT %set-dimension-size.2 = f32[4,<=4] set-dimension-size(f32[4,4] %Arg_0.1, s32[] %Arg_1.2), dimensions={1} } @@ -843,7 +843,7 @@ add { %test_sine (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} - // CHECK-NEXT: "mhlo.sine"(%arg0) {name = "{{.*}}"} : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> + // CHECK-NEXT: "mhlo.sine"(%arg0) : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> ROOT %sine.3 = f32[1,16,16,3]{3,2,1,0} sine(f32[1,16,16,3]{3,2,1,0} %arg0.1) } @@ -862,7 +862,7 @@ add { // CHECK-SAME: [[ARG:%.*]]: tensor<1024xf32>) -> tensor<1024xf32> // CHECK: "mhlo.sort"([[ARG]]) ( { // CHECK: ^bb0([[ARG0:%.*]]: tensor, [[ARG1:%.*]]: tensor): -// CHECK: [[CMP:%.*]] = "mhlo.compare"([[ARG0]], [[ARG1]]) {comparison_direction = "LT", name = "lt"} : (tensor, tensor) -> tensor +// CHECK: [[CMP:%.*]] = "mhlo.compare"([[ARG0]], [[ARG1]]) {comparison_direction = "LT"} : (tensor, tensor) -> tensor // CHECK: "mhlo.return"([[CMP]]) : (tensor) -> () // CHECK: }) {dimension = 0 : i64, is_stable = true} : (tensor<1024xf32>) -> tensor<1024xf32> @@ -871,7 +871,7 @@ add { %Arg_0.1 = f32[4] parameter(0) %Arg_1.2 = f32[4] parameter(1) - // CHECK-NEXT: mhlo.subtract %arg0, %arg1 {name = "{{.*}}"} : tensor<4xf32> + // CHECK-NEXT: mhlo.subtract %arg0, %arg1 : tensor<4xf32> ROOT %subtract.3 = f32[4] subtract(f32[4] %Arg_0.1, f32[4] %Arg_1.2) } @@ -879,7 +879,7 @@ add { %test_tanh (arg0.1: f32[1,16,16,3]) -> f32[1,16,16,3] { %arg0.1 = f32[1,16,16,3]{3,2,1,0} parameter(0), metadata={op_name="HLO_Args"} - // CHECK-NEXT: "mhlo.tanh"(%arg0) {name = "{{.*}}"} : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> + // CHECK-NEXT: "mhlo.tanh"(%arg0) : (tensor<1x16x16x3xf32>) -> tensor<1x16x16x3xf32> ROOT %tanh.3 = f32[1,16,16,3]{3,2,1,0} tanh(f32[1,16,16,3]{3,2,1,0} %arg0.1), metadata={op_type="Tanh" op_name="embedded_inference/tanh_model/Tanh"} } @@ -887,7 +887,7 @@ add { %test_transpose { %Arg_0.1 = s32[1,2,3,4] parameter(0) - // CHECK: "mhlo.transpose"(%arg0) {name = "{{.*}}", permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> + // CHECK: "mhlo.transpose"(%arg0) {permutation = dense<[1, 0, 3, 2]> : tensor<4xi64>} : (tensor<1x2x3x4xi32>) -> tensor<2x1x4x3xi32> ROOT %transpose.2 = s32[2,1,4,3] transpose(s32[1,2,3,4] %Arg_0.1), dimensions={1,0,3,2} } @@ -909,10 +909,10 @@ add { %Arg_0.1 = s32[1] parameter(0) %Arg_1.2 = f32[1, 2] parameter(1) - // CHECK-NEXT: 
%0 = "mhlo.tuple"(%arg0) {name = "{{.*}}"} : (tensor<1xi32>) -> tuple> + // CHECK-NEXT: %0 = "mhlo.tuple"(%arg0) : (tensor<1xi32>) -> tuple> %tuple.3 = (s32[1]) tuple(%Arg_0.1) - // CHECK: "mhlo.tuple"(%arg0, %arg1) {name = "{{.*}}"} : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> + // CHECK: "mhlo.tuple"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xf32>) -> tuple, tensor<1x2xf32>> ROOT %tuple.4 = (s32[1], f32[1,2]) tuple(%Arg_0.1, %Arg_1.2) } @@ -934,11 +934,11 @@ add { %arg0.1 = s64[] parameter(0), metadata={op_name="HLO_Args"} // CHECK-NEXT: "mhlo.while"(%arg0) ( { // CHECK-NEXT: ^bb0(%arg1: tensor): // no predecessors - // CHECK-NEXT: [[CMP:%.*]] = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT", name = "{{.*}}"} : (tensor, tensor) -> tensor + // CHECK-NEXT: [[CMP:%.*]] = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor // CHECK-NEXT: "mhlo.return"([[CMP]]) : (tensor) -> () // CHECK-NEXT: }, { // CHECK-NEXT: ^bb0(%arg1: tensor): // no predecessors - // CHECK-NEXT: [[ADD:%.*]] = mhlo.add %arg1, %arg1 {name = "{{.*}}"} : tensor + // CHECK-NEXT: [[ADD:%.*]] = mhlo.add %arg1, %arg1 : tensor // CHECK-NEXT: "mhlo.return"([[ADD]]) : (tensor) -> () // CHECK-NEXT: }) : (tensor) -> tensor ROOT %while.2 = s64[] while(%arg0.1), body=%loop, condition=%cond @@ -992,8 +992,8 @@ add { %Arg_1.2 = c128[2] parameter(1) %abs.4 = f64[2] abs(c128[2] %Arg_1.2) - // CHECK: "mhlo.abs"(%[[ARG0]]) {name = "{{.*}}"} : (tensor<2xcomplex>) -> tensor<2xf32> - // CHECK: "mhlo.abs"(%[[ARG1]]) {name = "{{.*}}"} : (tensor<2xcomplex>) -> tensor<2xf64> + // CHECK: "mhlo.abs"(%[[ARG0]]) : (tensor<2xcomplex>) -> tensor<2xf32> + // CHECK: "mhlo.abs"(%[[ARG1]]) : (tensor<2xcomplex>) -> tensor<2xf64> ROOT %tuple.5 = (f32[2], f64[2]) tuple(f32[2] %abs.3, f64[2] %abs.4) } @@ -1002,7 +1002,7 @@ add { %unsigned_int(Arg_0.1: u16[4]) -> u16[4] { %Arg_0.1 = u16[4] parameter(0) - // CHECK: "mhlo.not"(%[[ARG0]]) {name = "{{.*}}"} : (tensor<4xui16>) -> tensor<4xui16> + // CHECK: "mhlo.not"(%[[ARG0]]) : (tensor<4xui16>) -> tensor<4xui16> ROOT %not.2 = u16[4] not(u16[4] %Arg_0.1) } @@ -1014,3 +1014,26 @@ add { ROOT %rng-bit-generator.2 = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_philox } +// CHECK-LABEL: func @cbrt +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xf32>) +%cbrt (Arg_0.1: f32[3,4]) -> f32[3,4] { + %Arg_0.1 = f32[3,4] parameter(0) + // CHECK: "mhlo.cbrt"(%[[ARG0]]) : (tensor<3x4xf32>) -> tensor<3x4xf32> + ROOT %cbrt = f32[3,4] cbrt(f32[3,4] %Arg_0.1) +} + +// CHECK-LABEL: func @bitcast +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xf32>) -> tensor<3x4x1xf32> +%bitcast (Arg_0.1: f32[3,4]) -> f32[3,4,1] { + %Arg_0.1 = f32[3,4] parameter(0) + // CHECK: "mhlo.bitcast"(%[[ARG0]]) : (tensor<3x4xf32>) -> tensor<3x4x1xf32> + ROOT %bitcast = f32[3,4,1] bitcast(f32[3,4] %Arg_0.1) +} + +// CHECK-LABEL: func @reduce_precision +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xf32>) +%reduce_precision (Arg_0.1: f32[3,4]) -> f32[3,4] { + %Arg_0.1 = f32[3,4] parameter(0) + // CHECK: "mhlo.reduce_precision"(%[[ARG0]]) {exponent_bits = 8 : i32, mantissa_bits = 10 : i32} : (tensor<3x4xf32>) -> tensor<3x4xf32> + ROOT %reduce_precision = f32[3,4] reduce-precision(f32[3,4] %Arg_0.1), exponent_bits=8, mantissa_bits=10 +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.hlotxt new file mode 100644 index 00000000000..da07dc0a76b --- /dev/null +++ 
b/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.hlotxt @@ -0,0 +1,11 @@ +// RUN: tf-mlir-translate -mlir-print-debuginfo -hlo-text-to-mlir-hlo %s -o - | FileCheck %s + +HloModule Test + +// CHECK-LABEL: func @main +ENTRY A { + %input = f16[128,224,224,4] parameter(0) + %filter = f16[64,7,7,4] parameter(1) + // %0 = "mhlo.convolution"{{.*}}minor_to_major = dense<[1, 3, 2, 0]> : tensor<4xindex>{{.*}} loc("root.42") + ROOT %root.42 = f16[128,64,112,112]{1,3,2,0} convolution(%input, %filter), dim_labels=b01f_o01i->bf01, window={size=7x7 stride=2x2 pad=3_3x3_3} +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir b/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir new file mode 100644 index 00000000000..2ef0aaf3f50 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir @@ -0,0 +1,34 @@ +// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text-with-layouts %s | FileCheck %s + +// Checks exporting layouts + +// CHECK: HloModule +func @main(%arg0: tensor<128x224x224x4xf16>, %arg1: tensor<64x7x7x4xf16>) -> tensor<128x64x112x112xf16> { + // CHECK: %convolution.{{.*}} = f16[128,64,112,112]{1,3,2,0} convolution{{.*}}op_name="root.42" + %0 = "mhlo.convolution"(%arg0, %arg1) { + batch_group_count = 1 : i64, + dimension_numbers = { + input_batch_dimension = 0 : i64, + input_feature_dimension = 3 : i64, + input_spatial_dimensions = dense<[ 1, 2 ]> : tensor<2xi64>, + kernel_input_feature_dimension = 3 : i64, + kernel_output_feature_dimension = 0 : i64, + kernel_spatial_dimensions = dense<[ 1, 2 ]> : tensor<2xi64>, + output_batch_dimension = 0 : i64, + output_feature_dimension = 1 : i64, + output_spatial_dimensions = dense<[ 2, 3 ]> : tensor<2xi64> + }, + feature_group_count = 1 : i64, + lhs_dilations = dense<1> : tensor<2xi64>, + minor_to_major = dense<[ 1, 3, 2, 0 ]> : tensor<4xindex>, + padding = dense<3> : tensor<2x2xi64>, + precision_config = [ "DEFAULT", "DEFAULT" ], + rhs_dilations = dense<1> : tensor<2xi64>, + window_strides = dense<2> : tensor<2xi64> + } : (tensor<128x224x224x4xf16>, tensor<64x7x7x4xf16>)-> tensor<128x64x112x112xf16> loc("root.42") + + // CHECK: s32[1,1]{0,1} constant({ {42} }) + %cst_1 = "std.constant"() {value = dense<[[42]]> : tensor<1x1xi32>, minor_to_major = dense<[0, 1]> : tensor<2xindex>} : () -> tensor<1x1xi32> + + return %0 : tensor<128x64x112x112xf16> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/location_to_op_metadata.mlir b/tensorflow/compiler/mlir/xla/tests/translate/location_to_op_metadata.mlir new file mode 100644 index 00000000000..2182ce6106d --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/translate/location_to_op_metadata.mlir @@ -0,0 +1,43 @@ +// RUN: tf-mlir-translate -split-input-file -mlir-hlo-to-hlo-text %s | FileCheck %s --dump-input=always + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc(unknown) + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-NOT: metadata + +// ----- + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc("AfterAll") + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-SAME: metadata={op_name="AfterAll"} + +// ----- + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc("name@function") + return %0 : !mhlo.token +} + +// CHECK: after-all +// 
CHECK-SAME: metadata={op_name="name"} + +// ----- + +// CHECK-LABEL: %main +func @main(%arg0: !mhlo.token) -> !mhlo.token { + %0 = "mhlo.after_all"(%arg0) : (!mhlo.token) -> !mhlo.token loc("file_name":2:8) + return %0 : !mhlo.token +} + +// CHECK: after-all +// CHECK-SAME: metadata={source_file="file_name" source_line=2} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo b/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo index d97c5150335..4c288aee956 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo +++ b/tensorflow/compiler/mlir/xla/tests/translate/simple.hlo @@ -139,8 +139,8 @@ dynamic_parameter_binding { } # CHECK-LABEL: func @main(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor { -# CHECK-NEXT: %0 = mhlo.add %arg0, %arg1 {name = "add.3"} : tensor<4xf32> +# CHECK-NEXT: %0 = mhlo.add %arg0, %arg1 : tensor<4xf32> # TODO(b/129709049) consider making this default precision config inferred. -# CHECK-NEXT: %1 = "mhlo.dot"(%0, %arg1) {name = "dot.4", precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor +# CHECK-NEXT: %1 = "mhlo.dot"(%0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<4xf32>, tensor<4xf32>) -> tensor # CHECK-NEXT: return %1 : tensor # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt index 855b1c4bcd5..f7e1ba9ff15 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/types.hlotxt @@ -4,25 +4,25 @@ HloModule tfcompile.1 // CHECK-LABEL: func @main() -> tensor { ENTRY %tfcompile.1 { - // CHECK-NEXT: %cst = constant {name = "constant.0"} dense<1.000000e+00> : tensor + // CHECK-NEXT: %cst = constant dense<1.000000e+00> : tensor %constant.0 = f32[] constant(1) - // CHECK-NEXT: %cst_0 = constant {name = "constant.1"} dense<1.000000e+00> : tensor + // CHECK-NEXT: %cst_0 = constant dense<1.000000e+00> : tensor %constant.1 = f64[] constant(1) - // CHECK-NEXT: %cst_1 = constant {name = "constant.2"} dense<1> : tensor + // CHECK-NEXT: %cst_1 = constant dense<1> : tensor %constant.2 = s8[] constant(1) - // CHECK-NEXT: %cst_2 = constant {name = "constant.3"} dense<1> : tensor + // CHECK-NEXT: %cst_2 = constant dense<1> : tensor %constant.3 = s16[] constant(1) - // CHECK-NEXT: %cst_3 = constant {name = "constant.4"} dense<1> : tensor + // CHECK-NEXT: %cst_3 = constant dense<1> : tensor %constant.4 = s32[] constant(1) - // CHECK-NEXT: %cst_4 = constant {name = "constant.5"} dense<1> : tensor + // CHECK-NEXT: %cst_4 = constant dense<1> : tensor %constant.5 = s64[] constant(1) - // CHECK-NEXT: %cst_5 = constant {name = "constant.6"} dense : tensor + // CHECK-NEXT: %cst_5 = constant dense : tensor // CHECK-NEXT: return %cst_5 : tensor ROOT %constant.6 = pred[] constant(1) } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt index 126bc88ec7a..f989104323a 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/while.hlotxt @@ -26,4 +26,4 @@ ENTRY %foo (arg0.1: s64[]) -> s64[] { // CHECK: "mhlo.return" // CHECK: }) : (tensor) -> tensor ROOT %while.2 = s64[] while(%arg0.1), body=%loop, condition=%cond -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/while.mlir b/tensorflow/compiler/mlir/xla/tests/translate/while.mlir index 61d7aadb23f..f852ef06421 100644 --- 
a/tensorflow/compiler/mlir/xla/tests/translate/while.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/while.mlir @@ -10,11 +10,11 @@ module { // CHECK: %[[A0]] = s64[] parameter(0) // CHECK: ROOT %compare.7 = pred[] compare(s64[] %[[A0]], s64[] %[[A0]]), direction=LT ^bb0(%arg1: tensor): - %1 = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT", name = "compare.2"} : (tensor, tensor) -> tensor + %1 = "mhlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor "mhlo.return"(%1) : (tensor) -> () }, { ^bb0(%arg1: tensor): - %1 = mhlo.add %arg1, %arg1 {name = "compare.0"} : tensor + %1 = mhlo.add %arg1, %arg1 : tensor "mhlo.return"(%1) : (tensor) -> () }) : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 2c733bb5ca2..9c85242dca8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -15,6 +15,7 @@ limitations under the License. // This file implements logic for lowering TensorFlow dialect to XLA dialect. +#include #include #include #include @@ -42,6 +43,7 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" @@ -70,7 +72,8 @@ constexpr char kShardingAttr[] = "mhlo.sharding"; class LegalizeTF : public PassWrapper { void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); + registry.insert(); } public: @@ -116,9 +119,9 @@ class LegalizeTF : public PassWrapper { static bool IsDefaultDataFormat(StringRef format) { return format == "NHWC"; } /// Returns the feature dimension for the given format and input type. -static size_t GetFeatureDimension(StringAttr format, +static size_t GetFeatureDimension(StringRef format, RankedTensorType inputType) { - return IsDefaultDataFormat(format.getValue()) ? inputType.getRank() - 1 : 1; + return IsDefaultDataFormat(format) ? inputType.getRank() - 1 : 1; } // Gets all integer values from the given attribute and push them to `values`. @@ -728,12 +731,33 @@ static void CreateWhile32(Location loc, int num_iterations, // BatchNorm op utilities. //===----------------------------------------------------------------------===// -static IntegerAttr getFeatureDimensionAttr(Builder &b, StringAttr format, +static IntegerAttr getFeatureDimensionAttr(Builder &b, StringRef format, Value input) { return b.getI64IntegerAttr( GetFeatureDimension(format, input.getType().cast())); } +//===----------------------------------------------------------------------===// +// FFT op utilities. +//===----------------------------------------------------------------------===// +// Returns the 1D i64 elements attribute populated with the inner-most dim of +// the value. +static DenseIntElementsAttr GetInnerDimFromValue(ShapedType type, + Builder *builder) { + if (type.getRank() == 0) { + return builder->getI64TensorAttr({}); + } + return builder->getI64TensorAttr(type.getShape().back()); +} + +// Returns True if the inner-most dim is static. 
+bool CheckInnerDimStatic(ShapedType type, Builder *builder) { + if (!type.hasRank()) { + return false; + } + return !type.isDynamicDim(type.getShape().size() - 1); +} + //===----------------------------------------------------------------------===// // MatMul op utilities. //===----------------------------------------------------------------------===// @@ -1104,7 +1128,7 @@ class ConvertBiasAddOp : public OpRewritePattern { PatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto feature_dim = GetFeatureDimension( - op.data_formatAttr(), op.value().getType().cast()); + op.data_format(), op.value().getType().cast()); auto bias_broadcast = Broadcast1DToFeatureDim(loc, op.value(), op.bias(), feature_dim, rewriter); rewriter.replaceOpWithNewOp(op, op.value(), bias_broadcast); @@ -1683,6 +1707,80 @@ class ConvertEinsumOp : public OpRewritePattern { } }; +template +class ConvertFFTOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + auto input_ty = op.input().getType().template cast(); + if (!input_ty.hasRank()) { + return failure(); + } + auto input_shape = input_ty.getShape(); + DenseIntElementsAttr fft_length_attr; + if (!matchPattern(op.fft_length(), m_Constant(&fft_length_attr))) { + return failure(); + } + int64_t fft_length; + if (fft_length_attr.getNumElements() != 0) { + fft_length = fft_length_attr.getValue(0).getInt(); + } else { + return failure(); + } + + std::string fft_string = "RFFT"; + if (typeid(OpTy) == typeid(TF::IRFFTOp)) { + fft_length = fft_length / 2 + 1; + fft_string = "IRFFT"; + } + auto loc = op.getLoc(); + + // The inner-most dim cannot be dynamic. + if (input_ty.isDynamicDim(input_shape.size() - 1)) { + return failure(); + } + + auto expected_shape = llvm::to_vector<4>(input_shape.drop_back()); + expected_shape.push_back(fft_length); + + // Zero pad or truncate the last axis + Value reshaped = op.input(); + SmallVector begin_indices(input_shape.size(), 0); + SmallVector strides(input_shape.size(), 1); + + // Last dim larger than fft_length, slice the input + if (input_shape.back() > fft_length) { + reshaped = rewriter.create( + op.getLoc(), + RankedTensorType::get(expected_shape, input_ty.getElementType()), + op.input(), GetI64ElementsAttr(begin_indices, &rewriter), + GetI64ElementsAttr(expected_shape, &rewriter), + GetI64ElementsAttr(strides, &rewriter)); + + // Last dim smaller than fft_length, zero-pad the input + } else if (input_ty.getShape().back() < fft_length) { + SmallVector no_padding(input_shape.size(), 0); + SmallVector padding(input_shape.size() - 1, 0); + padding.push_back(fft_length - input_shape.back()); + Value zero = + GetScalarConstOfType(input_ty.getElementType(), loc, 0, &rewriter); + reshaped = rewriter.create( + loc, RankedTensorType::get(expected_shape, input_ty.getElementType()), + op.input(), zero, GetI64ElementsAttr(no_padding, &rewriter), + GetI64ElementsAttr(padding, &rewriter), + GetI64ElementsAttr(no_padding, &rewriter)); + } + + rewriter.replaceOpWithNewOp(op, op.getType(), reshaped, fft_string, + rewriter.getI64TensorAttr(fft_length)); + return success(); + } +}; + +using ConvertRFFTOp = ConvertFFTOp; +using ConvertIRFFTOp = ConvertFFTOp; + // The base class to convert TensorFlow FusedBatchNormGrad*Op to HLO // BatchNormGradOp for training and a sequence of binary ops for inference. // TODO(b/145536565): move to legalize_tf_patterns.td if it applies. 
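The ConvertFFTOp pattern added above first zero-pads or slices the innermost dimension of the input so it matches fft_length, and only then emits a single mhlo.fft. A minimal sketch of the RFFT case, with hypothetical shapes; op and attribute spellings are approximate for this revision of MHLO:

func @rfft_pad(%x: tensor<3x5xf32>) -> tensor<3x5xcomplex<f32>> {
  %len = "tf.Const"() {value = dense<8> : tensor<1xi32>} : () -> tensor<1xi32>
  // fft_length = 8 but the input's last dimension is 5, so the pattern zero-pads it to 8.
  %0 = "tf.RFFT"(%x, %len) : (tensor<3x5xf32>, tensor<1xi32>) -> tensor<3x5xcomplex<f32>>
  return %0 : tensor<3x5xcomplex<f32>>
}
// Lowers to, roughly:
//   %zero = mhlo.constant dense<0.000000e+00> : tensor<f32>
//   %pad  = "mhlo.pad"(%x, %zero) {edge_padding_low = dense<0> : tensor<2xi64>,
//             edge_padding_high = dense<[0, 3]> : tensor<2xi64>,
//             interior_padding = dense<0> : tensor<2xi64>}
//           : (tensor<3x5xf32>, tensor<f32>) -> tensor<3x8xf32>
//   %fft  = "mhlo.fft"(%pad) {fft_type = "RFFT", fft_length = dense<8> : tensor<1xi64>}
//           : (tensor<3x8xf32>) -> tensor<3x5xcomplex<f32>>
// When the last dimension is larger than fft_length, an mhlo.slice is emitted instead of the pad.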
@@ -1716,7 +1814,7 @@ class ConvertFusedBatchNormGradBase act = rewriter.create(loc, act, kernel_type); auto feature_dim_attr = - getFeatureDimensionAttr(rewriter, op.data_formatAttr(), act); + getFeatureDimensionAttr(rewriter, op.data_format(), act); auto feature_dim = feature_dim_attr.getValue().getSExtValue(); // Gets the result values. @@ -1731,7 +1829,7 @@ class ConvertFusedBatchNormGradBase auto training_op = rewriter.create( loc, result_type, act, scale, mean, var, grad, op.epsilon(), - feature_dim_attr.getValue()); + feature_dim); x_backprop = rewriter.create(loc, training_op.getResult(), 0); @@ -1783,11 +1881,27 @@ class ConvertFusedBatchNormGradBase } x_backprop = rewriter.create(loc, x_backprop, act_ele_type); - // It doesn't matter what values we provide for the last 2 results. - rewriter.replaceOp(op, - {/*x_backprop=*/x_backprop, - /*scale_backprop=*/scale_backprop, - /*offset_backprop=*/offset_backprop, op.x(), op.x()}); + Value last_val[2]; + if (op.getResult(3).use_empty() && op.getResult(4).use_empty()) { + // It doesn't matter what values we provide for the last 2 results. + last_val[0] = last_val[1] = op.x(); + } else { + auto const_val = rewriter.create( + op.getLoc(), + DenseElementsAttr::get( + RankedTensorType::get({0}, getElementTypeOrSelf(op.getResult(3))), + 0.0)); + auto maybe_cast = [&](Value val, Type t) -> Value { + if (val.getType() == t) return val; + return rewriter.create(op.getLoc(), t, val); + }; + last_val[0] = maybe_cast(const_val, op.getResult(3).getType()); + last_val[1] = maybe_cast(const_val, op.getResult(4).getType()); + } + rewriter.replaceOp( + op, {/*x_backprop=*/x_backprop, + /*scale_backprop=*/scale_backprop, + /*offset_backprop=*/offset_backprop, last_val[0], last_val[1]}); return success(); } }; @@ -1810,7 +1924,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { LogicalResult matchAndRewrite(FusedBatchNormOpT op, PatternRewriter &rewriter) const override { auto feature_dim = - getFeatureDimensionAttr(rewriter, op.data_formatAttr(), op.x()); + getFeatureDimensionAttr(rewriter, op.data_format(), op.x()); auto input_type_tensor = op.x().getType().template cast(); auto input_element_type = input_type_tensor.getElementType(); @@ -1851,7 +1965,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { auto bn_train_op = rewriter.create( op.getLoc(), result_type, bn_train_input, op.scale(), op.offset(), - op.epsilon(), feature_dim.getValue()); + op.epsilon(), feature_dim.getInt()); // HLO op outputs a tuple of tensors. Extract those results. auto bn_train_op_result = bn_train_op.getResult(); Value y_out = rewriter.create( @@ -1938,7 +2052,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { op.getLoc(), /*result_type=*/bn_train_input_type_tensor, bn_train_input, op.scale(), op.offset(), op.mean(), op.variance(), op.epsilon(), - feature_dim.getValue()); + feature_dim.getInt()); // Convert back to input type to stay aligned with expected output type // for TF op. 
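The FusedBatchNorm lowerings touched above derive the feature dimension from data_format: the last axis for the default "NHWC", axis 1 otherwise. A small sketch of the training path with hypothetical shapes; the tuple-typed result follows this revision of MHLO:

func @fused_bn(%x: tensor<8x8x8x8xf32>, %scale: tensor<8xf32>, %offset: tensor<8xf32>,
               %mean: tensor<8xf32>, %var: tensor<8xf32>) -> tensor<8x8x8x8xf32> {
  %y, %bm, %bv, %r1, %r2, %r3 = "tf.FusedBatchNormV3"(%x, %scale, %offset, %mean, %var)
      {data_format = "NHWC", epsilon = 1.000000e-03 : f32, is_training = true}
      : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>)
      -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>)
  return %y : tensor<8x8x8x8xf32>
}
// The pattern emits, roughly:
//   %bn = "mhlo.batch_norm_training"(%x, %scale, %offset)
//           {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64}   // NHWC: rank - 1
//         : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>)
//         -> tuple<tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>>
// followed by mhlo.get_tuple_element to unpack y, batch_mean, and batch_variance.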
@@ -2376,6 +2490,12 @@ class ConvertMaxPoolOp : public OpRewritePattern { Type element_type = op.input().getType().template cast().getElementType(); if (!element_type.isSignlessIntOrFloat()) return failure(); + tensorflow::Padding padding; + if (!GetPaddingFromString(op.padding().str(), &padding).ok()) + return failure(); + if (padding == tensorflow::Padding::EXPLICIT) { + return failure(); + } Location loc = op.getLoc(); ConstOp init = GetScalarLimitConstOfType(element_type, loc, hlo::kInfinityLowest, &rewriter); @@ -3087,7 +3207,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { // axis. For instance, if there are 4 dims, we can support a // shrink_axis_mask of 0001 (1), 0011 (3), 0111 (7), or 1111 (15), but no // other. - bool shrink_axis_mask_ok = op.shrink_axis_mask().isMask(); + bool shrink_axis_mask_ok = llvm::isMask_64(op.shrink_axis_mask()); if (!shrink_axis_mask_ok) return rewriter.notifyMatchFailure( op, @@ -3096,27 +3216,27 @@ class ConvertStridedSliceOp : public OpRewritePattern { // When begin/end values are dynamic, the ellipsis mask, if set, must refer // to the last dimension. - int ellipsis_mask = op.ellipsis_mask().getZExtValue(); + int ellipsis_mask = op.ellipsis_mask(); if (!(ellipsis_mask == 0 || ellipsis_mask == (1 << last_dim))) return rewriter.notifyMatchFailure( op, "requires that ellipsis_mask, if set, refer to the last dimension of " "input (when begin/end values are dynamic)"); - APInt begin_mask = op.begin_mask(); - if (!begin_mask.isNullValue()) + uint64_t begin_mask = op.begin_mask(); + if (begin_mask) return rewriter.notifyMatchFailure( op, "requires that begin_mask is either set to 0 or not set when " "begin/end values are dynamic"); - APInt end_mask = op.end_mask(); - if (!end_mask.isNullValue()) + uint64_t end_mask = op.end_mask(); + if (end_mask) return rewriter.notifyMatchFailure( op, "requires that end_mask is either set to 0 or not set when begin/end " "values are dynamic"); - APInt new_axis_mask = op.new_axis_mask(); - if (!new_axis_mask.isNullValue()) + uint64_t new_axis_mask = op.new_axis_mask(); + if (new_axis_mask) return rewriter.notifyMatchFailure( op, "requires that new_axis_mask is either set to 0 or not set when " @@ -3148,11 +3268,12 @@ class ConvertStridedSliceOp : public OpRewritePattern { SmallVector slice_begin_indices; // For the dimensions that are to be sliced, all have slice sizes of 1. SmallVector slice_sizes(slicing_dim_size, 1); - auto input_element_ty = input_ty.getElementType(); + auto begin_element_ty = + op.begin().getType().cast().getElementType(); // Scalar tensor type. - TensorType type = RankedTensorType::get(/*shape=*/{}, input_element_ty); + TensorType type = RankedTensorType::get(/*shape=*/{}, begin_element_ty); Location loc = op.getLoc(); - auto zero = GetScalarConstOfType(input_element_ty, loc, 0, &rewriter); + auto zero = GetScalarConstOfType(begin_element_ty, loc, 0, &rewriter); for (int d = 0; d < slicing_dim_size; ++d) { auto index = rewriter.create( loc, op.begin(), GetI64ElementsAttr({d}, &rewriter), @@ -3163,7 +3284,7 @@ class ConvertStridedSliceOp : public OpRewritePattern { // If the index is negative, wrap it around with dimension size. 
auto index_negative = rewriter.create(loc, reshaped_index, zero); - auto input_val = GetScalarConstOfType(input_element_ty, loc, + auto input_val = GetScalarConstOfType(begin_element_ty, loc, input_shape[d], &rewriter); auto wrapped_index = rewriter.create(loc, input_val, reshaped_index); @@ -3502,6 +3623,13 @@ class ConvertLinSpaceOp : public OpRewritePattern { /// `is_accumulation` controls whether it uses higher precision for the actual /// reduction. This is set to false for ops like max where there is no precision /// concerns. +// +// The Derived class should have a static method to return the initial value to +// use for reduction: +// static Value GetInitialValue(Type reduce_element_type, Location loc, +// PatternRewriter *rewriter); +// The reduce_element_type is guaranteed to be a float, int, or complex type +// suitable for use with GetScalarConstOfType or GetScalarLimitConstOfType. template class GenericConvertReductionOp : public OpRewritePattern { @@ -3535,6 +3663,14 @@ class GenericConvertReductionOp : public OpRewritePattern { Location loc = op.getLoc(); Type element_type = input_ty.getElementType(); + + // Only float, int, and complex types are currently supported. + if (!element_type.isa() && !element_type.isa() && + !element_type.isa()) { + return rewriter.notifyMatchFailure( + op, "element type must be float, int, or complex type"); + } + // Convert to an accumulation type to not lose precision when doing // repeated arithmetic operations. Type reduce_element_type = @@ -4372,7 +4508,7 @@ class ConvertOneHotOp : public OpRewritePattern { } int64_t depth = depth_attr.getValue({}).getSExtValue(); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis == -1) axis = indices_shape.size(); llvm::SmallVector broadcast_dims(indices_shape.size()); @@ -4602,10 +4738,8 @@ class ConvertTopKV2Op : public OpRewritePattern { &rewriter); // Get the sorted input and index tuple element. - auto tuple_first_element = - rewriter.create(op.getLoc(), sort_op, 0); - auto tuple_second_element = - rewriter.create(op.getLoc(), sort_op, 1); + auto tuple_first_element = sort_op.getResult(0); + auto tuple_second_element = sort_op.getResult(1); SmallVector begin_indices(input_rank, 0); auto end_indices = llvm::to_vector<4>(input_type.getShape()); @@ -4648,7 +4782,7 @@ class ConvertUnpackOp : public OpRewritePattern { if (!value_type) return failure(); int64_t value_rank = value_type.getRank(); - int64_t axis = op.axis().getSExtValue(); + int64_t axis = op.axis(); if (axis < 0) axis += value_rank; // Parameters for constructing each slice. @@ -4891,8 +5025,7 @@ class ConvertRandomShuffleOp : public OpRewritePattern { BuildSortComparisonBody({i32_type, input_type.getElementType()}, /*direction=*/"LT", &sorted.comparator(), &rewriter); - current = rewriter.create(op.getLoc(), - sorted.getResult(), 1); + current = sorted.getResult(1); } rewriter.replaceOp(op, current); return success(); @@ -5090,6 +5223,46 @@ class ConvertXlaDynamicUpdateSliceOp } }; +// Converts ClipByValue to XLA's clamp operation. Includes the broadcasting +// semantics for static and dynamic cases. 
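The new ConvertClipByValueOp below maps tf.ClipByValue onto mhlo.clamp; when clip_value_min and clip_value_max do not already have the operand's type, they are first broadcast to it using the shape computed by the pattern. A minimal sketch for the already-matching case, with hypothetical shapes:

func @clip(%t: tensor<2x3xf32>, %lo: tensor<2x3xf32>, %hi: tensor<2x3xf32>) -> tensor<2x3xf32> {
  %0 = "tf.ClipByValue"(%t, %lo, %hi)
      : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
  return %0 : tensor<2x3xf32>
}
// Lowers to a single clamp; note the operand order is (min, operand, max):
//   %0 = "mhlo.clamp"(%lo, %t, %hi)
//       : (tensor<2x3xf32>, tensor<2x3xf32>, tensor<2x3xf32>) -> tensor<2x3xf32>
// Scalar min/max instead go through the broadcast path first.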
+class ConvertClipByValueOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::ClipByValueOp op, + PatternRewriter &rewriter) const override { + Value input = op.t(); + Value min = op.clip_value_min(); + Value max = op.clip_value_max(); + + auto input_ty = input.getType().cast(); + auto min_ty = min.getType().cast(); + auto max_ty = max.getType().cast(); + + if (!input_ty.hasRank() || !min_ty.hasRank() || !max_ty.hasRank()) { + return failure(); + } + + auto shape = rewriter.create( + op.getLoc(), + RankedTensorType::get({input_ty.getRank()}, rewriter.getI32Type()), + input); + + if (min_ty != input_ty) { + min = + rewriter.create(op.getLoc(), input_ty, min, shape); + } + + if (max_ty != input_ty) { + max = + rewriter.create(op.getLoc(), input_ty, max, shape); + } + + rewriter.replaceOpWithNewOp(op, input_ty, min, input, max); + return success(); + } +}; + // Converts the Cumsum or Cumprod TensorFlow op to the HLO ReduceWindow op by // setting appropriate window dimensions, with the given aggregation op as the // reduction function. The input tensor needs to have a static shape, and 'axis' @@ -5229,6 +5402,101 @@ class ConvertShapeOp : public OpRewritePattern { } }; +class ConvertDynamicReshapeOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::ReshapeOp op, + PatternRewriter &rewriter) const override { + auto tensor = op.tensor(); + auto shape = op.shape(); + + auto tensor_ty = tensor.getType().cast(); + auto shape_ty = shape.getType().cast(); + auto result_ty = op.getType().cast(); + + if (!result_ty.hasRank() || !tensor_ty.hasRank() || !shape_ty.hasRank()) { + return failure(); + } + + // Handle with the static case. + if (result_ty.hasStaticShape()) { + return failure(); + } + + rewriter.replaceOpWithNewOp(op, result_ty, tensor, + shape); + return success(); + } +}; + +class ConvertDynamicExpandDimsOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::ExpandDimsOp op, + PatternRewriter &rewriter) const override { + auto input = op.input(); + auto input_ty = input.getType().cast(); + auto result_ty = op.getType().cast(); + if (!result_ty.hasRank() || !input_ty.hasRank() || + result_ty.hasStaticShape()) { + return failure(); + } + + DenseIntElementsAttr expand_dims_attr; + if (!matchPattern(op.dim(), m_Constant(&expand_dims_attr))) { + return failure(); + } + + auto shape = rewriter.create( + op.getLoc(), + RankedTensorType::get({input_ty.getRank()}, rewriter.getIndexType()), + input); + auto expand_dims = llvm::to_vector<6>(expand_dims_attr.getIntValues()); + + llvm::SmallVector dims; + dims.resize(result_ty.getRank()); + + auto inserted_dim = expand_dims_attr.getValue({}) + .cast() + .getValue() + .getSExtValue(); + + // Handle the negative value use case. + if (inserted_dim < 0) { + inserted_dim += result_ty.getRank(); + // This means the value is completely incorrect, just return. + if (inserted_dim < 0) { + return failure(); + } + } + + dims[inserted_dim] = rewriter.create(op.getLoc(), 1); + + for (int i = 0; i < dims.size() - 1; i++) { + // Add the extracted dim. + auto index = rewriter.create(op.getLoc(), i); + auto dim = rewriter.create( + op.getLoc(), rewriter.getIndexType(), shape, index); + + dims[i >= inserted_dim ? 
i + 1 : i] = dim; + } + + auto from_extents = rewriter.create( + op.getLoc(), shape::ShapeType::get(op.getContext()), dims); + + auto to_extent_tensor = rewriter.create( + op.getLoc(), + RankedTensorType::get({result_ty.getRank()}, rewriter.getIndexType()), + from_extents); + + rewriter.replaceOpWithNewOp(op, result_ty, input, + to_extent_tensor); + return success(); + } +}; + // Converts a TF QR op to HLO. class ConvertQrOp : public OpRewritePattern { public: @@ -5728,7 +5996,7 @@ class ConvertQrOp : public OpRewritePattern { void EmitLegalizationErrors(Operation *op, const DenseSet &nonlegalized_ops) { // Track the legalization failures by mapping op name to information about - // that failure: the number of unlegalized occurances of the op, and one + // that failure: the number of unlegalized occurrences of the op, and one // example operation that failed. std::map> op_name_to_error_info; DenseSet error_ops; @@ -5823,12 +6091,6 @@ LogicalResult legalizeTF( ConversionTarget target(*context); if (legalize_chlo) { target.addIllegalDialect(); - - // Mark ConstantLikeOp as dynamically legal only when it doesn't have a - // static result type so that it gets canonicalized to MHLO constant. - target.addDynamicallyLegalOp([](Operation *op) { - return !op->getResultTypes().front().cast().hasStaticShape(); - }); } else { target.addLegalDialect(); } @@ -5858,14 +6120,16 @@ LogicalResult legalizeTF( void PopulateLegalizeTfPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { - populateWithGenerated(context, patterns); + populateWithGenerated(context, *patterns); patterns->insert< ConvertAllOp, ConvertAnyOp, ConvertArgMaxOp, ConvertBatchMatMulV2Op, ConvertBiasAddOp, ConvertBroadcastToOp, ConvertBF16FloorDivOp, - ConvertConv2DOp, ConvertConv3DOp, ConvertDepthConv2DOp, - ConvertConv2DBackpropFilterOp, ConvertConv3DBackpropFilterOp, - ConvertConv2DBackpropInputOp, ConvertConv3DBackpropInputOp, - ConvertCumprodOp, ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, + ConvertClipByValueOp, ConvertConv2DOp, ConvertConv3DOp, + ConvertDepthConv2DOp, ConvertConv2DBackpropFilterOp, + ConvertConv3DBackpropFilterOp, ConvertConv2DBackpropInputOp, + ConvertConv3DBackpropInputOp, ConvertCumprodOp, ConvertCumsumOp, + ConvertDiagPartOp, ConvertDynamicExpandDimsOp, ConvertDynamicReshapeOp, + ConvertEinsumOp, ConvertRFFTOp, ConvertIRFFTOp, ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, ConvertFusedBatchNormV2Op, ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc index 1f884b1bdea..6320ad2032b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_communication.cc @@ -60,6 +60,10 @@ const char kXlaHostTransferOriginalTypeAttr[] = // ops other than certain control flow ops (`mhlo.if`, `mhlo.while`). 
class LegalizeTFCommunication : public PassWrapper> { + void getDependentDialects(DialectRegistry& registry) const override { + registry.insert(); + } + public: void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index 760252331e0..692b2af7cff 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -20,30 +20,24 @@ limitations under the License. #include #include #include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/core/util/tensor_format.h" using mlir::PassRegistration; @@ -64,7 +58,7 @@ createLegalizeTFControlFlowPass() { namespace { -void Detuple(Value tuple, Operation::result_range replace, OpBuilder* builder) { +void Detuple(Value tuple, ValueRange replace, OpBuilder* builder) { // De-tuple the results of the xla hlo if result. for (auto result_it : llvm::enumerate(replace)) { auto get_tuple_value = builder->create( @@ -102,7 +96,7 @@ void ImportXlaRegion(mlir::FuncOp func, Region* dest_region, Location loc, } } -void LowerIf(TF::IfOp op, ModuleOp module) { +void LowerIf(TF::IfOp op) { Location loc = op.getLoc(); OpBuilder builder(op); @@ -111,7 +105,7 @@ void LowerIf(TF::IfOp op, ModuleOp module) { SmallVector inputs(op.input()); auto tuple_input = builder.create(loc, inputs); - // Create the new if op with tuple inputs. + // Create the new `mhlo.if` op with tuple inputs. auto result_type = builder.getTupleType(op.getResultTypes()); auto if_op = builder.create(loc, result_type, op.cond(), tuple_input, tuple_input); @@ -119,15 +113,15 @@ void LowerIf(TF::IfOp op, ModuleOp module) { // Import the regions for both the true and false cases. These regions // must be updated to tuple the return results together and use the xla hlo // return op. - ImportXlaRegion(op.then_func(), &if_op.true_branch(), loc); - ImportXlaRegion(op.else_func(), &if_op.false_branch(), loc); + ImportXlaRegion(op.then_function(), &if_op.true_branch(), loc); + ImportXlaRegion(op.else_function(), &if_op.false_branch(), loc); - // De-tuple the results of the xla hlo if result. + // De-tuple the results of the `mhlo.if`. 
Detuple(if_op.getResult(), op.getResults(), &builder); op.erase(); } -void LowerCase(TF::CaseOp op, ModuleOp module) { +void LowerCase(TF::CaseOp op) { Location loc = op.getLoc(); OpBuilder builder(op); @@ -137,17 +131,16 @@ void LowerCase(TF::CaseOp op, ModuleOp module) { auto tuple_input = builder.create(loc, inputs); // Create replica of input tuple for each branch - SmallVector n_tuple_inputs(op.branches().size(), tuple_input); + SmallVector n_tuple_inputs(op.num_branches(), tuple_input); - // Create the new case op with tuple inputs. + // Create the new `mhlo.case` op with tuple inputs. auto case_op = builder.create(loc, op.getResultTypes(), op.branch_index(), n_tuple_inputs, op.branches().size()); // Import the regions for all branches. - for (unsigned i = 0; i < op.branches().size(); ++i) { - mlir::FuncOp branch_func = module.lookupSymbol( - op.branches()[i].cast()); + for (unsigned i = 0; i < op.num_branches(); ++i) { + mlir::FuncOp branch_func = op.branch_function(i); ImportXlaRegion(branch_func, &case_op.branches()[i], loc, /*tuple_return=*/false); } @@ -156,7 +149,7 @@ void LowerCase(TF::CaseOp op, ModuleOp module) { op.erase(); } -void LowerWhile(TF::WhileOp op, ModuleOp module) { +void LowerWhile(TF::WhileOp op) { Location loc = op.getLoc(); OpBuilder builder(op); @@ -166,36 +159,238 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { builder.setInsertionPoint(op); Value tuple_input = builder.create(loc, inputs); - // Create the new while op with tuple inputs. + // Create the new `mhlo.while` op with tuple inputs. auto while_op = builder.create( loc, builder.getTupleType(op.getResultTypes()), tuple_input); // Import the regions for both the cond and body. These regions must be // updated to tuple the return results together and use the xla hlo return op. - ImportXlaRegion(op.body_func(), &while_op.body(), loc); - ImportXlaRegion(op.cond_func(), &while_op.cond(), loc, + ImportXlaRegion(op.body_function(), &while_op.body(), loc); + ImportXlaRegion(op.cond_function(), &while_op.cond(), loc, /*tuple_return=*/false); - // De-tuple the results of the xla hlo while. + // De-tuple the results of the `mhlo.while`. + Detuple(while_op.getResult(), op.getResults(), &builder); + op.erase(); +} + +// Replaces all block arguments of a block with a single block arg of Tuple +// type `tuple_type`. Single block arguments are removed and remapped to +// get_tuple_element(tuple_arg, index). +void ReplaceBlockArgs(Block* block, Type tuple_type, OpBuilder* builder) { + auto tuple_arg = block->addArgument(tuple_type); + Detuple(tuple_arg, block->getArguments().drop_back(1), builder); + for (int i = block->getNumArguments() - 2; i >= 0; --i) + block->eraseArgument(i); +} + +// Replaces implicitly captured value uses with tuple block argument. +// get_tuple_element's are created to extract specific values. Values from +// get_tuple_element's are returned in the order of `implicit_inputs`. 
+llvm::SmallVector ReplaceImplicitInputs( + Block* block, int offset, ArrayRef implicit_inputs, + OpBuilder* builder) { + llvm::SmallVector implicit_input_elements; + implicit_input_elements.reserve(implicit_inputs.size()); + + Region* region = block->getParent(); + assert(block->getNumArguments() == 1); + + BlockArgument tuple_arg = block->getArgument(0); + for (auto& implicit_input : llvm::enumerate(implicit_inputs)) { + Value implicit_input_value = implicit_input.value(); + auto get_tuple_element = builder->create( + implicit_input_value.getLoc(), tuple_arg, + implicit_input.index() + offset); + implicit_input_elements.emplace_back(get_tuple_element.getResult()); + for (auto& use : + llvm::make_early_inc_range(implicit_input_value.getUses())) { + if (!region->isAncestor(use.getOwner()->getParentRegion())) continue; + use.set(get_tuple_element.getResult()); + } + } + + return implicit_input_elements; +} + +// Finds and replaces implicitly captured value uses with tuple block argument. +// A tuple of implicitly captured values is also created and returned, for use +// as an operand to the associated mhlo control flow op. +Value TupleImplicitInputs(Region& region, Location loc, OpBuilder* builder) { + llvm::SetVector implicit_inputs; + getUsedValuesDefinedAbove(region, region, implicit_inputs); + llvm::ArrayRef implicit_inputs_ref = implicit_inputs.getArrayRef(); + Value tuple_input = builder->create(loc, implicit_inputs_ref); + Block& block = region.front(); + // `tf.CaseRegion`/`tf.IfRegion` are expected to have no block arguments and + // instead all inputs used by their branch regions are implicitly captured + // from above. + assert(block.getNumArguments() == 0); + block.addArgument(tuple_input.getType()); + builder->setInsertionPointToStart(&block); + ReplaceImplicitInputs(&block, /*offset=*/0, implicit_inputs_ref, builder); + return tuple_input; +} + +// Replaces block terminator (tf.Yield) with `mhlo.return`. Additional results +// can be returned if `extra_results` is not empty. If `tuple_return` is +// set, a tuple of the return values will be set as the terminator operand. +void ReplaceTerminator(Block* block, ArrayRef extra_results, + OpBuilder* builder, bool tuple_return = true) { + Operation* terminator = block->getTerminator(); + assert(isa(terminator)); + Location loc = terminator->getLoc(); + + builder->setInsertionPoint(terminator); + auto results = llvm::to_vector<4>(terminator->getOperands()); + results.append(extra_results.begin(), extra_results.end()); + if (tuple_return) { + auto tuple_results = builder->create(loc, results); + builder->create(loc, tuple_results.getResult()); + } else { + builder->create(loc, results); + } + + terminator->erase(); +} + +void LowerIfRegion(TF::IfRegionOp op) { + Location loc = op.getLoc(); + OpBuilder builder(op); + + // Tuple implicit inputs per region and update terminators to return tuples. + builder.setInsertionPoint(op); + Value then_input = TupleImplicitInputs(op.then_branch(), loc, &builder); + ReplaceTerminator(&op.then_branch().front(), /*extra_results=*/{}, &builder); + + builder.setInsertionPoint(op); + Value else_input = TupleImplicitInputs(op.else_branch(), loc, &builder); + ReplaceTerminator(&op.else_branch().front(), /*extra_results=*/{}, &builder); + + // Create the new `mhlo.if` op with tuple inputs and take ownership of regions + // from `tf.IfRegion` op. 
+ builder.setInsertionPoint(op); + auto result_type = builder.getTupleType(op.getResultTypes()); + auto if_op = builder.create(loc, result_type, op.cond(), + then_input, else_input); + if_op.true_branch().takeBody(op.then_branch()); + if_op.false_branch().takeBody(op.else_branch()); + + // De-tuple the results of the `mhlo.if`. + Detuple(if_op.getResult(), op.getResults(), &builder); + op.erase(); +} + +void LowerCaseRegion(TF::CaseRegionOp op) { + Location loc = op.getLoc(); + OpBuilder builder(op); + + llvm::SmallVector branch_inputs; + branch_inputs.reserve(op.branches().size()); + // Tuple implicit inputs per region and update terminators. + for (Region& region : op.branches()) { + builder.setInsertionPoint(op); + Value branch_input = TupleImplicitInputs(region, loc, &builder); + branch_inputs.emplace_back(branch_input); + ReplaceTerminator(®ion.front(), /*extra_results=*/{}, &builder, + /*tuple_return=*/false); + } + + // Create the new `mhlo.case` op with tuple inputs and take ownership of + // regions from `tf.CaseRegion` op. + builder.setInsertionPoint(op); + auto case_op = + builder.create(loc, op.getResultTypes(), op.branch_index(), + branch_inputs, branch_inputs.size()); + for (auto region : llvm::zip(case_op.branches(), op.branches())) + std::get<0>(region).takeBody(std::get<1>(region)); + + op.replaceAllUsesWith(case_op.getResults()); + op.erase(); +} + +void LowerWhileRegion(TF::WhileRegionOp op) { + Location loc = op.getLoc(); + OpBuilder builder(op); + + // XLA prefers tuple arguments for control flow due to XLA not supporting + // multiple return values. + SmallVector inputs(op.input()); + const int inputs_size = inputs.size(); + llvm::SetVector implicit_inputs; + getUsedValuesDefinedAbove(op.getOperation()->getRegions(), implicit_inputs); + inputs.append(implicit_inputs.begin(), implicit_inputs.end()); + + builder.setInsertionPoint(op); + Value tuple_input = builder.create(loc, inputs); + + // Create the new `mhlo.while` op with tuple inputs. Implicit inputs are also + // returned. + auto while_result_types = llvm::to_vector<4>(op.getResultTypes()); + while_result_types.reserve(while_result_types.size() + + implicit_inputs.size()); + for (const auto& implicit_input : implicit_inputs) + while_result_types.emplace_back(implicit_input.getType()); + auto while_op = builder.create( + loc, builder.getTupleType(while_result_types), tuple_input); + + // Rewrite cond and associated block arguments and terminator. Ownership of + // cond region is transfered over from `tf.WhileRegion` to `mhlo.while`. + Region& cond = while_op.cond(); + cond.takeBody(op.cond()); + Block& cond_block = cond.front(); + builder.setInsertionPointToStart(&cond_block); + ReplaceBlockArgs(&cond_block, tuple_input.getType(), &builder); + ReplaceImplicitInputs(&cond_block, inputs_size, implicit_inputs.getArrayRef(), + &builder); + // Cond always returns a single result of bool type. + ReplaceTerminator(&cond_block, /*extra_results=*/{}, &builder, + /*tuple_return=*/false); + + // Rewrite body and associated block arguments and terminator. Ownership of + // body region is transfered over from `tf.WhileRegion` to `mhlo.while`. + Region& body = while_op.body(); + body.takeBody(op.body()); + Block& body_block = body.front(); + builder.setInsertionPointToStart(&body_block); + ReplaceBlockArgs(&body_block, tuple_input.getType(), &builder); + // Capture implicit inputs that were added as a tuple block arguments. These + // are to be returned by the body in addition to explicit inputs. 
+ auto implicit_input_elements = ReplaceImplicitInputs( + &body_block, inputs_size, implicit_inputs.getArrayRef(), &builder); + ReplaceTerminator(&body_block, implicit_input_elements, &builder); + + // De-tuple the results of the `mhlo.while`. + builder.setInsertionPoint(op); Detuple(while_op.getResult(), op.getResults(), &builder); op.erase(); } } // namespace void LegalizeTFControlFlow::runOnOperation() { - auto module = getOperation(); - - module.walk([&](Operation* op) { + getOperation().walk([&](Operation* op) { if (auto while_op = dyn_cast(op)) { - LowerWhile(while_op, module); + LowerWhile(while_op); + return; + } + if (auto while_region_op = dyn_cast(op)) { + LowerWhileRegion(while_region_op); return; } if (auto if_op = dyn_cast(op)) { - LowerIf(if_op, module); + LowerIf(if_op); + return; + } + if (auto if_region_op = dyn_cast(op)) { + LowerIfRegion(if_region_op); return; } if (auto case_op = dyn_cast(op)) { - LowerCase(case_op, module); + LowerCase(case_op); + return; + } + if (auto case_region_op = dyn_cast(op)) { + LowerCaseRegion(case_region_op); return; } }); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 73ce305091c..52bbbf6f9da 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -31,7 +31,7 @@ def IEEEFloatTensor : TensorOf<[F16, F32, F64]>; //===----------------------------------------------------------------------===// def FeatureDimension : NativeCodeCall< - "getFeatureDimensionAttr($_builder, $0, $1)">; + "getFeatureDimensionAttr($_builder, $0.getValue(), $1)">; def FalseBoolAttr : AttrConstraint>; def TrueBoolAttr : AttrConstraint>; @@ -86,7 +86,7 @@ def AreBroadcastCompatible : Constraint, "types must be broadcastable">; class DirectBinaryPat - : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), + : Pat<(FromOp AnyTensor:$l, AnyTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; foreach fromToBinPair = [[TF_AddOp, HLOClient_BroadcastAddOp], @@ -285,9 +285,19 @@ def : Pat<(TF_AllToAllOp AnyRankedTensor:$input, (TF_ConstOp $group_assignment), // FFT op patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_RFFTOp $input, (TF_ConstOp I32ElementsAttr:$fft_length)), - (HLO_FftOp $input, HLO_FFT_TYPE_RFFT, - (CastElementsToI64Elements $fft_length))>; +def GetInnerDimFromValue : NativeCodeCall< + "GetInnerDimFromValue($0.getType().cast(), &$_builder)">; + +def CheckInnerDimStatic + : Constraint(), &$_builder)">>; + +def : Pat<(TF_FFTOp:$res $input), + (HLO_FftOp $input, HLO_FFT_TYPE_FFT, (GetInnerDimFromValue $res)), + [(CheckInnerDimStatic $input)]>; + +def : Pat<(TF_IFFTOp:$res $input), + (HLO_FftOp $input, HLO_FFT_TYPE_IFFT, (GetInnerDimFromValue $res)), + [(CheckInnerDimStatic $input)]>; //===----------------------------------------------------------------------===// // GatherV2 op patterns. 
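The region-based control-flow lowerings added in legalize_tf_control_flow.cc above (LowerIfRegion, LowerCaseRegion, LowerWhileRegion) all follow the same recipe: implicitly captured values are packed into a tuple operand, branch terminators are rewritten to mhlo.return, and the tuple result is unpacked with mhlo.get_tuple_element. A minimal sketch for tf.IfRegion; op spellings are approximate for this revision:

func @if_region(%pred: tensor<i1>, %a: tensor<f32>, %b: tensor<f32>) -> tensor<f32> {
  %0 = "tf.IfRegion"(%pred) ( {
    "tf.Yield"(%a) : (tensor<f32>) -> ()
  },  {
    "tf.Yield"(%b) : (tensor<f32>) -> ()
  }) {is_stateless = true} : (tensor<i1>) -> tensor<f32>
  return %0 : tensor<f32>
}
// After LowerIfRegion, roughly: each branch receives a tuple of its implicit captures.
//   %then_in = "mhlo.tuple"(%a) : (tensor<f32>) -> tuple<tensor<f32>>
//   %else_in = "mhlo.tuple"(%b) : (tensor<f32>) -> tuple<tensor<f32>>
//   %res = "mhlo.if"(%pred, %then_in, %else_in) ( {
//   ^bb0(%arg: tuple<tensor<f32>>):
//     %v = "mhlo.get_tuple_element"(%arg) {index = 0 : i32} : (tuple<tensor<f32>>) -> tensor<f32>
//     %t = "mhlo.tuple"(%v) : (tensor<f32>) -> tuple<tensor<f32>>
//     "mhlo.return"(%t) : (tuple<tensor<f32>>) -> ()
//   },  {
//     ... analogous block for the else branch, fed by %else_in ...
//   }) : (tensor<i1>, tuple<tensor<f32>>, tuple<tensor<f32>>) -> tuple<tensor<f32>>
//   %out = "mhlo.get_tuple_element"(%res) {index = 0 : i32} : (tuple<tensor<f32>>) -> tensor<f32>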
@@ -562,12 +572,14 @@ def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (TF_ConstOp $axis)), foreach Mapping = [ [TF_AbsOp, HLO_AbsOp], [TF_AcosOp, HLOClient_AcosOp], + [TF_AtanOp, HLOClient_AtanOp], [TF_CeilOp, HLO_CeilOp], [TF_ComplexAbsOp, HLO_AbsOp], [TF_CosOp, HLO_CosOp], [TF_ExpOp, HLO_ExpOp], [TF_FloorOp, HLO_FloorOp], [TF_ImagOp, HLO_ImagOp], + [TF_InvertOp, HLO_NotOp], [TF_IsFiniteOp, HLO_IsFiniteOp], [TF_LogOp, HLO_LogOp], [TF_Log1pOp, HLO_Log1pOp], @@ -576,27 +588,16 @@ foreach Mapping = [ [TF_RealOp, HLO_RealOp], [TF_RsqrtOp, HLO_RsqrtOp], [TF_SigmoidOp, HLO_LogisticOp], + [TF_SinhOp, HLOClient_SinhOp], [TF_SinOp, HLO_SinOp], [TF_SqrtOp, HLO_SqrtOp], [TF_TanhOp, HLO_TanhOp], + [TF_TanOp, HLOClient_TanOp], ] in { def : Pat<(Mapping[0] HLO_Tensor:$input), (Mapping[1] $input)>; } -// Expand acos to MHLO dialect as follows: -// acos(x) = 2 * atan(sqrt(1 - x^2) / (1 + x)) if x != -1 -// = pi if x == -1 -def : Pat<(HLOClient_AcosOp $input), (HLO_SelectOp - (HLO_CompareOp $input, (HLO_ConstantLike<"0"> $input), - HLO_COMPARISON_DIRECTION_NE), - (HLO_MulOp (HLO_ConstantLike<"2.0f"> $input), - (HLO_Atan2Op - (HLO_SqrtOp (HLO_SubOp - (HLO_ConstantLike<"1"> $input), (HLO_MulOp $input, $input))), - (HLO_AddOp (HLO_ConstantLike<"1"> $input), $input))), - (HLO_ConstantLike<"M_PI"> $input))>; - // TODO(bixia): Lower Cast with a Complex type source operand or with // Truncate=True for floating point value conversions. def : Pat<(TF_CastOp HLO_Tensor:$arg, ConstBoolAttrFalse), diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 2f73d1a54df..b392e91e22f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -146,13 +146,12 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -162,7 +161,6 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -179,6 +177,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -200,6 +199,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -228,6 +228,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -240,6 +241,8 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get() diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc index ef362d95b97..33cd2c66c45 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.cc @@ -29,7 +29,9 @@ limitations under the License. 
#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -40,6 +42,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/hlo_function_importer.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" @@ -110,7 +113,7 @@ Status ConvertModule(std::unique_ptr hlo_module, ModuleOp module, // Run all HLO passes to produce an optimized module. auto result_or = backend->compiler()->RunHloPassesAndBufferAssignement( std::move(hlo_module), backend->default_stream_executor(), - backend->memory_allocator()); + backend->memory_allocator(), optimize_xla_hlo); TF_RETURN_WITH_CONTEXT_IF_ERROR(result_or.status(), "running XLA pass pipeline"); std::unique_ptr optimized_hlo_module = @@ -276,27 +279,137 @@ Status LhloDialectEmitter::HandleSort(HloInstruction* instr) { return EmitSortOp(instr).status(); } -Status LhloDialectEmitter::CreateView(const HloInstruction* instr, - const Shape& current_shape, - ::xla::ShapeIndex* current_shape_index, - SmallVectorImpl* values) { - if (current_shape.IsTuple()) { - for (int i = 0; i < current_shape.tuple_shapes().size(); i++) { - current_shape_index->push_back(i); - TF_RETURN_IF_ERROR(CreateView(instr, current_shape.tuple_shapes(i), - current_shape_index, values)); - current_shape_index->pop_back(); +// Walks MHLO::TupleOp recursively. +Status WalkTuplePostOrder(Value v, + const std::function& visitor) { + if (auto* op = v.getDefiningOp()) { + if (auto tuple = dyn_cast(op)) { + for (Value sub_v : tuple.val()) { + TF_RETURN_IF_ERROR(WalkTuplePostOrder(sub_v, visitor)); + } + return Status::OK(); } - return Status::OK(); } + return visitor(v); +} + +// This function removes all uses of a fused region argument, and rewire those +// uses to a `tensor_load %memref`, where %memref is caller argument. +// +// It also flattens all input/output tuples into more region arguments / +// results. 
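The tuple handling above amounts to a post-order traversal that flattens nested tuples into a flat list of leaf values. A stand-alone Python sketch of the same traversal, using plain tuples instead of mhlo::TupleOp (names are illustrative, not from the patch):

# Sketch: post-order walk over a nested tuple structure, visiting leaves
# left to right, mirroring what WalkTuplePostOrder does for mhlo tuples.
def walk_tuple_post_order(value, visit):
    if isinstance(value, tuple):
        for sub in value:
            walk_tuple_post_order(sub, visit)
        return
    visit(value)


leaves = []
walk_tuple_post_order(("a", ("b", "c"), "d"), leaves.append)
assert leaves == ["a", "b", "c", "d"]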
+StatusOr LhloDialectEmitter::RewriteFusionOperand( + const HloInstruction* root, const Shape& shape, + ::xla::ShapeIndex* shape_index, OpBuilder* b, Location loc) { + if (shape.IsTuple()) { + llvm::SmallVector values; + for (int i = 0; i < shape.tuple_shapes_size(); i++) { + shape_index->push_back(i); + TF_ASSIGN_OR_RETURN( + auto v, RewriteFusionOperand(root, shape.tuple_shapes(i), shape_index, + b, loc)); + values.push_back(v); + shape_index->pop_back(); + } + return Value(b->create(loc, values)); + } + TF_ASSIGN_OR_RETURN(Value memref, + GetOrCreateArrayView(root, shape, *shape_index)); + auto load = b->create(loc, memref); + if (shape.layout() != + xla::LayoutUtil::MakeDescendingLayout(shape.dimensions().size())) { + llvm::SmallVector minor_to_major( + shape.layout().minor_to_major().begin(), + shape.layout().minor_to_major().end()); + load.setAttr("minor_to_major", b->getIndexTensorAttr(minor_to_major)); + } + return load.getResult(); +} + +StatusOr LhloDialectEmitter::EmitFusionOp( + HloInstruction* instr) { + Location loc = getLocation(instr); + + auto* fusion_instr = ::xla::Cast<::xla::HloFusionInstruction>(instr); + + auto fusion = builder_.create(getLocation(instr), + ArrayRef{}); + auto after_fusion = builder_.saveInsertionPoint(); + builder_ = mlir::OpBuilder(fusion); + + auto region_builder = OpBuilder::atBlockBegin(&fusion.region().front()); + + llvm::SmallVector arguments; + for (int i = 0; i < instr->operands().size(); i++) { + const HloInstruction* operand = instr->operand(i); + xla::ShapeIndex shape_index; + TF_ASSIGN_OR_RETURN( + auto arg, RewriteFusionOperand(operand, operand->shape(), &shape_index, + ®ion_builder, loc)); + arguments.push_back(arg); + } + + TF_ASSIGN_OR_RETURN(Value result, + ::xla::HloFunctionImporter::ImportInstructions( + *fusion_instr->fused_instructions_computation(), + arguments, ®ion_builder)); + + { + int i = 0; + llvm::SmallVector output; + TF_RETURN_IF_ERROR(GetOrCreateView(instr, &output)); + TF_RETURN_IF_ERROR(WalkTuplePostOrder(result, [&](Value v) mutable { + region_builder.create(loc, v, output[i++]); + return Status::OK(); + })); + if (i != output.size()) { + return ::xla::InternalError("output sizes don't match"); + } + } + + // Fold GTE/Tuple pairs. + // + // Since the fused region refers to values in its parent region, we can't + // call applyPatternAndFoldGreedily. We optimize it manually. + // + // Only walk once, because post-ordering is exactly what we need for GTE + // optimizations. + fusion.region().walk([](mhlo::GetTupleElementOp gte) { + SmallVector folded_values; + if (succeeded(OpBuilder(gte).tryFold(gte, folded_values))) { + gte.replaceAllUsesWith(folded_values[0]); + } + }); + + // Effectively a DCE on the region. + { + llvm::SmallVector ops; + fusion.region().walk([&](mlir::Operation* op) { ops.push_back(op); }); + // Visit the user first. 
+ std::reverse(ops.begin(), ops.end()); + for (auto op : ops) { + if (isOpTriviallyDead(op)) op->erase(); + } + } + + builder_.restoreInsertionPoint(after_fusion); + return fusion; +} + +Status LhloDialectEmitter::HandleFusion(HloInstruction* instr) { + return EmitFusionOp(instr).status(); +} + +StatusOr LhloDialectEmitter::GetOrCreateArrayView( + const ::xla::HloInstruction* instr, const ::xla::Shape& current_shape, + const ::xla::ShapeIndex& shape_index) { TF_ASSIGN_OR_RETURN(Type out_type, ::xla::ConvertShapeToType( current_shape, builder_)); TF_ASSIGN_OR_RETURN(BufferAllocation::Slice slice, - assignment_.GetUniqueSlice(instr, *current_shape_index)); + assignment_.GetUniqueSlice(instr, shape_index)); Value alloc = allocations_[slice.allocation()]; - if (alloc.getType() == out_type) { - values->push_back(alloc); - return Status::OK(); + if (alloc.getType() == out_type && slice.offset() == 0) { + return alloc; } auto out_memref_type = out_type.dyn_cast(); @@ -304,6 +417,13 @@ Status LhloDialectEmitter::CreateView(const HloInstruction* instr, return tensorflow::errors::Internal( "Expected memref type when creating a view for leaf type of a tuple."); + // Cache generated ViewOp and StaticMemRefCastOp by (instruction, + // shape_index). + auto& cached_value = slices_[std::make_pair(instr, shape_index)]; + if (cached_value) { + return cached_value; + } + Value byte_shift = builder_.create(alloc.getLoc(), slice.offset()); @@ -327,7 +447,24 @@ Status LhloDialectEmitter::CreateView(const HloInstruction* instr, if (physical_out_type != out_type) result = builder_.create(loc, out_memref_type, result); - values->push_back(result); + return cached_value = result; +} + +Status LhloDialectEmitter::GetOrCreateViewImpl( + const HloInstruction* instr, const Shape& current_shape, + ::xla::ShapeIndex* current_shape_index, SmallVectorImpl* values) { + if (current_shape.IsTuple()) { + for (int i = 0; i < current_shape.tuple_shapes().size(); i++) { + current_shape_index->push_back(i); + TF_RETURN_IF_ERROR(GetOrCreateViewImpl( + instr, current_shape.tuple_shapes(i), current_shape_index, values)); + current_shape_index->pop_back(); + } + return Status::OK(); + } + TF_ASSIGN_OR_RETURN( + auto v, GetOrCreateArrayView(instr, current_shape, *current_shape_index)); + values->push_back(v); return Status::OK(); } @@ -336,25 +473,8 @@ Status LhloDialectEmitter::CreateView(const HloInstruction* instr, // create another view to adjust the slice for the shape of the instruction. Status LhloDialectEmitter::GetOrCreateView(const HloInstruction* instr, SmallVectorImpl* values) { - // Cache generated ViewOp and StaticMemRefCastOp by instruction. We could have - // gone fancier to do the following cacheing: - // %range = ViewOp(%allocation, %offset) : memref - // %typed_range = ViewOp(%range) : memref - // - // where %range is cached. This in theory gives easier time for alias - // analysis, since the identity of %range defines alias. However, - // %typed_range can't be cached, as different buffers with different types and - // shapes may still alias. Creating two ViewOps doesn't seem to worth the - // effort for a slightly easier aliasing, so we don't over optimize here. 
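With this change the emitter memoizes array views per (HLO instruction, shape index) pair instead of caching one vector of views per instruction. A rough Python sketch of that memoization pattern, with hypothetical names (the real cache is the slices_ map keyed by std::make_pair(instr, shape_index)):

# Sketch: memoize expensive view creation by a composite key, so repeated
# requests for the same (instruction, shape_index) reuse the first result.
_view_cache = {}


def get_or_create_view(instr_id, shape_index, create_fn):
    key = (instr_id, tuple(shape_index))
    cached = _view_cache.get(key)
    if cached is not None:
        return cached
    view = create_fn()
    _view_cache[key] = view
    return view


calls = []
v1 = get_or_create_view("fusion.3", [0, 1], lambda: calls.append(1) or "view0")
v2 = get_or_create_view("fusion.3", [0, 1], lambda: calls.append(1) or "view0")
assert v1 == v2 and len(calls) == 1  # the second lookup hits the cache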
- auto result = slices_.try_emplace(instr, llvm::SmallVector{}); - llvm::SmallVectorImpl& new_values = result.first->second; - if (result.second) { - ::xla::ShapeIndex shape_index; - TF_RETURN_IF_ERROR( - CreateView(instr, instr->shape(), &shape_index, &new_values)); - } - values->insert(values->end(), new_values.begin(), new_values.end()); - return Status::OK(); + ::xla::ShapeIndex shape_index; + return GetOrCreateViewImpl(instr, instr->shape(), &shape_index, values); } Status LhloDialectEmitter::Initialize() { @@ -373,7 +493,7 @@ Status LhloDialectEmitter::Initialize() { if (computation_.IsEntryComputation()) { // Sort the rather arbitrarily ordered allocations to match the input/output - // parameters. Specifically We want to sort buffer allocations in the + // parameters. Specifically we want to sort buffer allocations in the // following order: // * Parameters always order before non-parameters. // * Different parameters order by parameter number. @@ -436,8 +556,8 @@ Status LhloDialectEmitter::Initialize() { } } - FunctionType function_type = builder_.getFunctionType( - llvm::to_vector<8>(block->getArgumentTypes()), {}); + FunctionType function_type = + builder_.getFunctionType(block->getArgumentTypes(), {}); func_op.setType(function_type); func_op.setAllArgAttrs(args_attrs); diff --git a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h index 89514116254..a57db3cb67e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h +++ b/tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h @@ -43,6 +43,7 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { i8_type_(builder_.getIntegerType(8)) {} ::xla::StatusOr EmitSortOp(::xla::HloInstruction* instr); + ::xla::StatusOr EmitFusionOp(::xla::HloInstruction* instr); private: template @@ -57,21 +58,31 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { } tensorflow::Status HandleSort(::xla::HloInstruction* instr) final; + tensorflow::Status HandleFusion(::xla::HloInstruction* instr) final; // Helper function that recursively visits the tuple structure in // `current_shape`, and reconstruct a matching lmhlo::TupleOp. // Each leaf node is converted to an std.view op with corresponding offsets. // If no tuple presents, it simply returns a view of the buffer. - tensorflow::Status CreateView(const ::xla::HloInstruction* instr, - const ::xla::Shape& current_shape, - ::xla::ShapeIndex* current_shape_index, - SmallVectorImpl* values); + tensorflow::Status GetOrCreateViewImpl(const ::xla::HloInstruction* instr, + const ::xla::Shape& current_shape, + ::xla::ShapeIndex* current_shape_index, + SmallVectorImpl* values); // Helper function to create view/tuple of views to a buffer for a given // instruction result. tensorflow::Status GetOrCreateView(const ::xla::HloInstruction* instr, SmallVectorImpl* values); + ::xla::StatusOr GetOrCreateArrayView( + const ::xla::HloInstruction* instr, const ::xla::Shape& current_shape, + const ::xla::ShapeIndex& current_shape_index); + + ::xla::StatusOr RewriteFusionOperand(const ::xla::HloInstruction* root, + const ::xla::Shape& shape, + ::xla::ShapeIndex* shape_index, + OpBuilder* b, Location loc); + // Return an MLIR location for an HLO instruction. 
Location getLocation(::xla::HloInstruction* inst) { return NameLoc::get(builder_.getIdentifier(inst->name()), @@ -102,7 +113,8 @@ class LhloDialectEmitter : public ::xla::DfsHloVisitorWithDefault { // // `slices_` is populated lazily in the `GetOrCreateView()` helper as we // process every instruction. - llvm::DenseMap> + absl::flat_hash_map, + Value> slices_; // The BufferAssignment computed by XLA ahead of time. diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index b725f56b455..3822e10089b 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -102,8 +102,7 @@ Shape TypeToShape(mlir::Type type) { if (ptype != PrimitiveType::PRIMITIVE_TYPE_INVALID) return ShapeUtil::MakeShape(ptype, {}); - if (type.isBF16() || type.isF32() || type.isF64() || - type.isa()) { + if (type.isIntOrFloat()) { auto* context = type.getContext(); mlir::emitError(mlir::UnknownLoc::get(context)) << "lowering should have been handled by primitive type lowering for " @@ -140,7 +139,8 @@ Shape TypeToShape(mlir::Type type) { for (const auto& e : llvm::enumerate(strides)) { strides_with_indices.push_back({e.value(), e.index()}); } - std::sort(strides_with_indices.begin(), strides_with_indices.end()); + std::stable_sort(strides_with_indices.begin(), + strides_with_indices.end()); llvm::SmallVector minor_to_major; int64_t stride = 1; @@ -149,7 +149,7 @@ Shape TypeToShape(mlir::Type type) { // Either the affine map is not perfectly strided, or the dimensions // recovered from strides don't match the actual dimensions in shapes. - if (stride != pr.first) return {}; + if (stride != pr.first && m.getShape()[pr.second] != 1) return {}; stride *= m.getShape()[pr.second]; } diff --git a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc index a4a2bc42d99..97417748b64 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc @@ -196,5 +196,22 @@ TEST(TypeToShapeTest, ConvertMemRefToShape) { EXPECT_TRUE(ShapeUtil::Equal(converted, shape)); } +TEST(TypeToShapeTest, ConvertMemRefToShape2) { + Shape shape = ShapeUtil::MakeShapeWithLayout(PrimitiveType::C64, {2, 4, 3, 3}, + {2, 3, 1, 0}); + MLIRContext context; + mlir::Builder builder(&context); + + StatusOr mlir_type = + ConvertShapeToType(shape, builder); + ASSERT_TRUE(mlir_type.ok()); + mlir::Type type = mlir_type.ConsumeValueOrDie(); + Shape converted = TypeToShape(type); + EXPECT_TRUE(ShapeUtil::Equal( + converted, ShapeUtil::MakeShapeWithLayout(PrimitiveType::C64, + {2, 4, 3, 3}, {2, 3, 1, 0}))); + EXPECT_TRUE(ShapeUtil::Equal(converted, shape)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index d5c598615b7..3ee70db1813 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/xla/hlo_to_mlir_hlo.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" #include "tensorflow/compiler/mlir/xla/transforms/mhlo_to_lhlo_with_xla.h" +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" #include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" @@ -33,19 +34,6 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/protobuf.h" -// NOLINTNEXTLINE -static llvm::cl::opt emit_use_tuple_arg( - "emit-use-tuple-args", - llvm::cl::desc( - "Emit HLO modules using tuples as args for the entry computation"), - llvm::cl::init(false)); - -// NOLINTNEXTLINE -static llvm::cl::opt emit_return_tuple( - "emit-return-tuple", - llvm::cl::desc("Emit HLO modules with entry computations returning tuple"), - llvm::cl::init(false)); - namespace xla { namespace { @@ -136,13 +124,16 @@ static StatusOr> HloModuleFromProto( return HloModule::CreateFromProto(module_proto, module_config); } -static mlir::LogicalResult MlirHloToHloTextTranslateFunction( - mlir::ModuleOp module, llvm::raw_ostream& output) { +static mlir::LogicalResult MlirHloToHloTextTranslateFunctionImpl( + mlir::ModuleOp module, llvm::raw_ostream& output, bool with_layouts) { if (!module) return mlir::failure(); HloProto hloProto; + mlir::MlirToHloConversionOptions options; + options.propagate_layouts = with_layouts; Status status = mlir::ConvertMlirHloToHlo( - module, &hloProto, emit_use_tuple_arg, emit_return_tuple); + module, &hloProto, emit_use_tuple_arg, emit_return_tuple, + /*shape_representation_fn=*/nullptr, options); if (!status.ok()) { LOG(ERROR) << "Module conversion failed: " << status; return mlir::failure(); @@ -158,9 +149,8 @@ static mlir::LogicalResult MlirHloToHloTextTranslateFunction( HloModule* hlo_module = statusOrHloModule.ValueOrDie().get(); - // We don't interpret or use layouts output << hlo_module->ToString( - HloPrintOptions().set_include_layout_in_shapes(false)); + HloPrintOptions().set_include_layout_in_shapes(with_layouts)); // Output alias information as comments in the HLO text. 
hlo_module->input_output_alias_config().ForEachAlias( @@ -174,6 +164,18 @@ static mlir::LogicalResult MlirHloToHloTextTranslateFunction( return mlir::success(); } +static mlir::LogicalResult MlirHloToHloTextTranslateFunction( + mlir::ModuleOp module, llvm::raw_ostream& output) { + return MlirHloToHloTextTranslateFunctionImpl(module, output, + /*with_layouts=*/false); +} + +static mlir::LogicalResult MlirHloToHloTextWithLayoutsTranslateFunction( + mlir::ModuleOp module, llvm::raw_ostream& output) { + return MlirHloToHloTextTranslateFunctionImpl(module, output, + /*with_layouts=*/true); +} + } // namespace xla static void RegisterInputDialects(mlir::DialectRegistry& registry) { @@ -188,6 +190,10 @@ static mlir::TranslateFromMLIRRegistration MlirHloToHloTextTranslate( "mlir-hlo-to-hlo-text", xla::MlirHloToHloTextTranslateFunction, RegisterInputDialects); +static mlir::TranslateFromMLIRRegistration MlirHloToHloTextWithLayoutsTranslate( + "mlir-hlo-to-hlo-text-with-layouts", + xla::MlirHloToHloTextWithLayoutsTranslateFunction, RegisterInputDialects); + static mlir::TranslateToMLIRRegistration HloToHloMlirTranslate( "hlo-to-mlir-hlo", xla::HloToMlirHloTranslateFunction); diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.cc new file mode 100644 index 00000000000..7eb1fb40f5e --- /dev/null +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.cc @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h" + +// NOLINTNEXTLINE +llvm::cl::opt emit_use_tuple_arg( + "emit-use-tuple-args", + llvm::cl::desc( + "Emit HLO modules using tuples as args for the entry computation"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +llvm::cl::opt emit_return_tuple( + "emit-return-tuple", + llvm::cl::desc("Emit HLO modules with entry computations returning tuple"), + llvm::cl::init(false)); + +// NOLINTNEXTLINE +llvm::cl::opt optimize_xla_hlo( + "optimize-xla-hlo", + llvm::cl::desc("Enable optimizations when translating XLA HLO -> LHLO"), + llvm::cl::init(true)); diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h new file mode 100644 index 00000000000..14a2878dff8 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate_cl.h @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_XLA_XLA_MLIR_TRANSLATE_CL_H_ +#define TENSORFLOW_COMPILER_MLIR_XLA_XLA_MLIR_TRANSLATE_CL_H_ + +#include "llvm/Support/CommandLine.h" + +// This file contains command-line options aimed to provide the parameters +// required by the MLIR module to XLA HLO conversion. It is only intended to be +// included by binaries. + +extern llvm::cl::opt emit_use_tuple_arg; +extern llvm::cl::opt emit_return_tuple; +extern llvm::cl::opt optimize_xla_hlo; + +#endif // TENSORFLOW_COMPILER_MLIR_XLA_XLA_MLIR_TRANSLATE_CL_H_ diff --git a/tensorflow/compiler/plugin/BUILD b/tensorflow/compiler/plugin/BUILD index c2ba5cb3ecd..dc1c2391e94 100644 --- a/tensorflow/compiler/plugin/BUILD +++ b/tensorflow/compiler/plugin/BUILD @@ -13,6 +13,8 @@ # limitations under the License. # ============================================================================== +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + """Configuration file for an XLA plugin. please don't check in changes to this file. to prevent changes appearing diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 30b8a7e5561..eb0cde57591 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test") # buildifier: disable=same-origin-load load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") @@ -326,7 +327,6 @@ tf_xla_py_test( name = "self_adjoint_eig_op_test", size = "medium", srcs = ["self_adjoint_eig_op_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -392,7 +392,6 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_inverse_op_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -415,7 +414,6 @@ tf_xla_py_test( size = "small", timeout = "moderate", srcs = ["matrix_solve_op_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -638,7 +636,6 @@ tf_xla_py_test( name = "extract_image_patches_op_test", size = "small", srcs = ["extract_image_patches_op_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -695,7 +692,6 @@ tf_xla_py_test( name = "fft_test", size = "medium", srcs = ["fft_test.py"], - enable_mlir_bridge = True, python_version = "PY3", shard_count = 6, tags = [ @@ -757,6 +753,7 @@ tf_xla_py_test( python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "notsan", # TODO(b/171000704): data race ], deps = [ ":xla_test", @@ -1017,7 +1014,6 @@ tf_xla_py_test( "cpu", "cpu_ondemand", ], - enable_mlir_bridge = True, python_version = "PY3", shard_count = 5, tags = [ @@ -1280,7 +1276,9 @@ tf_xla_py_test( python_version = "PY3", shard_count = 10, tags = [ + "no_oss", # b/170479349 "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + "notap", # b/170479349 "optonly", ], deps = [ @@ 
-1350,7 +1348,6 @@ tf_xla_py_test( python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip - "notap", # b/162025277 ], deps = [ ":xla_test", @@ -1388,7 +1385,6 @@ tf_xla_py_test( name = "unary_ops_test", size = "medium", srcs = ["unary_ops_test.py"], - enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip @@ -1483,6 +1479,7 @@ tf_xla_py_test( "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:array_ops", "//tensorflow/python:framework", + "//tensorflow/python:image_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", ], diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 07a41d67520..59c8c544347 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -474,7 +474,6 @@ class BinaryOpsTest(xla_test.XLATestCase): expected=np.array([1 << 32, 1 << 36, 1 << 32, 1 << 36], dtype=np.int64)) - @test_util.disable_mlir_bridge("Enable tf.NextAfter Compilation") def testNextAfter(self): for dtype in self.numeric_types: if dtype in [np.float32, np.float64]: diff --git a/tensorflow/compiler/tests/cholesky_op_test.py b/tensorflow/compiler/tests/cholesky_op_test.py index 4bd2dfd9244..41877d39381 100644 --- a/tensorflow/compiler/tests/cholesky_op_test.py +++ b/tensorflow/compiler/tests/cholesky_op_test.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -61,7 +60,7 @@ class CholeskyOpTest(xla_test.XLATestCase): dtypes.as_dtype(x.dtype), shape=x.shape) with self.test_scope(): chol = linalg_ops.cholesky(placeholder) - verification = math_ops.matmul(chol, chol, adjoint_b=True) + verification = test_util.matmul_without_tf32(chol, chol, adjoint_b=True) self._verifyCholeskyBase(sess, placeholder, x, chol, verification, atol) def testBasic(self): diff --git a/tensorflow/compiler/tests/data_format_ops_test.py b/tensorflow/compiler/tests/data_format_ops_test.py index 08d44256b50..ca833326a50 100644 --- a/tensorflow/compiler/tests/data_format_ops_test.py +++ b/tensorflow/compiler/tests/data_format_ops_test.py @@ -63,6 +63,22 @@ class XlaDataFormatDimMapTest(xla_test.XLATestCase): self._test([-4, -3, -2, -1, 0, 1, 2, 3], "qwer", "rewq", [3, 2, 1, 0, 3, 2, 1, 0]) + self._test(0, "NDHWC", "NCDHW", 0) + self._test(1, "NDHWC", "NCDHW", 2) + self._test(2, "NDHWC", "NCDHW", 3) + self._test(3, "NDHWC", "NCDHW", 4) + self._test(4, "NDHWC", "NCDHW", 1) + self._test([1, 4], "NDHWC", "NCDHW", [2, 1]) + self._test([1, 4, -2], "NDHWC", "NCDHW", [2, 1, 4]) + self._test([1, -3, -2], "NDHWC", "NCDHW", [2, 3, 4]) + self._test([[1, -4], [1, -1]], "NDHWC", "NCDHW", [[2, 2], [2, 1]]) + + self._test([1, -3, -2], "NDHWC", "NCDHW", [2, 3, 4]) + self._test([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4], "NDHWC", "DHWNC", + [3, 0, 1, 2, 4, 3, 0, 1, 2, 4]) + self._test([-5, -4, -3, -2, -1, 0, 1, 2, 3, 4], "NDHWC", "WHDCN", + [4, 2, 1, 0, 3, 4, 2, 1, 0, 3]) + class XlaPermuteOpTest(xla_test.XLATestCase): diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py index 9d278cfbb28..08aad66abe1 100644 --- 
a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py +++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py @@ -29,7 +29,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -65,7 +64,8 @@ class MatrixTriangularSolveOpTest(xla_test.XLATestCase): with self.test_scope(): x = linalg_ops.matrix_triangular_solve( placeholder_a, placeholder_b, lower=lower, adjoint=adjoint) - verification = math_ops.matmul(placeholder_ca, x, adjoint_a=adjoint) + verification = test_util.matmul_without_tf32( + placeholder_ca, x, adjoint_a=adjoint) self._VerifyTriangularSolveBase(sess, placeholder_a, placeholder_ca, placeholder_b, a, clean_a, b, verification, atol) diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py index 5fcf254db82..de318e9dfde 100644 --- a/tensorflow/compiler/tests/qr_op_test.py +++ b/tensorflow/compiler/tests/qr_op_test.py @@ -24,12 +24,18 @@ from absl.testing import parameterized import numpy as np from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +@test_util.run_all_without_tensor_float_32( + "XLA QR op calls matmul. Also, matmul used for verification. Also with " + 'TensorFloat-32, mysterious "Unable to launch cuBLAS gemm" error ' + "occasionally occurs") +# TODO(b/165435566): Fix "Unable to launch cuBLAS gemm" error class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): def AdjustedNorm(self, x): @@ -64,16 +70,26 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): xx = math_ops.matmul(x, x, adjoint_a=True) identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) precision = self.AdjustedNorm(xx.eval() - self.evaluate(identity)) - self.assertTrue(np.all(precision < 5.0)) + self.assertTrue(np.all(precision < 6.0)) - def _test(self, dtype, shape, full_matrices): + def _random_matrix(self, dtype, shape): np.random.seed(1) - x_np = np.random.uniform( - low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype) + def rng(): + return np.random.uniform( + low=-1.0, high=1.0, size=np.prod(shape)).reshape(shape).astype(dtype) + + x_np = rng() + if np.issubdtype(dtype, np.complexfloating): + x_np += rng() * dtype(1j) + return x_np + + def _test(self, x_np, full_matrices, full_rank=True): + dtype = x_np.dtype + shape = x_np.shape with self.session() as sess: x_tf = array_ops.placeholder(dtype) - with self.test_scope(): + with self.device_scope(): q_tf, r_tf = linalg_ops.qr(x_tf, full_matrices=full_matrices) q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) @@ -91,24 +107,39 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase): np_q_reshape[i, :, :], _ = np.linalg.qr( x_reshape[i, :, :], mode="reduced") np_q = np.reshape(np_q_reshape, q_dims) - self.CompareOrthogonal(np_q, q_tf_val, min(shape[-2:])) + if full_rank: + # Q is unique up to sign/phase if the matrix is full-rank. 
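On the "unique up to sign/phase" comment above: for a matrix of full column rank, two QR factorizations can differ only by a per-column sign (real case) or unit phase (complex case), which is why CompareOrthogonal is skipped for the deliberately rank-deficient inputs added below. A small NumPy illustration of the real case (illustrative only, not part of the test):

# Sketch: two valid QR factorizations of a full-rank matrix differ only by
# per-column signs; normalizing against R's diagonal recovers a unique Q.
import numpy as np

np.random.seed(0)
a = np.random.randn(5, 3)
q, r = np.linalg.qr(a)

# Flip the sign of one column of Q and the matching row of R: still a
# valid factorization of the same matrix.
q2, r2 = q.copy(), r.copy()
q2[:, 1] *= -1.0
r2[1, :] *= -1.0
assert np.allclose(q2 @ r2, a)

def normalize(q_mat, r_mat):
    # Scale each column so the corresponding diagonal entry of R is positive.
    phases = np.sign(np.diag(r_mat))
    return q_mat * phases, (r_mat.T * phases).T

assert np.allclose(normalize(q, r)[0], normalize(q2, r2)[0])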
+ self.CompareOrthogonal(np_q, q_tf_val, min(shape[-2:])) self.CheckApproximation(x_np, q_tf_val, r_tf_val) self.CheckUnitary(q_tf_val) - SIZES = [1, 2, 5, 10, 32, 100, 300] - DTYPES = [np.float32] + SIZES = [1, 2, 5, 10, 32, 100, 300, 603] + DTYPES = [np.float32, np.complex64] PARAMS = itertools.product(SIZES, SIZES, DTYPES) @parameterized.parameters(*PARAMS) def testQR(self, rows, cols, dtype): - # TODO(b/111317468): Test other types. for full_matrices in [True, False]: # Only tests the (3, 2) case for small numbers of rows/columns. for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): - self._test(dtype, batch_dims + (rows, cols), full_matrices) + x_np = self._random_matrix(dtype, batch_dims + (rows, cols)) + self._test(x_np, full_matrices) def testLarge2000x2000(self): - self._test(np.float32, (2000, 2000), full_matrices=True) + x_np = self._random_matrix(np.float32, (2000, 2000)) + self._test(x_np, full_matrices=True) + + @parameterized.parameters((23, 25), (513, 23)) + def testZeroColumn(self, rows, cols): + x_np = self._random_matrix(np.complex64, (rows, cols)) + x_np[:, 7] = 0. + self._test(x_np, full_matrices=True) + + @parameterized.parameters((4, 4), (514, 20)) + def testRepeatedColumn(self, rows, cols): + x_np = self._random_matrix(np.complex64, (rows, cols)) + x_np[:, 1] = x_np[:, 2] + self._test(x_np, full_matrices=True, full_rank=False) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/special_math_test.py b/tensorflow/compiler/tests/special_math_test.py index bd105bb5e95..5e7f8763743 100644 --- a/tensorflow/compiler/tests/special_math_test.py +++ b/tensorflow/compiler/tests/special_math_test.py @@ -31,6 +31,7 @@ import six from tensorflow.compiler.tests import xla_test from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op +from tensorflow.python.framework import test_util from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_random_ops from tensorflow.python.ops import gradient_checker_v2 @@ -54,6 +55,16 @@ def _igammac(a, x): return math_ops.igammac(a, x) +@def_function.function(experimental_compile=True) +def _polygamma(n, x): + return math_ops.polygamma(n, x) + + +@def_function.function(experimental_compile=True) +def _zeta(a, q): + return math_ops.zeta(a, q) + + # This is df/da / df/dx, where f = igamma. def implicit_reparameterization_grad(a, x): log_prob = math_ops.xlogy(a - 1., x) - math_ops.lgamma(a) - x @@ -136,6 +147,208 @@ class Log1pTest(xla_test.XLATestCase, parameterized.TestCase): self._test_range(0., 3., dtype, rtol, atol, is_negative=False) +class ZetaTest(xla_test.XLATestCase, parameterized.TestCase): + + def setUp(self): + if flags.FLAGS.vary_seed: + entropy = os.urandom(64) + if six.PY2: + answer = int(entropy.encode('hex'), 16) + else: + answer = int.from_bytes(entropy, 'big') + np.random.seed(answer % (2**32 - 1)) + super(ZetaTest, self).setUp() + + def adjust_tolerance_for_tpu(self, dtype, rtol, atol): + if self.device not in ['TPU']: + return rtol, atol + + if dtype == np.float32: + return 2e-2, 1e-7 + return 2e-4, 1e-20 + + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testBadValues(self): + q = np.random.uniform(low=0.3, high=20., size=[10]) + with self.session() as sess: + with self.test_scope(): + y = _zeta(np.float64(1.), q) + actual = sess.run(y) + # When x == 1, this is the Harmonic series. 
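For reference on the ZetaTest expectations above: the Hurwitz zeta zeta(x, q) diverges at x == 1 (a shifted harmonic series) and is undefined for x < 1, and SciPy's Cephes-backed implementation reports inf and nan accordingly. A quick check, assuming scipy is available (this mirrors, but is not part of, the test):

# Sketch: the SciPy reference behavior the bad-value cases compare against.
import numpy as np
import scipy.special as sps

q = np.array([0.5, 2.0, 7.3])
assert np.all(np.isinf(sps.zeta(1.0, q)))  # x == 1: the series diverges
assert np.all(np.isnan(sps.zeta(0.1, q)))  # x < 1: Hurwitz zeta is undefined
assert np.isfinite(sps.zeta(1.1, 10.0))    # ordinary convergent case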
+ self.assertTrue(np.all(np.isinf(actual))) + + with self.session() as sess: + with self.test_scope(): + y = _zeta(np.float64(0.1), q) + actual = sess.run(y) + # When x < 1, this is undefined. + self.assertTrue(np.all(np.isnan(actual))) + + with self.session() as sess: + with self.test_scope(): + y = _zeta([1., 1.1], [-1.1, -1.]) + actual = sess.run(y) + + # When q is negative, zeta is not defined + # if q is an integer or x is not an integer. + self.assertTrue(np.all(np.isinf(actual))) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testLargeXSmallQ(self, dtype, rtol, atol): + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + if self.device not in ['XLA_GPU', 'XLA_CPU'] and dtype == np.float64: + # TODO(b/165739664): Figure out why on TPU F64 Zeta sometimes returns + # infs. + self.skipTest( + 'Skipping test because some F64 operations are numerically ' + 'unstable on TPU.') + + x = np.random.uniform(low=100., high=200., size=[NUM_SAMPLES]).astype(dtype) + q = np.random.uniform(low=0.3, high=1., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.zeta(x, q) + with self.session() as sess: + with self.test_scope(): + y = _zeta(x, q) + actual = sess.run(y) + + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testSmallValues(self, dtype, rtol, atol): + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + # Test values near zero. + x = np.random.uniform(low=1.1, high=10., size=[NUM_SAMPLES]).astype(dtype) + q = np.random.uniform( + low=np.finfo(dtype).tiny, high=1., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.zeta(x, q) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_zeta(x, q)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testMediumValues(self, dtype, rtol, atol): + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + x = np.random.uniform(low=1.1, high=100., size=[NUM_SAMPLES]).astype(dtype) + q = np.random.uniform(low=1., high=1e1, size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.zeta(x, q) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_zeta(x, q)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 2e-2, 1e-5), (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testLargeValues(self, dtype, rtol, atol): + x = np.random.uniform( + low=100., high=int(1e3), size=[NUM_SAMPLES]).astype(dtype) + q = np.random.uniform( + low=1., high=int(1e1), size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.zeta(x, q) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_zeta(x, q)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + +class PolygammaTest(xla_test.XLATestCase, parameterized.TestCase): + + def setUp(self): + if flags.FLAGS.vary_seed: + entropy = os.urandom(64) + if six.PY2: + answer = int(entropy.encode('hex'), 16) + else: + answer = int.from_bytes(entropy, 'big') + np.random.seed(answer % (2**32 - 1)) + 
super(PolygammaTest, self).setUp() + + def adjust_tolerance_for_tpu(self, dtype, rtol, atol): + if self.device not in ['TPU']: + return rtol, atol + + if dtype == np.float32: + return 2e-2, 1e-7 + return 2e-4, 1e-20 + + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testBadValues(self): + x = np.random.uniform(low=0.3, high=20., size=[10]) + with self.session() as sess: + with self.test_scope(): + y = _polygamma(np.float64(-1.), x) + actual = sess.run(y) + # Not defined for negative numbers. + self.assertTrue(np.all(np.isnan(actual))) + + with self.session() as sess: + with self.test_scope(): + y = _polygamma(np.float64(0.1), x) + actual = sess.run(y) + # Not defined for non-integers. + self.assertTrue(np.all(np.isnan(actual))) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testRecoverDigamma(self, dtype, rtol, atol): + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + if self.device not in ['XLA_GPU', 'XLA_CPU'] and dtype == np.float64: + self.skipTest( + 'Skipping test because some F64 operations are ' + 'numerically unstable on TPU.' + ) + + x = np.random.uniform(low=0.1, high=50., size=[NUM_SAMPLES]).astype(dtype) + expected_values = sps.digamma(x) + with self.session() as sess: + with self.test_scope(): + y = _polygamma(dtype(0.), x) + actual = sess.run(y) + + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testSmallN(self, dtype, rtol, atol): + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + # Test values near zero. 
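As background for testRecoverDigamma above: polygamma of order 0 is exactly the digamma function, and higher orders are its successive derivatives. A short SciPy sanity check, assuming scipy is available (illustrative, separate from the XLA kernel under test):

# Sketch: polygamma(0, x) == digamma(x); order 1 is the trigamma function.
import numpy as np
import scipy.special as sps

x = np.linspace(0.1, 50.0, 7)
assert np.allclose(sps.polygamma(0, x), sps.digamma(x))
# Trigamma at 1 is the classic pi^2 / 6.
assert np.allclose(sps.polygamma(1, 1.0), np.pi**2 / 6.0)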
+ n = np.random.randint(low=1, high=5, size=[NUM_SAMPLES]).astype(dtype) + x = np.random.uniform( + low=np.finfo(dtype).tiny, high=1., size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.polygamma(n, x) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_polygamma(n, x)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + @parameterized.parameters((np.float32, 1e-2, 1e-11), + (np.float64, 1e-4, 1e-30)) + @test_util.disable_mlir_bridge('TODO(b/165736950): Add support in MLIR') + def testMediumLargeN(self, dtype, rtol, atol): + rtol, atol = self.adjust_tolerance_for_tpu(dtype, rtol, atol) + n = np.random.randint(low=5, high=10, size=[NUM_SAMPLES]).astype(dtype) + x = np.random.uniform(low=1., high=1e1, size=[NUM_SAMPLES]).astype(dtype) + + expected_values = sps.polygamma(n, x) + with self.session() as sess: + with self.test_scope(): + actual = sess.run(_polygamma(n, x)) + self.assertAllClose(expected_values, actual, atol=atol, rtol=rtol) + + class IgammaTest(xla_test.XLATestCase, parameterized.TestCase): def setUp(self): diff --git a/tensorflow/compiler/tests/stateful_random_ops_test.py b/tensorflow/compiler/tests/stateful_random_ops_test.py index 343969c40d7..239b99de19e 100644 --- a/tensorflow/compiler/tests/stateful_random_ops_test.py +++ b/tensorflow/compiler/tests/stateful_random_ops_test.py @@ -25,7 +25,9 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.client import device_lib +from tensorflow.python.compat import compat from tensorflow.python.eager import def_function +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops @@ -156,6 +158,10 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): def testNewStateThreeFry(self): """Tests that the new state is correct (for ThreeFry). """ + if compat.forward_compatible(2020, 10, 25): + self.skipTest("The expected values in this test is inconsistent with " + "CPU/GPU. testXLAEqualsCPU has the correct checks of the " + "new states for the new version.") with ops.device(xla_device_name()): counter = 57 key = 0x1234 @@ -171,6 +177,10 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): def testNewStatePhilox(self): """Tests that the new state is correct (for Philox). """ + if compat.forward_compatible(2020, 10, 25): + self.skipTest("The expected values in this test is inconsistent with " + "CPU/GPU. 
testXLAEqualsCPU has the correct checks of the " + "new states for the new version.") with ops.device(xla_device_name()): counter_low = 57 counter_high = 283 @@ -204,13 +214,39 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): """Tests that XLA and CPU kernels generate the same integers.""" seed = 1234 shape = [315, 49] - with ops.device("/device:CPU:0"): - cpu = (random.Generator.from_seed(seed=seed, alg=random.RNG_ALG_PHILOX) - .uniform_full_int(shape=shape, dtype=dtype)) - with ops.device(xla_device_name()): - xla = (random.Generator.from_seed(seed=seed, alg=random.RNG_ALG_PHILOX) - .uniform_full_int(shape=shape, dtype=dtype)) - self.assertAllEqual(cpu, xla) + if compat.forward_compatible(2020, 10, 25): + with ops.device("/device:CPU:0"): + cpu_gen = random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX) + with ops.device(xla_device_name()): + xla_gen = random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX) + # Repeat multiple times to make sure that the state after + # number-generation are the same between CPU and XLA. + for _ in range(5): + with ops.device("/device:CPU:0"): + # Test both number-generation and skip + cpu = cpu_gen.uniform_full_int(shape=shape, dtype=dtype) + cpu_gen.skip(100) + with ops.device(xla_device_name()): + xla = xla_gen.uniform_full_int(shape=shape, dtype=dtype) + xla_gen.skip(100) + self.assertAllEqual(cpu, xla) + self.assertAllEqual(cpu_gen.state, xla_gen.state) + else: + # The old version doesn't guarantee that CPU and XLA are in the same state + # after number-generation, which is a bug. + with ops.device("/device:CPU:0"): + cpu = ( + random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX).uniform_full_int( + shape=shape, dtype=dtype)) + with ops.device(xla_device_name()): + xla = ( + random.Generator.from_seed( + seed=seed, alg=random.RNG_ALG_PHILOX).uniform_full_int( + shape=shape, dtype=dtype)) + self.assertAllEqual(cpu, xla) def _testRngIsNotConstant(self, rng, dtype): # Tests that 'rng' does not always return the same value. @@ -364,4 +400,5 @@ class StatefulRandomOpsTest(xla_test.XLATestCase, parameterized.TestCase): if __name__ == "__main__": ops.enable_eager_execution() + config.set_soft_device_placement(False) test.main() diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index f9d792806b0..23e827f18e8 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -21,7 +21,11 @@ from __future__ import print_function import numpy as np from tensorflow.compiler.tests import xla_test +from tensorflow.python.compiler.xla import xla +from tensorflow.python.eager import def_function +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.kernel_tests.random import util as \ random_test_util from tensorflow.python.ops import array_ops @@ -39,6 +43,26 @@ class StatelessRandomOpsTest(xla_test.XLATestCase): allowed_types.update({dtypes.int32, dtypes.int64}) return self.all_tf_types & allowed_types + @test_util.run_v2_only + def testForcedCompile(self): + """Tests whole-function forced-compilation. + + This test checks that stateless_random_* can be used in forced-compilation + scenarios (e.g. TPU). 
The new version of stateless_random_* requires the + intermediate tensor `alg` to be compile-time constant, so we need to check + that this requirement is met. We use xla.compile instead of tf.function's + experimental_compile because the latter doesn't throw an error even if the + compile-time-constant constraint is not met. + """ + if config.list_logical_devices('TPU'): + self.skipTest('To accommodate OSS, xla.compile support for TPU is not ' + 'linked in.') + @def_function.function + def f(x): + return xla.compile( + lambda x: stateless.stateless_random_normal([], seed=x), [x]) + f([1, 2]) + def testDeterminism(self): # Stateless values should be equal iff the seeds are equal (roughly) with self.session(), self.test_scope(): @@ -138,7 +162,7 @@ class StatelessRandomOpsBenchmark(test.Benchmark): def _benchmarkUniform(self, name, dtype, use_xla_jit): - def BuilderFn(): + def builder_fn(): shape = (10, 1000, 1000) seed_var = variables.Variable((312, 456), dtype=dtypes.int32, @@ -147,7 +171,7 @@ class StatelessRandomOpsBenchmark(test.Benchmark): shape, seed=seed_var, dtype=dtype) return '%s.shape%s' % (name, shape), [random_t] - xla_test.Benchmark(self, BuilderFn, use_xla_jit=use_xla_jit, device='cpu') + xla_test.Benchmark(self, builder_fn, use_xla_jit=use_xla_jit, device='cpu') def benchmarkUniformF32(self): self._benchmarkUniform( @@ -167,4 +191,5 @@ class StatelessRandomOpsBenchmark(test.Benchmark): if __name__ == '__main__': + config.set_soft_device_placement(False) test.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index b5f82bcff12..f3f6fa8ae52 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -542,7 +542,7 @@ class UnaryOpsTest(xla_test.XLATestCase): for dtype in self.float_types: def quantize_and_dequantize_v2(x): - return array_ops.quantize_and_dequantize_v2( + return array_ops.quantize_and_dequantize( x, -127, 127, signed_input=True, num_bits=8) self._assertOpOutputMatchesExpected( @@ -551,7 +551,7 @@ class UnaryOpsTest(xla_test.XLATestCase): expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) def quantize_and_dequantize_v2_round_half_up(x): - return array_ops.quantize_and_dequantize_v2( + return array_ops.quantize_and_dequantize( x, -1, 1.0, @@ -575,7 +575,7 @@ class UnaryOpsTest(xla_test.XLATestCase): dtype=dtype)) def quantize_and_dequantize_v2_round_half_to_even(x): - return array_ops.quantize_and_dequantize_v2( + return array_ops.quantize_and_dequantize( x, -1.0, 1.0, diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 3e9f5e8c5dd..b80b6263992 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -18,12 +18,15 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from absl.testing import parameterized import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.compiler.tf2xla.python import xla from tensorflow.compiler.xla import xla_data_pb2 +from tensorflow.python.eager import def_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import function @@ -299,6 +302,78 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),), expected=np.array([0, 45, 120, 231], dtype=dtype)) + 
@test_util.disable_mlir_bridge('Not supported yet') + def testVariadicReduce(self): + for dtype in set(self.numeric_types).intersection( + set([np.float32, np.complex64])): + + @def_function.function + def kahan_sum_reducer(t0, t1): + (s0, c0), (s1, c1) = t0, t1 + s0minusc = s0 - (c0 + c1) + t = s1 + s0minusc + c = (t - s1) - s0minusc + s = t + return s, c + + def kahan_sum_reduction(dims, output_idx): + + def fn(x): + arg = array_ops.zeros([], dtype) # pylint: disable=cell-var-from-loop + reducer = kahan_sum_reducer.get_concrete_function( + (arg, arg), (arg, arg)) + + return xla.variadic_reduce( + (x, array_ops.zeros_like(x)), + init_value=(arg, arg), + dimensions_to_reduce=dims, + reducer=reducer)[output_idx] + + return fn + + xs = np.array([1e5, np.pi, -1e5, np.exp(1.)]) + xs = np.array([xs, xs[::-1] / 3, xs / 7], dtype) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[], output_idx=0), + args=(xs,), expected=xs) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[], output_idx=1), + args=(xs,), expected=np.zeros_like(xs)) + shuffle_indices = np.argsort(np.random.randn(xs.shape[0])) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[0], output_idx=0), + args=(xs[shuffle_indices],), + expected=np.array([np.exp(1) / 3 + 1e5 * 8 / 7, + np.pi * 8 / 7 - 1e5 / 3, + -1e5 * 8 / 7 + np.pi / 3, + np.exp(1) * 8 / 7 + 1e5 / 3], dtype=dtype)) + error_term_equality = functools.partial(self.assertAllClose, atol=.005) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[0], output_idx=1), + args=(xs[shuffle_indices],), expected=np.zeros_like(xs[0]), + equality_fn=error_term_equality) + shuffle_indices = np.argsort(np.random.randn(xs.shape[1])) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[1], output_idx=0), + args=(xs[:, shuffle_indices],), + expected=np.array([np.pi + np.exp(1.), + (np.pi + np.exp(1.)) / 3, + (np.pi + np.exp(1.)) / 7], dtype=dtype)) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[1], output_idx=1), + args=(xs[:, shuffle_indices],), expected=np.zeros_like(xs[:, 0]), + equality_fn=error_term_equality) + # Now, shuffle both dims. + xs = xs[np.argsort(np.random.randn(xs.shape[0]))] + xs = xs[:, np.argsort(np.random.randn(xs.shape[1]))] + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[0, 1], output_idx=0), + args=(xs,), expected=dtype((np.pi + np.exp(1.)) * 31 / 21)) + self._assertOpOutputMatchesExpected( + kahan_sum_reduction(dims=[0, 1], output_idx=1), + args=(xs,), expected=dtype(0), + equality_fn=error_term_equality) + @test_util.disable_mlir_bridge('Not supported yet') def testSelectAndScatter(self): for dtype in set(self.numeric_types).intersection( diff --git a/tensorflow/compiler/tests/xla_test.py b/tensorflow/compiler/tests/xla_test.py index 8c31629c234..de97c6ff210 100644 --- a/tensorflow/compiler/tests/xla_test.py +++ b/tensorflow/compiler/tests/xla_test.py @@ -237,8 +237,8 @@ class XLATestCase(test.TestCase): 'test_session not supported on XLATestCase, please use session') @contextlib.contextmanager - def test_scope(self): - """Test scope that runs tests on `self.device`. + def device_scope(self): + """Scope that runs tests on `self.device`. Yields: A scope to apply to the operators under test. @@ -246,6 +246,15 @@ class XLATestCase(test.TestCase): with ops.device('device:{}:0'.format(self.device)): yield + def test_scope(self): + """Deprecated alias of `device_scope`. 
+ + This should be avoided as the name starts with `test`, so test runners + treat it as a test. This interferes with class decorators that operate on + each test method. + """ + return self.device_scope() + def Benchmark(tf_bench, builder_fn, diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 44fb5513886..a82c1c485b9 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -3,13 +3,13 @@ # and provide TensorRT operators and converter package. # APIs are meant to change over time. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_copts", "tf_cuda_library", "tf_custom_op_library_additional_deps", - "tf_gen_op_libs", "tf_gen_op_wrapper_py", ) @@ -21,6 +21,9 @@ load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "pybind_extension") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") load( "//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", @@ -33,8 +36,6 @@ package( licenses = ["notice"], # Apache 2.0 ) -exports_files(["LICENSE"]) - cc_library( name = "tensorrt_stub", srcs = if_tensorrt([ @@ -69,7 +70,7 @@ tf_cuda_cc_test( deps = [ "//tensorflow/core/common_runtime/gpu:gpu_init", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor", + "//tensorflow/core/platform:stream_executor", "//tensorflow/core:test", "//tensorflow/core:test_main", ] + if_tensorrt([ @@ -107,18 +108,20 @@ cc_library( ":common_utils", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", - "@local_config_cuda//cuda:cuda_headers", "//tensorflow/core:framework", "//tensorflow/core:gpu_headers_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:lib_proto_parsing", - "//tensorflow/core:stream_executor", + "//tensorflow/core/platform:stream_executor", "//tensorflow/core:stream_executor_headers_lib", "//tensorflow/core/common_runtime:core_cpu_lib_no_ops", "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/stream_executor/lib", - ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), + ] + if_tensorrt([ + ":tensorrt_lib", + "@local_config_cuda//cuda:cuda_headers", + ]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) @@ -480,7 +483,7 @@ tf_cuda_cc_test( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensor_testutil", + "//tensorflow/core/framework:tensor_testutil", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", @@ -492,15 +495,32 @@ tf_cuda_cc_test( # Library for the segmenting portion of TensorRT operation creation cc_library( - name = "segment", - srcs = ["segment/segment.cc"], + name = "union_find", + srcs = ["segment/union_find.cc"], hdrs = [ - "segment/segment.h", "segment/union_find.h", ], copts = tf_copts(), + deps = [ + ":utils", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "segment", + srcs = ["segment/segment.cc"], + hdrs = [ + "segment/segment.h", + ], + copts = tf_copts(), deps = [ ":common_utils", + ":union_find", ":utils", "//tensorflow/core:graph", "//tensorflow/core:lib", diff --git 
a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index c4fc3e4f5da..28c08cd2ddc 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -733,6 +733,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { } segment_options.minimum_segment_size = params.minimum_segment_size; segment_options.use_implicit_batch = params.use_implicit_batch; + if (segment_options.use_implicit_batch) + segment_options.maximum_batch_size = params.max_batch_size; segment_options.allow_dynamic_non_batch_dim = AllowDynamicNonBatchDimension(params); @@ -753,13 +755,10 @@ Status ConvertAfterShapes(const ConversionParams& params) { // Get the EngineInfo for each segment. std::unordered_map node_map; TF_RETURN_IF_ERROR(BuildNodeMap(graph, &node_map)); - float total_num_nodes_in_segments = 0.; std::vector engine_segments; engine_segments.reserve(initial_segments.size()); std::vector reverse_topo_order; GetPostOrder(graph, &reverse_topo_order); - size_t total_engine_bytes_size = 0; - std::vector engine_bytes_size; segment::SegmentNodesVector converted_segments; converted_segments.reserve(initial_segments.size()); string engine_name_prefix = @@ -791,9 +790,6 @@ Status ConvertAfterShapes(const ConversionParams& params) { continue; } - engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); - total_engine_bytes_size += engine_bytes_size.back(); - total_num_nodes_in_segments += curr_segment.size(); engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); @@ -832,13 +828,9 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine_nodes.resize(engine_segments.size()); for (int i = 0; i < engine_segments.size(); ++i) { auto& engine = engine_segments.at(i); - // Partition the workspace size by the average of node ratio and segment - // graphdef size - engine.max_workspace_size_bytes = - params.max_workspace_size_bytes * - (engine_bytes_size.at(i) / total_engine_bytes_size + - converted_segments.at(i).size() / total_num_nodes_in_segments) / - 2.0; + // TODO(b/170762693): implement the heuristic to calculate + // max_workspace_size_bytes. + engine.max_workspace_size_bytes = params.max_workspace_size_bytes; VLOG(1) << "Assigned " << engine.max_workspace_size_bytes << " bytes to " << engine.engine_name; auto status = CreateTRTNode(params, engine_segments, i, diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 3b0553426c0..be3bb51dbed 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -151,7 +151,8 @@ TEST(ConvertGraphTest, GetDeviceAndAllocator) { class ConvertAfterShapesTest : public ::testing::Test { public: - Status RunConvertAfterShape(Scope s, GraphDef* output_graph_def) { + Status RunConvertAfterShape(Scope s, GraphDef* output_graph_def, + int maximum_batch_size = 1000) { // Create GraphProperties. 
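For context on the convert_graph.cc change above: the removed heuristic split params.max_workspace_size_bytes across engines by averaging each segment's share of serialized graph bytes and of node count; until the TODO lands, every engine now simply receives the full budget. A rough Python sketch of the old partitioning (illustrative only):

# Sketch: the removed per-engine workspace partitioning, roughly.
def old_workspace_split(max_workspace_bytes, engine_bytes, engine_nodes):
    total_bytes = float(sum(engine_bytes))
    total_nodes = float(sum(engine_nodes))
    return [
        max_workspace_bytes * (b / total_bytes + n / total_nodes) / 2.0
        for b, n in zip(engine_bytes, engine_nodes)
    ]


# Two segments: the per-engine shares sum back to the overall budget.
split = old_workspace_split(8 << 20, engine_bytes=[600, 200], engine_nodes=[30, 10])
assert abs(sum(split) - (8 << 20)) < 1e-6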
grappler::GrapplerItem item; TF_EXPECT_OK(s.ToGraphDef(&item.graph)); @@ -162,6 +163,7 @@ class ConvertAfterShapesTest : public ::testing::Test { const std::vector output_names{"output"}; ConversionParams params; params.output_names = &output_names; + params.max_batch_size = maximum_batch_size; params.max_workspace_size_bytes = 8 << 20; params.output_graph_def = output_graph_def; params.minimum_segment_size = 1; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index c0c3f25177e..d09485c35c7 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -26,6 +26,7 @@ limitations under the License. #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" @@ -429,11 +430,52 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, return Status::OK(); } +std::string GetLayerNameSuffix(absl::string_view sub_op_name, + absl::optional sub_op_instance) { + std::string op_suffix(sub_op_name); + if (sub_op_instance.has_value()) { + op_suffix = + absl::StrCat(op_suffix, "_", std::to_string(sub_op_instance.value())); + } + return op_suffix; +} + +// Sets the name of an ILayer using the name of the node_def. If the operation +// represented by the ILayer is generated by the converter to support the +// conversion of node_def, callers need to specify a non-empty sub_op_name +// to be appended to the name of node_def to avoid layer name conflicts. If the +// operation is generated multiple times, callers also need to specify +// sub_op_instance to be appended to the name of the layers to avoid layer name +// conflicts. +void SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name = "", + absl::optional sub_op_instance = absl::nullopt) { + std::string sub_op_suffix = GetLayerNameSuffix(sub_op_name, sub_op_instance); + if (sub_op_suffix.empty()) { + layer->setName(node_def.name().c_str()); + } else { + layer->setName(absl::StrCat(node_def.name(), "-", sub_op_suffix).c_str()); + } +} + +// Sets the name of an ILayer using the format of +// "main_op_name"_"sub_op_name"_"sub_op_instance". 
+void SetLayerName(nvinfer1::ILayer* layer, absl::string_view main_op_name, + absl::string_view sub_op_name, + absl::optional sub_op_instance = absl::nullopt) { + std::string layer_name_suffix = + GetLayerNameSuffix(sub_op_name, sub_op_instance); + layer->setName(absl::StrCat(main_op_name, "-", layer_name_suffix).c_str()); +} + nvinfer1::ITensor* Converter::CreateConstantLayer( const TRT_ShapedWeights& weights, const nvinfer1::Dims& dims) { nvinfer1::Weights trt_weights = weights.GetTrtWeights(); nvinfer1::IConstantLayer* layer = network()->addConstant(dims, trt_weights); if (!layer) return nullptr; + SetLayerName(layer, "_tftrt_constant_", + std::to_string(next_constant_layer_id_)); + next_constant_layer_id_++; nvinfer1::ITensor* trt_tensor = layer->getOutput(0); #if !IS_TRT_VERSION_GE(5, 1, 3, 0) // TODO(laigd): there is a bug in TensorRT 5.0 library that, if we don't set @@ -1313,6 +1355,7 @@ Status Converter::AddInputTensor(const string& name, nvinfer1::DataType dtype, Status Converter::RenameAndMarkOutputTensors( const std::vector& output_tensors) { + int output_index = 0; for (const auto& output : output_tensors) { TRT_TensorOrWeights tensor_or_weights; TF_RETURN_IF_ERROR( @@ -1341,6 +1384,7 @@ Status Converter::RenameAndMarkOutputTensors( nvinfer1::IShuffleLayer* layer = network()->addShuffle(*tensor); TFTRT_RETURN_ERROR_IF_NULLPTR( layer, StrCat("Output Copy for ", tensor->getName())); + SetLayerName(layer, tensor->getName(), "shuffle", output_index); MarkQuantizationRangesAsInferrable(tensor, layer->getOutput(0)); tensor = layer->getOutput(0); } @@ -1349,6 +1393,7 @@ Status Converter::RenameAndMarkOutputTensors( // Set type after marking as output. TRT only supports setType for engine // outputs and inputs (type is inferred otherwise). tensor->setType(output.trt_dtype); + output_index++; VLOG(1) << "Marking output TRT tensor " << output.source_tensor_name << " with data type " << DebugString(output.trt_dtype) << ", which feeds TF node " << output.dest_node_name; @@ -1475,8 +1520,9 @@ Status Converter::GetTensorOrWeights(const string& name, Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, const std::vector& order_with_batch_dim, - absl::string_view name, - nvinfer1::ITensor** output_tensor) { + nvinfer1::ITensor** output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name) { const auto dims = input_tensor->getDimensions(); const int order_size = use_implicit_batch_ ? order_with_batch_dim.size() - 1 : order_with_batch_dim.size(); @@ -1491,7 +1537,8 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose"); - layer->setName(std::basic_string(name).c_str()); + SetLayerName(layer, node_def, sub_op_name); + MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0)); nvinfer1::Permutation permutation; @@ -1555,7 +1602,9 @@ Status Converter::GetWeightRange(const TRT_ShapedWeights& weights, Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, const nvinfer1::Dims& dims, const bool validation_only, - nvinfer1::ITensor** tensor) { + nvinfer1::ITensor** tensor, + const NodeDef& node_def, + absl::optional op_instance) { const nvinfer1::Dims input_dims = input.GetTrtDims(); // If one of input_dims and dims doesn't have static shape, it means some of // the dims are unknown or need to be inferred. 
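Both SetLayerName overloads added above funnel through GetLayerNameSuffix: an optional instance number is attached to the sub-op name with an underscore, and the resulting suffix is joined to the node (or main-op) name with a dash. A small self-contained sketch of that naming scheme, using std::optional in place of the absl::optional used in the converter and example node names that are purely illustrative:

    #include <iostream>
    #include <optional>
    #include <string>

    // Mirrors GetLayerNameSuffix + SetLayerName: "<main>-<sub>[_<instance>]",
    // falling back to the bare main/node name when there is no suffix.
    std::string LayerName(const std::string& main_op_name,
                          const std::string& sub_op_name,
                          std::optional<int> sub_op_instance = std::nullopt) {
      std::string suffix = sub_op_name;
      if (sub_op_instance.has_value()) {
        suffix += "_" + std::to_string(*sub_op_instance);
      }
      return suffix.empty() ? main_op_name : main_op_name + "-" + suffix;
    }

    int main() {
      std::cout << LayerName("conv1", "to_NCHW") << "\n";       // conv1-to_NCHW
      std::cout << LayerName("biasadd", "shuffle", 1) << "\n";  // biasadd-shuffle_1
      std::cout << LayerName("_tftrt_constant_", "3") << "\n";  // _tftrt_constant_-3
      std::cout << LayerName("my_matmul", "") << "\n";          // my_matmul
      return 0;
    }

Giving sibling layers of one node distinct sub_op_name or sub_op_instance values (for example the 0/1 instances used for the two BiasAdd reshapes later in this file) is what keeps the generated names unique, which the engine-construction check added further below relies on.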
And we don't do further checks @@ -1586,6 +1635,7 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input.tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape"); + SetLayerName(layer, node_def, "shuffle", op_instance); layer->setReshapeDimensions(dims); MarkQuantizationRangesAsInferrable(input.tensor(), layer->getOutput(0)); *tensor = layer->getOutput(0); @@ -2086,6 +2136,7 @@ Status Conv2DPaddingHelper(OpConverterParams* params, const TFAttrs& attrs, *tensor, nvinfer1::DimsHW((*padding)[0].first, (*padding)[1].first), nvinfer1::DimsHW((*padding)[0].second, (*padding)[1].second)); TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, params->node_def.name()); + SetLayerName(pad_layer, params->node_def, "pad"); params->converter->MarkQuantizationRangesAsInferrable( tensor, pad_layer->getOutput(0)); *padding = {{0, 0}, {0, 0}}; @@ -2186,7 +2237,7 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } // Dimensions of transposed tensor. const auto tensor_dim = tensor->getDimensions(); @@ -2252,7 +2303,6 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); conv_layer = layer; } else { @@ -2269,11 +2319,11 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); layer->setDilation(dilation); conv_layer = layer; } + SetLayerName(conv_layer, node_def, "conv"); nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0); // Add an extra padding for Deconv because TRT doesn't accept the // argument output_shape and thus the TRT output shape could be wrong @@ -2306,13 +2356,13 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, params->converter->network()->addPadding(*output_tensor, pre_padding, post_padding); output_tensor = padding_layer->getOutput(0); + SetLayerName(padding_layer, node_def, "pad"); } } // Restore transpose. if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -2370,7 +2420,7 @@ Status ConvertTranspose(OpConverterParams* params) { // Start conversion. 
nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - input_tensor, perm, params->node_def.name(), &output_tensor)); + input_tensor, perm, &output_tensor, params->node_def)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } @@ -2401,6 +2451,7 @@ Status ConvertShape(OpConverterParams* params) { nvinfer1::IShapeLayer* shape_layer = params->converter->network()->addShape(*inputs.at(0).tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(shape_layer, params->node_def.name()); + SetLayerName(shape_layer, params->node_def, "shape"); params->outputs->push_back(TRT_TensorOrWeights(shape_layer->getOutput(0))); return Status::OK(); #else @@ -2471,7 +2522,7 @@ Status ConvertReshape(OpConverterParams* params) { nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( input_tensor, output_nonbatch_dims, params->validation_only, - &output_tensor)); + &output_tensor, params->node_def)); if (params->validation_only) return Status::OK(); // Record the conversion result. @@ -2514,7 +2565,8 @@ Status ConvertExpandDims(OpConverterParams* params) { nvinfer1::Dims new_dims; TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims)); TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input_tensor, new_dims, /*validation_only=*/false, &output_tensor)); + input_tensor, new_dims, /*validation_only=*/false, &output_tensor, + params->node_def)); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -2524,7 +2576,8 @@ Status Converter::DynamicReshape(nvinfer1::ITensor* input, std::vector> slices, OpConverterParams* params, nvinfer1::ITensor** output, - std::vector size_for_added_dims) { + std::vector size_for_added_dims, + absl::optional op_instance) { *output = nullptr; // DynamicReshape relies on INetworkDefinition::addShape that was introduced // in TensorRT 6. @@ -2536,9 +2589,11 @@ Status Converter::DynamicReshape(nvinfer1::ITensor* input, nvinfer1::ITensor* shape = network()->addShape(*input)->getOutput(0); // Build new shape = shape[:trt_axis] + [1] + shape[trt_axis:] std::vector concat_inputs; - for (int i = 0; i < std::max(slices.size(), size_for_added_dims.size()); - i++) { + int max_num_slices = std::max(slices.size(), size_for_added_dims.size()); + int op_instance_value = op_instance.has_value() ? 
op_instance.value() : 0; + for (int i = 0; i < max_num_slices; i++) { nvinfer1::ITensor* tensor; + int slice_instance = i * max_num_slices + op_instance_value; // maybe_add_a_dimension(i); if (i < size_for_added_dims.size() && size_for_added_dims[i] >= 0) { TF_RETURN_IF_ERROR( @@ -2546,11 +2601,11 @@ Status Converter::DynamicReshape(nvinfer1::ITensor* input, concat_inputs.push_back(tensor); } if (i < slices.size()) { - concat_inputs.push_back( - network() - ->addSlice(*shape, {1, {slices[i].first}}, - {1, {slices[i].second - slices[i].first}}, {1, {1}}) - ->getOutput(0)); + nvinfer1::ISliceLayer* slice_layer = network()->addSlice( + *shape, {1, {slices[i].first}}, + {1, {slices[i].second - slices[i].first}}, {1, {1}}); + concat_inputs.push_back(slice_layer->getOutput(0)); + SetLayerName(slice_layer, params->node_def, "slice", slice_instance); } } nvinfer1::IConcatenationLayer* concat_layer = network()->addConcatenation( @@ -2560,6 +2615,7 @@ Status Converter::DynamicReshape(nvinfer1::ITensor* input, nvinfer1::ITensor* new_shape = concat_layer->getOutput(0); // Reshape input using new shape nvinfer1::IShuffleLayer* shuffle = network()->addShuffle(*input); + SetLayerName(shuffle, params->node_def, "shuffle", op_instance); shuffle->setInput(1, *new_shape); *output = shuffle->getOutput(0); return Status::OK(); @@ -2572,7 +2628,8 @@ Status Converter::DynamicReshape(nvinfer1::ITensor* input, Status Converter::DynamicExpandDims(nvinfer1::ITensor* input, const nvinfer1::Dims& dims, int axis, OpConverterParams* params, - nvinfer1::ITensor** output) { + nvinfer1::ITensor** output, + absl::optional op_instance) { if (params->validation_only) { *output = nullptr; return errors::Internal( @@ -2588,7 +2645,7 @@ Status Converter::DynamicExpandDims(nvinfer1::ITensor* input, if (axis != dims.nbDims) { slices.push_back(std::pair{axis, dims.nbDims}); } - return DynamicReshape(input, slices, params, output, extra_dims); + return DynamicReshape(input, slices, params, output, extra_dims, op_instance); } Status Converter::SqueezeTensor(nvinfer1::ITensor* input, @@ -2616,7 +2673,8 @@ Status Converter::SqueezeTensor(nvinfer1::ITensor* input, VLOG(2) << "input_dims" << input_dims; TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(*input_dims, &new_dims)); TF_RETURN_IF_ERROR(PrepareTensorForShape(TRT_TensorOrWeights(input), new_dims, - /*validation_only=*/false, output)); + /*validation_only=*/false, output, + params->node_def)); return Status::OK(); } @@ -2680,11 +2738,11 @@ Status ConvertSqueeze(OpConverterParams* params) { } template -Status ConvertStridedSliceHelper(OpConverterParams* params, - const TRT_TensorOrWeights& input, - Container begin, Container size, - const Container& stride, - const nvinfer1::Dims* final_shape = nullptr) { +Status ConvertStridedSliceHelper( + OpConverterParams* params, const TRT_TensorOrWeights& input, + Container begin, Container size, const Container& stride, + const nvinfer1::Dims* final_shape = nullptr, + absl::optional op_instance = absl::nullopt) { const auto& node_def = params->node_def; // Get input dims. nvinfer1::Dims dims = input.GetTrtDims(); @@ -2709,6 +2767,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, node_def.op(), ", at ", node_def.name()); } } + // TRT 5.1 adds ISliceLayer. For older versions, we attempt to use the // padding layer with negative padding. 
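The DynamicReshape change above gives every slice layer it emits an instance number computed as i * max_num_slices + op_instance, so repeated calls for the same node do not reuse a suffix. A small sketch of the values that arithmetic produces, with a made-up max_num_slices of 3:

    #include <cstdio>

    int main() {
      // Hypothetical reshape that emits max_num_slices = 3 slice layers per call.
      const int max_num_slices = 3;
      const int op_instances[] = {0, 1};  // two DynamicReshape calls for one node
      for (int op_instance_value : op_instances) {
        for (int i = 0; i < max_num_slices; ++i) {
          // Same arithmetic as slice_instance in DynamicReshape above.
          const int slice_instance = i * max_num_slices + op_instance_value;
          std::printf("op_instance=%d i=%d -> slice_%d\n", op_instance_value, i,
                      slice_instance);
        }
      }
      return 0;
    }

As long as the caller-supplied op_instance stays below max_num_slices, the resulting slice_N suffixes are distinct across calls.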
#if IS_TRT_VERSION_GE(5, 1, 3, 1) @@ -2723,12 +2782,13 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, nvinfer1::ISliceLayer* layer = params->converter->network()->addSlice( *input.tensor(), begin_dims, size_dims, stride_dims); + SetLayerName(layer, params->node_def, "slice", op_instance); nvinfer1::ITensor* tensor = layer->getOutput(0); // Reshape for shrink_axis. if (final_shape) { TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(tensor), *final_shape, /*validation_only=*/false, - &tensor)); + &tensor, node_def, op_instance)); } params->outputs->push_back(TRT_TensorOrWeights(tensor)); return Status::OK(); @@ -2782,6 +2842,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, if (params->validation_only) return Status::OK(); nvinfer1::IShuffleLayer* layer = params->converter->network()->addShuffle(*input.tensor()); + SetLayerName(layer, params->node_def, "shuffle", op_instance); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); } else if (pad_dims.size() == 1) { @@ -2830,30 +2891,32 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, nvinfer1::ITensor* tensor = input.tensor(); if (need_reshape) { TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input, reshape_dims, /*validation_only=*/false, &tensor)); + input, reshape_dims, /*validation_only=*/false, &tensor, node_def, + op_instance)); } if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, transpose_order, StrCat(node_def.name(), "_for_pad"), &tensor)); + tensor, transpose_order, &tensor, node_def, "for_pad", op_instance)); } // Add padding layer nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding( *tensor, pre_padding, post_padding); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, params->node_def, "pad"); params->converter->MarkQuantizationRangesAsInferrable(tensor, layer->getOutput(0)); tensor = layer->getOutput(0); // Restore transpose if (need_transpose) { - TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, inv_transpose_order, StrCat(node_def.name(), "_after_pad"), - &tensor)); + TF_RETURN_IF_ERROR( + params->converter->TransposeTensor(tensor, inv_transpose_order, &tensor, + node_def, "after_pad", op_instance)); } // Reshape for shrink_axis. if (final_shape) { TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(tensor), *final_shape, /*validation_only=*/false, - &tensor)); + &tensor, node_def, op_instance)); } else if (need_reshape) { // Restore reshape. 
// Calculate output dimensions @@ -2876,7 +2939,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, /*ignore_first_dim=*/true)); TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(tensor), new_dims, /*validation_only=*/false, - &tensor)); + &tensor, node_def, op_instance)); } params->outputs->push_back(TRT_TensorOrWeights(tensor)); @@ -3166,8 +3229,7 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, const bool need_transpose = is_ndhwc; if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 4, 1, 2, 3}, StrCat(node_def.name(), "_to_NCDHW"), - &tensor)); + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); } // group == 0 signifies that this is a depthwise convolution, so set @@ -3206,7 +3268,6 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); conv_layer = layer; } else { @@ -3222,18 +3283,17 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); layer->setNbGroups(num_groups); layer->setDilationNd(dilation_dhw); conv_layer = layer; } + SetLayerName(conv_layer, node_def, "conv"); nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0); // Restore transpose. if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 4, 1}, StrCat(node_def.name(), "_to_NDHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -3302,8 +3362,7 @@ Status ConvertPool3D(OpConverterParams* params) { if (data_format == "NDHWC") { // NDHWC => NCDHW TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 4, 1, 2, 3}, StrCat(node_def.name(), "_to_NCDHW"), - &tensor)); + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); } const nvinfer1::Dims3 stride(tf_stride[d_index], tf_stride[h_index], @@ -3324,14 +3383,13 @@ Status ConvertPool3D(OpConverterParams* params) { // SAME_UPPER means that post padding is preferred. 
layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def, "pooling"); nvinfer1::ITensor* output_tensor = layer->getOutput(0); if (data_format == "NDHWC") { // NCDHW => NDHWC TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 4, 1}, StrCat(node_def.name(), "_to_NDHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -3426,7 +3484,7 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } nvinfer1::DimsHW kernel_size; @@ -3482,7 +3540,7 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { #else conv_layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - conv_layer->setName(node_def.name().c_str()); + SetLayerName(conv_layer, node_def, "conv"); conv_layer->setNbGroups(1); conv_layer->setDilation(dilation); nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0); @@ -3493,13 +3551,13 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { params->converter->network()->addActivation(*output_tensor, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(activation_layer, node_def.name()); + SetLayerName(activation_layer, node_def, "activation"); output_tensor = activation_layer->getOutput(0); } // Restore transpose. if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -3541,7 +3599,7 @@ Status ConvertPool(OpConverterParams* params) { h_index = 1; w_index = 2; TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } const auto tf_stride = attrs.get>("strides"); @@ -3575,6 +3633,7 @@ Status ConvertPool(OpConverterParams* params) { *tensor, nvinfer1::DimsHW(padding[0].first, padding[1].first), nvinfer1::DimsHW(padding[0].second, padding[1].second)); TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name()); + SetLayerName(pad_layer, node_def, "pad"); params->converter->MarkQuantizationRangesAsInferrable( tensor, pad_layer->getOutput(0)); padding = {{0, 0}, {0, 0}}; @@ -3604,13 +3663,12 @@ Status ConvertPool(OpConverterParams* params) { #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def, "pooling"); nvinfer1::ITensor* output_tensor = layer->getOutput(0); if (data_format == "NHWC") { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -3633,6 +3691,7 @@ Status ConvertLeakyRelu(OpConverterParams* params) { params->converter->network()->addActivation( *inputs.at(0).tensor(), 
nvinfer1::ActivationType::kLEAKY_RELU); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "activation"); layer->setAlpha(alpha); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); @@ -3655,12 +3714,14 @@ Status ConvertLeakyRelu(OpConverterParams* params) { params->converter->network()->addElementWise( *tensor, *const_alpha_tensor, nvinfer1::ElementWiseOperation::kPROD); TFTRT_RETURN_ERROR_IF_NULLPTR(mul_layer, node_def.name()); + SetLayerName(mul_layer, node_def, "mul"); // max(x, alpha * x) nvinfer1::IElementWiseLayer* max_layer = params->converter->network()->addElementWise( *tensor, *mul_layer->getOutput(0), nvinfer1::ElementWiseOperation::kMAX); TFTRT_RETURN_ERROR_IF_NULLPTR(max_layer, node_def.name()); + SetLayerName(max_layer, node_def, "max"); nvinfer1::ITensor* output_tensor = max_layer->getOutput(0); params->converter->MarkQuantizationRangesAsInferrable( output_tensor, mul_layer->getOutput(0)); @@ -3705,6 +3766,7 @@ Status ConvertClipByValue(OpConverterParams* params) { layer->setAlpha(clip_value_min); layer->setBeta(clip_value_max); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "activation"); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->converter->ProvideQuantizationRange(output_tensor, clip_value_min, clip_value_max); @@ -3748,7 +3810,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def, "activation"); // Set parameters. #if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { @@ -3852,7 +3914,7 @@ Status ConvertRelu6(OpConverterParams* params) { TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setAlpha(0.0f); layer->setBeta(6.0f); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def, "activation"); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -3867,6 +3929,7 @@ Status ConvertRelu6(OpConverterParams* params) { params->converter->network()->addActivation( *tensor, nvinfer1::ActivationType::kRELU); TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name()); + SetLayerName(relu_layer, node_def, "activation"); // Large range of relu is problematic during quantization in INT8 precision // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization. 
@@ -3888,6 +3951,7 @@ Status ConvertRelu6(OpConverterParams* params) { *relu_layer->getOutput(0), *const6_tensor, nvinfer1::ElementWiseOperation::kMIN); TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name()); + SetLayerName(relu6_layer, node_def, "min"); nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0); params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); @@ -3932,6 +3996,7 @@ Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) { nvinfer1::IShuffleLayer* shuffle_layer = params->converter->network()->addShuffle(*tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); + SetLayerName(shuffle_layer, node_def, "shuffle", /*op_instance=*/0); params->converter->MarkQuantizationRangesAsInferrable( tensor, shuffle_layer->getOutput(0)); @@ -3963,6 +4028,7 @@ Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) { *tensor, mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "scale"); nvinfer1::ITensor* output_tensor = layer->getOutput(0); @@ -3971,6 +4037,7 @@ Status ConvertBiasAddInt8WithoutCalibration(OpConverterParams* params) { nvinfer1::IShuffleLayer* shuffle_layer = params->converter->network()->addShuffle(*output_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name()); + SetLayerName(shuffle_layer, node_def, "shuffle", /*op_instance=*/1); // NOTE: for same reason as mentioned above we need to apply the reshape // unconditionally. nvinfer1::Dims reshape_dims = original_dims; @@ -4055,13 +4122,16 @@ Status ConvertBiasAdd(OpConverterParams* params) { // Convert input to a TRT tensor nvinfer1::ITensor* input_tensor{nullptr}; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), input_shape, params->validation_only, &input_tensor)); + inputs.at(0), input_shape, params->validation_only, &input_tensor, + node_def, + /*op_instance=*/0)); // Finally, reshape bias. Since the bias is usually a constant, this will // normally happen at conversion-time. nvinfer1::ITensor* bias_tensor{nullptr}; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), bias_shape, params->validation_only, &bias_tensor)); + inputs.at(1), bias_shape, params->validation_only, &bias_tensor, node_def, + /*op_instance=*/1)); VLOG(2) << "Bias shape adjusted to " << DebugString(bias_shape); if (params->validation_only) return Status::OK(); @@ -4070,6 +4140,7 @@ Status ConvertBiasAdd(OpConverterParams* params) { params->converter->network()->addElementWise( *input_tensor, *bias_tensor, nvinfer1::ElementWiseOperation::kSUM); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "sum"); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -4298,16 +4369,18 @@ Status ConvertBinary(OpConverterParams* params) { nvinfer1::ITensor* tensor_r = nullptr; // This will also convert constants to tensors, and set quantization ranges. 
TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - operand_l, broadcasted_dims_l, params->validation_only, &tensor_l)); + operand_l, broadcasted_dims_l, params->validation_only, &tensor_l, + node_def, /*op_instance=*/0)); TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - operand_r, broadcasted_dims_r, params->validation_only, &tensor_r)); + operand_r, broadcasted_dims_r, params->validation_only, &tensor_r, + node_def, /*op_instance=*/1)); if (params->validation_only) return Status::OK(); // Add ElementWise layer. nvinfer1::ILayer* layer = params->converter->network()->addElementWise( *tensor_l, *tensor_r, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* trt_tensor = layer->getOutput(0); #if IS_TRT_VERSION_GE(5, 1, 0, 0) @@ -4315,6 +4388,7 @@ Status ConvertBinary(OpConverterParams* params) { layer = params->converter->network()->addUnary( *trt_tensor, nvinfer1::UnaryOperation::kFLOOR); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "floor"); trt_tensor = layer->getOutput(0); } #endif @@ -4353,10 +4427,12 @@ Status ConvertRsqrt(OpConverterParams* params) { nvinfer1::IUnaryLayer* sqrt_layer = params->converter->network()->addUnary( *tensor, nvinfer1::UnaryOperation::kSQRT); TFTRT_RETURN_ERROR_IF_NULLPTR(sqrt_layer, node_def.name()); + SetLayerName(sqrt_layer, node_def, "sqrt"); // Recip nvinfer1::IUnaryLayer* recip_layer = params->converter->network()->addUnary( *sqrt_layer->getOutput(0), nvinfer1::UnaryOperation::kRECIP); TFTRT_RETURN_ERROR_IF_NULLPTR(recip_layer, node_def.name()); + SetLayerName(recip_layer, node_def, "recip"); params->outputs->push_back(TRT_TensorOrWeights(recip_layer->getOutput(0))); return Status::OK(); } @@ -4408,7 +4484,7 @@ Status ConvertUnary(OpConverterParams* params) { nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(*tensor, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); // Set quantization ranges. 
@@ -4453,6 +4529,7 @@ Status ConvertSquare(OpConverterParams* params) { *inputs.at(0).tensor(), *const2_tensor, nvinfer1::ElementWiseOperation::kPOW); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -4511,6 +4588,7 @@ Status ConvertReduce(OpConverterParams* params) { nvinfer1::ILayer* layer = params->converter->network()->addReduce( *tensor, reduce_operation, axes, keep_dims); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); @@ -4585,21 +4663,25 @@ Status ConvertPack(OpConverterParams* params) { nvinfer1::Dims expanded_dims; TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(tensor_dims, &expanded_dims)); std::vector expanded_tensors; + int input_index = 0; for (const TRT_TensorOrWeights& input : inputs) { nvinfer1::ITensor* expanded_tensor = nullptr; if (input.is_tensor() && !params->use_implicit_batch && !HasStaticShape(dims)) { if (!params->validation_only) { TF_RETURN_IF_ERROR(params->converter->DynamicExpandDims( - input.tensor(), dims, trt_axis, params, &expanded_tensor)); + input.tensor(), dims, trt_axis, params, &expanded_tensor, + input_index)); } } else { TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - input, expanded_dims, params->validation_only, &expanded_tensor)); + input, expanded_dims, params->validation_only, &expanded_tensor, + node_def, input_index)); } if (!params->validation_only) { expanded_tensors.push_back(expanded_tensor); } + input_index++; } if (params->validation_only) return Status::OK(); @@ -4615,6 +4697,7 @@ Status ConvertPack(OpConverterParams* params) { const_cast(expanded_tensors.data()), expanded_tensors.size()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "concat"); // Note that trt_axis stays the same even after expanding tensors at the axis. 
layer->setAxis(trt_axis); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); @@ -4696,7 +4779,7 @@ Status ConvertPad(OpConverterParams* params) { if (pad_index[0] == 1) { legit_pad = false; TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 2, 1}, StrCat(node_def.name(), "_to_pad"), &tensor)); + tensor, {0, 3, 2, 1}, &tensor, node_def, "to_pad")); permuted_pad_index[0] = 3; } @@ -4714,13 +4797,13 @@ Status ConvertPad(OpConverterParams* params) { nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding( *tensor, pre_padding, post_padding); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->converter->MarkQuantizationRangesAsInferrable(tensor, output_tensor); if (!legit_pad) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 3, 2, 1}, StrCat(node_def.name(), "_from_pad"), - &output_tensor)); + output_tensor, {0, 3, 2, 1}, &output_tensor, node_def, "from_pad")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -4780,7 +4863,7 @@ Status ConvertSplitHelper(OpConverterParams* params, for (int i = 0; i < num_splits; ++i) { begin[trt_axis + 1] = i * split_size_on_axis; TF_RETURN_IF_ERROR(ConvertStridedSliceHelper( - params, input, begin, size, stride, final_shape_for_unpack_ptr)); + params, input, begin, size, stride, final_shape_for_unpack_ptr, i)); } return Status::OK(); } @@ -4854,6 +4937,7 @@ Status ConvertCast(OpConverterParams* params) { nvinfer1::ITensor* input = params->inputs.at(0).tensor(); nvinfer1::IIdentityLayer* layer = params->converter->network()->addIdentity(*input); + SetLayerName(layer, node_def); layer->setPrecision(nvinfer1::DataType::kFLOAT); if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) { @@ -4911,6 +4995,7 @@ Status ConvertConcat(OpConverterParams* params) { const_cast(input_tensors.data()), input_tensors.size()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); layer->setAxis(trt_axis); params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); return Status::OK(); @@ -5057,7 +5142,7 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { combined_scale_weights.GetTrtWeights(), dummy_power_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5137,6 +5222,7 @@ Status ConvertGather(OpConverterParams* params) { nvinfer1::IGatherLayer* layer = params->converter->network()->addGather( *params_tensor, *indices_input.tensor(), trt_axis); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); nvinfer1::Dims trt_gather_output_dims = output_tensor->getDimensions(); @@ -5163,7 +5249,7 @@ Status ConvertGather(OpConverterParams* params) { TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(output_tensor), trt_gather_output_dims, - /*validation_only=*/false, &output_tensor)); + /*validation_only=*/false, &output_tensor, node_def)); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -5173,7 +5259,7 @@ Status ConvertGather(OpConverterParams* params) { Status ConvertFullyConnectedHelper(OpConverterParams* params, nvinfer1::ITensor* 
tensor_a, TRT_ShapedWeights weights_b, - bool transpose_b, const string& node_name) { + bool transpose_b, const NodeDef& node_def) { // Reshape input to 3D - this will be a no-op unless using int8 precision. auto input_dim = tensor_a->getDimensions(); while (input_dim.nbDims < 3) { @@ -5181,7 +5267,7 @@ Status ConvertFullyConnectedHelper(OpConverterParams* params, } TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(tensor_a), input_dim, /*validation_only=*/false, - &tensor_a)); + &tensor_a, node_def, /*op_instance=*/0)); // FC layer will transpose weights, so we need to pre-transpose. TRT_ShapedWeights weights(weights_b.TrtDType()); @@ -5197,7 +5283,8 @@ Status ConvertFullyConnectedHelper(OpConverterParams* params, params->converter->network()->addFullyConnected( *tensor_a, noutput, weights.GetTrtWeights(), biases.GetTrtWeights()); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); // Reshape output to 1D - this will be a no-op unless using int8 precision. @@ -5205,7 +5292,7 @@ Status ConvertFullyConnectedHelper(OpConverterParams* params, output_dim.nbDims = 1; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(output_tensor), output_dim, /*validation_only=*/false, - &output_tensor)); + &output_tensor, node_def, /*op_instance=*/1)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5214,7 +5301,7 @@ Status ConvertFullyConnectedHelper(OpConverterParams* params, Status ConvertMatMulHelper(OpConverterParams* params, TRT_TensorOrWeights input_a, TRT_TensorOrWeights input_b, bool transpose_a, - bool transpose_b, string node_name) { + bool transpose_b, const NodeDef& node_def) { // TODO: ReorderCKtoKC is currently not general enough to transpose weights // that are not 2D. 
if ((transpose_a && input_a.is_weights() && @@ -5252,7 +5339,7 @@ Status ConvertMatMulHelper(OpConverterParams* params, if (should_use_fc || (can_use_fc && params->converter->precision_mode() == TrtPrecisionMode::INT8)) { return ConvertFullyConnectedHelper( - params, input_a.tensor(), input_b.weights(), transpose_b, node_name); + params, input_a.tensor(), input_b.weights(), transpose_b, node_def); } const auto get_matrix_op = [](nvinfer1::ITensor* in, @@ -5293,7 +5380,8 @@ Status ConvertMatMulHelper(OpConverterParams* params, *tensor_a, get_matrix_op(tensor_a, transpose_a), *tensor_b, get_matrix_op(tensor_b, transpose_b)); - TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_name); + TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5316,7 +5404,7 @@ Status ConvertMatMul(OpConverterParams* params) { bool transpose_b = attrs.get("transpose_b"); return ConvertMatMulHelper(params, inputs.at(0), inputs.at(1), transpose_a, - transpose_b, node_def.name()); + transpose_b, node_def); } Status ConvertBatchMatMul(OpConverterParams* params) { @@ -5379,14 +5467,16 @@ Status ConvertBatchMatMul(OpConverterParams* params) { nvinfer1::ITensor* tensor_l = nullptr; nvinfer1::ITensor* tensor_r = nullptr; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); + inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l, + node_def)); TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); + inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r, + node_def)); if (params->validation_only) return Status::OK(); return ConvertMatMulHelper(params, TRT_TensorOrWeights(tensor_l), TRT_TensorOrWeights(tensor_r), transpose_a, - transpose_b, node_def.name()); + transpose_b, node_def); } Status ConvertSoftmax(OpConverterParams* params) { @@ -5408,6 +5498,7 @@ Status ConvertSoftmax(OpConverterParams* params) { nvinfer1::ISoftMaxLayer* layer = params->converter->network()->addSoftMax(*tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); // Tensorflow SoftMax assumes applying softmax on the last dimension. layer->setAxes(1 << (num_trt_dims - 1)); @@ -5452,6 +5543,7 @@ Status ConvertArgMinMax(OpConverterParams* params) { nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK( *inputs.at(0).tensor(), topk_op, 1, reduce_axes); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "topk"); nvinfer1::ITensor* output_indices_tensor = layer->getOutput(1); // Squeeze on axis. 
@@ -5462,7 +5554,7 @@ Status ConvertArgMinMax(OpConverterParams* params) { nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(output_indices_tensor), new_dims, - /*validation_only=*/false, &output_tensor)); + /*validation_only=*/false, &output_tensor, node_def)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5508,6 +5600,7 @@ Status ConvertTopK(OpConverterParams* params) { nvinfer1::ITopKLayer* layer = params->converter->network()->addTopK(*tensor, op, k, reduce_axes); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_value_tensor = layer->getOutput(0); nvinfer1::ITensor* output_indices_tensor = layer->getOutput(1); @@ -5583,6 +5676,7 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { nvinfer1::IShuffleLayer* first_shuffle = params->converter->network()->addShuffle(*inputs.at(0).tensor()); TFTRT_RETURN_ERROR_IF_NULLPTR(first_shuffle, node_def.name()); + SetLayerName(first_shuffle, node_def, "shuffle", /*op_instance=*/0); if (data_format == "NHWC") { first_shuffle->setFirstTranspose({2, 0, 1}); } @@ -5592,6 +5686,7 @@ Status ConvertDepthSpaceShuffle(OpConverterParams* params) { nvinfer1::IShuffleLayer* second_shuffle = params->converter->network()->addShuffle(*first_shuffle->getOutput(0)); TFTRT_RETURN_ERROR_IF_NULLPTR(second_shuffle, node_def.name()); + SetLayerName(second_shuffle, node_def, "shuffle", /*op_instance=*/1); second_shuffle->setReshapeDimensions(second_shuffle_shape); if (data_format == "NHWC") { second_shuffle->setSecondTranspose({1, 2, 0}); @@ -5619,9 +5714,11 @@ Status ConvertSquaredDifference(OpConverterParams* params) { nvinfer1::ITensor* tensor_l = nullptr; nvinfer1::ITensor* tensor_r = nullptr; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l)); + inputs.at(0), broadcasted_dims_l, params->validation_only, &tensor_l, + node_def)); TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( - inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r)); + inputs.at(1), broadcasted_dims_r, params->validation_only, &tensor_r, + node_def)); if (params->validation_only) return Status::OK(); // Subtract x - y. @@ -5629,12 +5726,15 @@ Status ConvertSquaredDifference(OpConverterParams* params) { params->converter->network()->addElementWise( *tensor_l, *tensor_r, nvinfer1::ElementWiseOperation::kSUB); TFTRT_RETURN_ERROR_IF_NULLPTR(sub, node_def.name()); + SetLayerName(sub, node_def, "sub"); + // Multiply (x - y) * (x - y). 
nvinfer1::IElementWiseLayer* mul = params->converter->network()->addElementWise( *sub->getOutput(0), *sub->getOutput(0), nvinfer1::ElementWiseOperation::kPROD); TFTRT_RETURN_ERROR_IF_NULLPTR(mul, node_def.name()); + SetLayerName(mul, node_def, "mul"); params->outputs->push_back(TRT_TensorOrWeights(mul->getOutput(0))); return Status::OK(); @@ -5772,6 +5872,7 @@ Status ConvertCombinedNMS(OpConverterParams* params) { nvinfer1::IPluginV2Layer* layer = params->converter->network()->addPluginV2( &plugin_inputs[0], static_cast(plugin_inputs.size()), *plugin); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, "plugin"); // Set plugin outputs nvinfer1::ITensor* output_nmsed_boxes = layer->getOutput(1); @@ -5785,8 +5886,8 @@ Status ConvertCombinedNMS(OpConverterParams* params) { nvinfer1::ITensor* output_nmsed_scores = nullptr; nvinfer1::ITensor* output_nmsed_classes = nullptr; - auto shrink_last_dim = [params](nvinfer1::ITensor* in_tensor, - nvinfer1::ITensor** out_tensor) { + auto shrink_last_dim = [&](int output_index, nvinfer1::ITensor** out_tensor) { + nvinfer1::ITensor* in_tensor = layer->getOutput(output_index); nvinfer1::Dims dims = in_tensor->getDimensions(); if (dims.d[dims.nbDims - 1] != 1) { return errors::Internal("Expect last dims to be 1, for tensor ", @@ -5795,15 +5896,12 @@ Status ConvertCombinedNMS(OpConverterParams* params) { --dims.nbDims; TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape( TRT_TensorOrWeights(in_tensor), dims, - /*validation_only=*/false, out_tensor)); + /*validation_only=*/false, out_tensor, node_def, output_index)); return Status::OK(); }; - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(2), &output_nmsed_scores)); - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(3), &output_nmsed_classes)); - TF_RETURN_IF_ERROR( - shrink_last_dim(layer->getOutput(0), &output_num_detections)); + TF_RETURN_IF_ERROR(shrink_last_dim(2, &output_nmsed_scores)); + TF_RETURN_IF_ERROR(shrink_last_dim(3, &output_nmsed_classes)); + TF_RETURN_IF_ERROR(shrink_last_dim(0, &output_num_detections)); #endif // IS_TRT_VERSION_GE(6, 0, 0, 0) params->outputs->push_back(TRT_TensorOrWeights(output_nmsed_boxes)); @@ -5845,6 +5943,12 @@ Status ConvertResize(OpConverterParams* params) { // Verify resize mode. Initialize resize mode if supported. nvinfer1::ResizeMode resize_mode; if (node_def.op() == "ResizeBilinear") { +#if IS_TRT_VERSION_GE(7, 1, 0, 0) + if (!align_corners) { + return errors::InvalidArgument( + "Cannot Convert Bilinear Resize when align_corners=False"); + } +#endif resize_mode = nvinfer1::ResizeMode::kLINEAR; } else if (node_def.op() == "ResizeNearestNeighbor") { resize_mode = nvinfer1::ResizeMode::kNEAREST; @@ -5858,7 +5962,7 @@ Status ConvertResize(OpConverterParams* params) { // Transpose tensor from NHWC to NCHW format. TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); // Calculate output dimensions. // Given input dimensions [N, C, H, W] and output size [H_out, W_out], @@ -5875,6 +5979,7 @@ Status ConvertResize(OpConverterParams* params) { nvinfer1::IResizeLayer* layer = params->converter->network()->addResize(*tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def); // Set layer parameters. 
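The reworked shrink_last_dim lambda in ConvertCombinedNMS above now takes the plugin output index, checks that the trailing dimension is 1, and drops it before reshaping, with the index doubling as the op_instance for the reshape's layer name. A reduced sketch of the dimension handling on a plain vector (ShrinkLastDim and the sample shape are illustrative; the real helper works on nvinfer1::Dims and delegates the reshape to PrepareTensorForShape):

    #include <cstdio>
    #include <vector>

    // Drops a trailing dimension of size 1, or reports failure when the last
    // dimension is not 1 (the case the real helper turns into an Internal error).
    bool ShrinkLastDim(std::vector<int>* dims) {
      if (dims->empty() || dims->back() != 1) return false;
      dims->pop_back();
      return true;
    }

    int main() {
      std::vector<int> dims = {8, 100, 1};  // e.g. [batch, max_detections, 1]
      if (ShrinkLastDim(&dims)) {
        std::printf("rank after shrink: %zu\n", dims.size());  // prints 2
      }
      return 0;
    }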
layer->setResizeMode(resize_mode); @@ -5885,7 +5990,7 @@ Status ConvertResize(OpConverterParams* params) { nvinfer1::ITensor* output = layer->getOutput(0); TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), &output)); + output, {0, 2, 3, 1}, &output, node_def, "to_NHWC")); params->outputs->push_back(TRT_TensorOrWeights(output)); // Success return Status::OK(); @@ -5934,6 +6039,7 @@ Status ConvertAddN(OpConverterParams* params) { nvinfer1::ILayer* layer = params->converter->network()->addElementWise( *lhs, *rhs, nvinfer1::ElementWiseOperation::kSUM); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); + SetLayerName(layer, node_def, std::to_string(i)); lhs = layer->getOutput(0); } params->outputs->push_back(TRT_TensorOrWeights(lhs)); @@ -6056,6 +6162,8 @@ Status ConvertGraphDefToEngine( VLOG(1) << "Starting to convert TensorFlow ops to TensorRT layers"; std::vector output_tensors; + int num_layers = converter->network()->getNbLayers(); + absl::flat_hash_set layer_names; // Graph nodes are already topologically sorted during construction for (const auto& node_def : gdef.node()) { const string& node_name = node_def.name(); @@ -6134,6 +6242,25 @@ Status ConvertGraphDefToEngine( } else { TF_RETURN_IF_ERROR(converter->ConvertNode(node_def)); } + + // To support TF-TRT profiling, we ensure each ILayer has a non-empty name. + // BuildCudaEngine returns an error if there is any ILayer name collision. + // We want to report the error here, before BuildCudaEngine, in a more + // meaningful way. + int new_num_layers = converter->network()->getNbLayers(); + for (int i = num_layers; i < new_num_layers; i++) { + auto layer = converter->network()->getLayer(i); + if (layer->getName() == nullptr || + !layer_names.insert(layer->getName()).second) { + std::string error_message = + absl::StrCat("Converting node ", node_name, ", op=", node_def.op(), + layer->getName() ? " creates a layer with name collision" + : " creates a layer without a name"); + LOG_WARNING_WITH_PREFIX << error_message; + return errors::Internal(error_message); + } + } + num_layers = new_num_layers; } TF_RETURN_IF_ERROR(converter->RenameAndMarkOutputTensors(output_tensors)); if (convert_successfully) *convert_successfully = true; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index a621735fad1..4a84793e254 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "absl/types/optional.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" @@ -515,14 +516,18 @@ class Converter { // Transpose 'input_tensor' with given permutation 'order_with_batch_dim' to // 'output_tensor'. The permutation 'order_with_batch_dim' contains the batch - // dimension which should always be 0. + // dimension which should always be 0. If this is for adding a transpose layer + // to support the conversion of 'node_def', callers need to provide a + // non-empty 'sub_op_name' to be appended to the name of 'node_def' to avoid + // layer name conflicts. 
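The per-node check added to ConvertGraphDefToEngine above only scans the layers created since the previous node and fails fast on an empty or duplicate name, so the error is attributed to the offending TF node instead of surfacing later from engine building. A reduced sketch of the same bookkeeping over plain strings, using std::unordered_set where the converter uses absl::flat_hash_set and returning a message string instead of a Status:

    #include <iostream>
    #include <string>
    #include <unordered_set>
    #include <vector>

    // Returns an error-style message when a name is empty or already registered,
    // mirroring the duplicate-name check above; the converter reports the same
    // conditions via errors::Internal.
    std::string CheckLayerNames(const std::vector<std::string>& new_layer_names,
                                std::unordered_set<std::string>* seen) {
      for (const std::string& name : new_layer_names) {
        if (name.empty()) return "created a layer without a name";
        if (!seen->insert(name).second) {
          return "created a layer with a name collision: " + name;
        }
      }
      return "ok";
    }

    int main() {
      std::unordered_set<std::string> seen;
      std::cout << CheckLayerNames({"conv1-conv", "conv1-pad"}, &seen) << "\n";  // ok
      std::cout << CheckLayerNames({"conv1-conv"}, &seen) << "\n";  // collision
      return 0;
    }

As in the converter, the set is never reset between calls, so a collision between layers created for two different nodes is caught as well.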
Status TransposeTensor(nvinfer1::ITensor* input_tensor, const std::vector& order_with_batch_dim, - absl::string_view name, - nvinfer1::ITensor** output_tensor); + nvinfer1::ITensor** output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name = ""); - // Converts 'input' into 'tensor' with shape specified by 'dims' (which - // doesn't contain the batch dimension). + // Converts 'input' of 'node_def' into 'tensor' with shape specified by 'dims' + // (which doesn't contain the batch dimension). // // If validation_only is true, it doesn't do the conversion but only do some // minimum validation for the eligibility of the conversion, and *tensor will @@ -530,7 +535,9 @@ class Converter { Status PrepareTensorForShape(const TRT_TensorOrWeights& input, const nvinfer1::Dims& dims, const bool validation_only, - nvinfer1::ITensor** tensor); + nvinfer1::ITensor** tensor, + const NodeDef& node_def, + absl::optional op_instance = absl::nullopt); // Reshapes a dynamic shape tensor by removing or adding dimensions of size 1, // and/or permuting the dimensions. The new shape is derived from the shape of @@ -575,12 +582,14 @@ class Converter { Status DynamicReshape(nvinfer1::ITensor* input, std::vector> slices, OpConverterParams* params, nvinfer1::ITensor** output, - std::vector size_for_added_dims = {}); + std::vector size_for_added_dims = {}, + absl::optional op_instance = absl::nullopt); // Inserts a singleton dimension at axis for a dynamic shape tensor. Status DynamicExpandDims(nvinfer1::ITensor* input, const nvinfer1::Dims& dims, int axis, OpConverterParams* params, - nvinfer1::ITensor** output); + nvinfer1::ITensor** output, + absl::optional op_instance = absl::nullopt); // Helper function to add a squeeze op to the network. // @@ -667,6 +676,10 @@ class Converter { // acceptable by TRT. int batch_size_ = -1; + // Assign a ID to each constant layer we create, so that we can assign a + // unique name to the layer. + int next_constant_layer_id_ = 0; + friend class ConverterTest; friend class OpConverterTest; }; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 72348c3cede..86e6f0dd345 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -18,6 +18,7 @@ limitations under the License. 
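The next_constant_layer_id_ member declared in the header above exists only to make constant-layer names unique: CreateConstantLayer has no node name to anchor the layer name on, so it stamps each constant with a running counter. A tiny illustrative stand-in (the ConstantNamer class is hypothetical; the real code routes the counter through SetLayerName as shown earlier):

    #include <iostream>
    #include <string>

    // Hypothetical stand-in for next_constant_layer_id_: every constant layer
    // gets a fresh numeric suffix, so constants have unique names even though
    // they are not tied to a single TF node name.
    class ConstantNamer {
     public:
      std::string NextName() {
        return "_tftrt_constant_-" + std::to_string(next_constant_layer_id_++);
      }

     private:
      int next_constant_layer_id_ = 0;
    };

    int main() {
      ConstantNamer namer;
      std::cout << namer.NextName() << "\n";  // _tftrt_constant_-0
      std::cout << namer.NextName() << "\n";  // _tftrt_constant_-1
      return 0;
    }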
#include #include #include +#include #include #include @@ -203,6 +204,23 @@ void ExpectTrtDimsEqualsArray(const std::vector& lhs, << " actual: " << DebugString(rhs); } +void ExpectTrtLayerNames(absl::Span names, + nvinfer1::INetworkDefinition* network) { + EXPECT_EQ(network->getNbLayers(), names.size()); + + for (int i = 0; i < network->getNbLayers(); i++) { + auto layer = network->getLayer(i); + EXPECT_EQ(layer->getName(), names[i]); + } +} + +void VerifyTrtLayerNameNotEmpty(nvinfer1::INetworkDefinition* network) { + for (int i = 0; i < network->getNbLayers(); i++) { + auto layer = network->getLayer(i); + EXPECT_NE(layer->getName(), nullptr); + } +} + Matcher> ArrayFloatNear(const std::vector& values, float max_abs_error = 1e-5, bool nan_sensitive = false) { @@ -803,6 +821,8 @@ TEST_F(ConverterTest, ConvertNode) { TF_EXPECT_OK(GetTensorOrWeights("my_op:1", &actual_output_2)); EXPECT_EQ(&output_tensors[1], actual_output_2.tensor()); EXPECT_EQ(125, actual_output_2.tensor()->getDimensions().d[0]); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, AddAndGetInputs) { @@ -832,6 +852,8 @@ TEST_F(ConverterTest, AddAndGetInputs) { ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()); ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()); ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, RenameAndMarkOutputTensors) { @@ -880,30 +902,33 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) { } EXPECT_EQ("my_output", string(output_tensors[0]->getName())); EXPECT_EQ("my_output_1", string(output_tensors[1]->getName())); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, TransposeTensor) { nvinfer1::ITensor* input_tensor = converter_->network()->addInput( "", nvinfer1::DataType::kFLOAT, GetTestDims({2, 3, 5})); nvinfer1::ITensor* output_tensor = nullptr; - + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); // Rank doesn't match. ExpectStatus( - converter_->TransposeTensor(input_tensor, {0, 1}, "Bad perm", - &output_tensor), + converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor, + dummy_node_def, "sub1"), error::INVALID_ARGUMENT, "Rank of perm for transpose does not match with that of the input"); // Transpose at batch dimension. - ExpectStatus(converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, - "Batch perm", &output_tensor), - error::UNIMPLEMENTED, - "Transpose at batch dimension is not supported."); + ExpectStatus( + converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor, + dummy_node_def, "sub2"), + error::UNIMPLEMENTED, "Transpose at batch dimension is not supported."); // OK. 
- TF_EXPECT_OK(converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, "OK", - &output_tensor)); + TF_EXPECT_OK(converter_->TransposeTensor( + input_tensor, {0, 3, 1, 2}, &output_tensor, dummy_node_def, "sub3")); ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()); + ExpectTrtLayerNames({"dummy_op-sub3"}, converter_->network()); } void TestPrepareTensorForShape( @@ -922,9 +947,11 @@ void TestPrepareTensorForShape( } nvinfer1::ITensor* output_tensor = nullptr; + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); for (bool validation_only : {false, true}) { const Status status = converter->PrepareTensorForShape( - input, GetTestDims(reshape_dims), validation_only, &output_tensor); + input, GetTestDims(reshape_dims), validation_only, &output_tensor, + dummy_node_def); if (expected_code == error::OK) { TF_EXPECT_OK(status); if (validation_only) { @@ -978,6 +1005,8 @@ TEST_F(ConverterTest, PrepareTensorForShape) { /*input_is_tensor=*/false, converter_.get(), weight_store_, error::INVALID_ARGUMENT, "Shape is not fully defined"); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, MaybeUpdateBatchSize) { @@ -1051,6 +1080,8 @@ TEST_F(ConverterTest, ProvideQuantizationRange) { // Symmetric range converter_->ProvideQuantizationRange(&fake_tensor, -6.123f, 6.123f); EXPECT_EQ(6.123f, quantization_ranges()[&fake_tensor]); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { @@ -1077,6 +1108,8 @@ TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { EXPECT_EQ(infer_3.getDynamicRange(), 5.0f); EXPECT_EQ(not_infer.getDynamicRange(), 100.0f); #endif + + VerifyTrtLayerNameNotEmpty(int8_converter->network()); } TEST_F(ConverterTest, PropagateQuantizationRanges) { @@ -1099,6 +1132,8 @@ TEST_F(ConverterTest, PropagateQuantizationRanges) { EXPECT_EQ(5.0f, ranges[&infer[i]]); } EXPECT_EQ(ranges.count(¬_infer), 0); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, GetTrtBroadcastShape) { @@ -1202,6 +1237,8 @@ TEST_F(ConverterTest, GetTrtBroadcastShape) { "(tensor #dims 4 vs broadcast #dims 5)"); symmetric_test({2, 3}, {7, 5}, kIsTensor, kIsTensor, {}, {}, error::INVALID_ARGUMENT, "Infeasible broadcast scheme"); + + VerifyTrtLayerNameNotEmpty(converter_->network()); } TEST_F(ConverterTest, CreateConstantLayer) { @@ -1216,6 +1253,8 @@ TEST_F(ConverterTest, CreateConstantLayer) { << DebugString(tensor->getType()); ExpectTrtDimsEqualsArray({3, 10}, tensor->getDimensions()); } + + VerifyTrtLayerNameNotEmpty(converter_->network()); } class ConvertGraphDefToEngineTest : public ::testing::Test { @@ -1575,6 +1614,9 @@ class OpConverterTest : public ::testing::Test { const char* expected_msg_substr = nullptr) { ExpectStatus(converter_->ConvertNode(node->def()), expected_code, expected_msg_substr); + if (expected_code == error::OK) { + VerifyTrtLayerNameNotEmpty(converter_->network()); + } } // Helper method to run both validation and conversion, when the expected @@ -1709,12 +1751,12 @@ class ParameterizedOpConverterTestBase std::tuple> { public: ParameterizedOpConverterTestBase() - : trt_mode(std::get<0>(GetParam())), - tf_type(std::get<1>(GetParam())), - converter_precision(std::get<2>(GetParam())) {} + : trt_mode_(std::get<0>(GetParam())), + tf_type_(std::get<1>(GetParam())), + converter_precision_(std::get<2>(GetParam())) {} void Reset() { - OpConverterTest::Reset(converter_precision, trt_mode); + OpConverterTest::Reset(converter_precision_, trt_mode_); input_data_.clear(); } 
@@ -1750,7 +1792,7 @@ class ParameterizedOpConverterTestBase if (!partial_input_shape_dims.empty()) { partial_shape = partial_input_shape_dims; } else { - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // In dynamic shape mode we make all dims unknown. partial_shape = std::vector(dims.size(), -1); } else { @@ -1776,7 +1818,7 @@ class ParameterizedOpConverterTestBase void AddTestTensor(const string& name, const std::vector& dims, const std::vector& values = {}, const std::vector& partial_input_shape_dims = {}) { - AddTestTensor(name, dims, tf_type, values, partial_input_shape_dims); + AddTestTensor(name, dims, tf_type_, values, partial_input_shape_dims); } // Builds and runs the converted network. Checks output tensor shape. Tests @@ -1796,7 +1838,7 @@ class ParameterizedOpConverterTestBase TensorShapeUtils::MakeShape(expected_output_dims[i], &shape)); string out_name = (n_output == 1) ? name : StrCat(name, ":", i); DataType out_tf_type = - out_tf_types.size() > i ? out_tf_types[i] : tf_type; + out_tf_types.size() > i ? out_tf_types[i] : tf_type_; InputOutputData data{ out_name, ConstructTensor(shape.num_elements(), 0, out_tf_type)}; output_data.push_back(data); @@ -1840,9 +1882,9 @@ class ParameterizedOpConverterTestBase } protected: - const TrtTestMode trt_mode; - const DataType tf_type; - const TrtPrecisionMode converter_precision; + const TrtTestMode trt_mode_; + const DataType tf_type_; + const TrtPrecisionMode converter_precision_; DataVec input_data_; }; @@ -2075,7 +2117,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { 37.342354, 41.013527, 30.9738, 34.469433, 45.018955, 48.59309, 59.369415, 63.04059}; for (auto get_node_def : get_node_def_vec) { - NodeDef tmp_node_def = get_node_def(tf_type, "NCHW", true, 0); + NodeDef tmp_node_def = get_node_def(tf_type_, "NCHW", true, 0); std::string op_name = tmp_node_def.op(); std::vector test_param{ {"NHWC", 0, false, 0, @@ -2097,7 +2139,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { errors::Unimplemented(StrCat("The input \"variance\" for ", op_name, " must be a constant, at my_batchnorm"))}, {"NCHW", 0, false, 0.01}}; // The last one is the only test that runs. - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { test_param.push_back( {"NCHW", 0, false, 0.01, errors::InvalidArgument( @@ -2107,7 +2149,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { for (auto p : test_param) { Reset(); NodeDef node_def = - get_node_def(tf_type, p.data_format, p.is_training, p.epsilon); + get_node_def(tf_type_, p.data_format, p.is_training, p.epsilon); for (int i = 0; i < node_input.size(); i++) { if (i == 0 || i == p.tensor_input_idx) { // The first input (x) is always added as a tensor, and it hase shape @@ -2126,7 +2168,7 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { // the first arg is a tensor. TODO(tfeher) Check if one can relax this // restriction. Status expected_status = - (i != 0 && trt_mode == TrtTestMode::kImplicitBatch) + (i != 0 && trt_mode_ == TrtTestMode::kImplicitBatch) ? 
errors::InvalidArgument( StrCat("Batch size doesn't match for tensor ", node_input[i].name, @@ -2134,19 +2176,19 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { "converter batch size: 3 vs 2")) : Status::OK(); std::vector partial_input_shape; - if (i == 0 && trt_mode == TrtTestMode::kDynamicShape && + if (i == 0 && trt_mode_ == TrtTestMode::kDynamicShape && !p.keep_channel_unknown) { // keep channel dim static (known) partial_input_shape.resize(4, -1); partial_input_shape[1] = node_input[i].dims[1]; } - AddTestTensor(node_input[i].name, node_input[i].dims, tf_type, + AddTestTensor(node_input[i].name, node_input[i].dims, tf_type_, node_input[i].val, partial_input_shape, expected_status); } else { AddTestWeights(node_input[i].name, node_input[i].dims, - node_input[i].val, tf_type); + node_input[i].val, tf_type_); } } TestOpConverter("my_batchnorm", node_def, node_input[0].dims, @@ -2154,12 +2196,12 @@ TEST_P(OpConverterTest1, ConvertFusedBatchNorm) { ArrayFloatNear(expected_output)); } } -} // namespace convert +} TEST_P(OpConverterTest1, ConvertTranspose) { // Get the NodeDef for Transpose. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto transpose = ops::Transpose(s.WithOpName("my_transpose"), input, weights); const NodeDef& node_def = transpose.operation.node()->def(); @@ -2187,13 +2229,13 @@ TEST_P(OpConverterTest1, ConvertTranspose) { {}, {3, 2, 1, 1}, {3, 2, 1, 0}, - (trt_mode == TrtTestMode::kImplicitBatch) + (trt_mode_ == TrtTestMode::kImplicitBatch) ? Status(error::UNIMPLEMENTED, "Transpose at batch dimension is not supported") : Status::OK()}, TestParamBase{{1, 1, 2, 3}, {}, {1, 3, 1, 2}, {0, 3, 1, 2}}, }; - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // Dynamic shape tests where some shapes are known test_params.push_back(TestParamBase{ {1, 1, 2, 3}, {-1, 1, 2, -1}, {1, 3, 1, 2}, {0, 3, 1, 2}}); @@ -2317,19 +2359,22 @@ TEST_F(OpConverterTest, ConvertReshape) { TEST_P(OpConverterTest1, ConvertShape) { // Get the NodeDef for Shape op. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto shape = ops::Shape(s.WithOpName("my_shape"), input); const NodeDef& node_def = shape.operation.node()->def(); Status conversion_status = - (trt_mode == TrtTestMode::kImplicitBatch) + (trt_mode_ == TrtTestMode::kImplicitBatch) ? errors::Unimplemented( "Shape is only supported for explicit batch mode.") : Status::OK(); std::vector test_params = { - TestParamBase{{1, 2, 3}, {}, {3}, {}, conversion_status}, - // Add input as weight (we use non empty param ({1}) to trigger this). - TestParamBase{{1, 2, 3}, {}, {3}, {1}, conversion_status}, +// TODO(b/166274212): Enable the test parameter for TensorRT 7.1.3. +#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + TestParamBase{{1, 2, 3}, {}, {3}, {}, conversion_status}, +#endif + // Add input as weight (we use non empty param ({1}) to trigger this). + TestParamBase{{1, 2, 3}, {}, {3}, {1}, conversion_status}, }; auto input_is_weight = [](const TestParamBase p) { return !p.param.empty(); }; @@ -2343,7 +2388,7 @@ TEST_P(OpConverterTest1, ConvertShape) { // we use for the unit test have no actual input tensor when it is converted // to a TensorRT network. 
int n_elements = 0; - if (input_is_weight(p) || trt_mode != TrtTestMode::kExplicitBatch) { + if (input_is_weight(p) || trt_mode_ != TrtTestMode::kExplicitBatch) { // Calculate the number of elements for adding input data. n_elements = std::accumulate(p.input_dims.begin(), p.input_dims.end(), 1, std::multiplies()); @@ -2352,7 +2397,7 @@ TEST_P(OpConverterTest1, ConvertShape) { if (!input_is_weight(p)) { AddTestTensor("input", p.input_dims, input_val); } else { - AddTestWeights("input", p.input_dims, input_val, tf_type); + AddTestWeights("input", p.input_dims, input_val, tf_type_); } TestOpConverter("my_shape", node_def, p.expected_output_dims, p.status, p.runtime_status, ElementsAreArray(p.input_dims), @@ -2617,7 +2662,7 @@ TEST_P(OpConverterTest2, ConvertBiasAdd) { for (const string& data_format : {"NHWC", "NCHW"}) { for (const int trt_input_rank : {1, 2, 3, 4}) { Reset(); - NodeDef node_def = get_biasadd_nodedef(data_format, tf_type); + NodeDef node_def = get_biasadd_nodedef(data_format, tf_type_); // Add input, dims_array will be like {2, 1, ..., 1, 3} std::vector dims_array(trt_input_rank + 1, 1); @@ -2639,7 +2684,7 @@ TEST_P(OpConverterTest2, ConvertBiasAdd) { for (int i = 0; i < channel_size; ++i) { bias[i] = i + 1; // bias will be {1, 2, 3, ...} } - AddTestWeights("weights", {channel_size}, bias, tf_type); + AddTestWeights("weights", {channel_size}, bias, tf_type_); // Build and run the engine. std::vector output_data; @@ -2675,7 +2720,7 @@ NodeDef GetBinaryOpNodeDef(DataType dtype) { TEST_P(OpConverterTest2, ConvertBinary) { { AttrValue dtype; - dtype.set_type(tf_type); + dtype.set_type(tf_type_); // Both inputs are weights. Reset(); NodeDef node_def = @@ -2720,19 +2765,19 @@ TEST_P(OpConverterTest2, ConvertBinary) { if (!op_test_info.count(op_name)) { FAIL() << "Binary op test map does not contain op " << op_name; } - NodeDef node_def = op_test_info[op_name].first(tf_type); + NodeDef node_def = op_test_info[op_name].first(tf_type_); std::vector input_names; std::vector> input_dims; std::vector> input_values; if (operand_1_is_tensor) { AddTestTensor("input1", {2, 1, 2}, {3, 6, 3, 6}); } else { - AddTestWeights("input1", {1, 2}, std::vector{3, 6}, tf_type); + AddTestWeights("input1", {1, 2}, std::vector{3, 6}, tf_type_); } if (operand_2_is_tensor) { AddTestTensor("input2", {2, 2, 1}, {2, 3, 2, 3}); } else { - AddTestWeights("input2", {2, 1}, std::vector{2, 3}, tf_type); + AddTestWeights("input2", {2, 1}, std::vector{2, 3}, tf_type_); } TestOpConverter("my_binary", node_def, {2, 2, 2}, Status::OK(), Status::OK(), @@ -2939,10 +2984,10 @@ TEST_P(OpConverterTest2, ConvertSquare) { // Input is weights, should fail. 
Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); - AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type); + AddTestWeights("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6}, tf_type_); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Square must be a tensor, at my_square"); @@ -2951,7 +2996,7 @@ TEST_P(OpConverterTest2, ConvertSquare) { Reset(); Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto square = ops::Square(s.WithOpName("my_square"), input); NodeDef node_def = square.operation.node()->def(); @@ -2964,7 +3009,7 @@ TEST_P(OpConverterTest2, ConvertSquare) { inputs[i] = value; expected_outputs[i] = value * value; } - AddTestTensor("input", {1, 1, 20}, tf_type, inputs); + AddTestTensor("input", {1, 1, 20}, tf_type_, inputs); TestOpConverter("my_square", node_def, {1, 1, 20}, Status::OK(), Status::OK(), ArrayFloatNear(expected_outputs, 0)); @@ -3091,7 +3136,7 @@ TEST_P(OpConverterTest1, ConvertActivation) { { // Input is weights, should fail. Reset(); - const NodeDef& node_def = CreateUnaryOp(tf_type); + const NodeDef& node_def = CreateUnaryOp(tf_type_); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, @@ -3148,7 +3193,7 @@ TEST_P(OpConverterTest1, ConvertActivation) { FAIL() << "Activation op test map does not contain op " << op_name; } Reset(); - NodeDef node_def = op_map[op_name].first(tf_type); + NodeDef node_def = op_map[op_name].first(tf_type_); const std::vector input = {-100, -2, -1, 0, 1, 88}; AddTestTensor("input", p.input_dims, input); @@ -3176,7 +3221,7 @@ TEST_P(OpConverterTest1, ConvertActivation) { TEST_P(OpConverterTest1, ConvertExpandDims) { // Get the NodeDef for ExpandDims. Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type_); auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32); auto expanddims = ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights); @@ -3204,7 +3249,7 @@ TEST_P(OpConverterTest1, ConvertExpandDims) { {}, {1, 1, 1, 2, 3}, {0}, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status(error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the " "batch dimension, at my_expanddims") @@ -3213,7 +3258,7 @@ TEST_P(OpConverterTest1, ConvertExpandDims) { {}, {1, 1, 1, 2, 3}, {-5}, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status(error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the " "batch dimension, at my_expanddims") @@ -3251,7 +3296,7 @@ TEST_P(OpConverterTest1, ConvertExpandDims) { } TEST_P(OpConverterTest1, ConvertSqueeze) { - const bool use_implicit_batch = (trt_mode == TrtTestMode::kImplicitBatch); + const bool use_implicit_batch = (trt_mode_ == TrtTestMode::kImplicitBatch); // Get the NodeDef for Squeeze. 
auto get_squeeze_nodedef = [](std::vector axes, DataType tf_type) -> NodeDef { @@ -3274,7 +3319,7 @@ TEST_P(OpConverterTest1, ConvertSqueeze) { {}, // input partial dims {2, 3}, // expected output dims {}, // axis - trt_mode == TrtTestMode::kExplicitBatch + trt_mode_ == TrtTestMode::kExplicitBatch ? Status::OK() : Status{error::UNIMPLEMENTED, "Squeeze is not implemented for empty squeeze_dims, at " @@ -3333,7 +3378,7 @@ TEST_P(OpConverterTest1, ConvertSqueeze) { "Dimension 2 with size 2 cannot be squeezed because it must be " "size 1, at my_squeeze"}}; - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // In this test we try to squeeze axis=2 which has size > 1. In dynamic // shape mode the converter sees only -1, so it cannot catch this error. squeeze_non_singleton.status = Status::OK(); // conversion status @@ -3348,7 +3393,7 @@ TEST_P(OpConverterTest1, ConvertSqueeze) { for (TestParamBase p : test_params) { SCOPED_TRACE(p); Reset(); - NodeDef node_def = get_squeeze_nodedef(p.param, tf_type); + NodeDef node_def = get_squeeze_nodedef(p.param, tf_type_); AddTestTensor("input", p.input_dims, {1, 2, 3, 4, 5, 6}, p.partial_input_dims); TestOpConverter("my_squeeze", node_def, p.expected_output_dims, p.status, @@ -4103,14 +4148,14 @@ TEST_F(OpConverterTest, ConvertSlice) { TEST_P(OpConverterTest1, ConvertConv2D) { // Get nodedef for Conv2D layer. - DataType tf_type_loc = tf_type; + DataType tf_type = tf_type_; auto get_conv2d_nodedef = - [tf_type_loc](std::vector strides = {1, 1, 1, 1}, - string padding = "SAME", string data_format = "NCHW", - std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + [tf_type](std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), tf_type_loc); - auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type_loc); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); ops::Conv2D::Attrs attrs = ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, @@ -4203,12 +4248,12 @@ TEST_P(OpConverterTest1, ConvertConv2D) { node_def, error::UNIMPLEMENTED, "Stride must be 1 for batch and channel dimensions, at my_conv2d"); } - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { Reset(); NodeDef node_def = get_conv2d_nodedef(); // Channel dim unknown, should fail. AddTestTensorWithTFDims("input", {-1, -1, -1, -1}, - TfDataTypeToTrt(tf_type)); + TfDataTypeToTrt(tf_type_)); AddTestWeights("weights", {1, 2, 1, 1}, {-1, 1}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4230,8 +4275,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { // Ok. std::vector ok_params = { -// TODO(b/162447069): Enable the test parameters for TRT 7.1.3.x. -#if !IS_TRT_VERSION_GE(7, 1, 3, 0) // Basic TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, @@ -4243,9 +4286,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { /*dilations=*/{1, 1, 1, 1}, /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, -#endif -// TODO(b/162448349): Enable the test parameters for TRT 7.1.3.x. 
-#if !IS_TRT_VERSION_GE(7, 1, 3, 0) // SAME padding (Asymmetric) TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, @@ -4268,9 +4308,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { /*dilations=*/{1, 1, 1, 1}, /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, -#endif -// TODO(b/162447069): Enable the test parameters for TRT 7.1.3.x. -#if !IS_TRT_VERSION_GE(7, 1, 3, 0) // NHWC TestParams{/*input_dims=*/{1, 2, 3, 1}, /*input=*/{0, 1, 2, 3, 3, 4}, @@ -4304,7 +4341,6 @@ TEST_P(OpConverterTest1, ConvertConv2D) { /*dilations=*/{1, 1, 1, 1}, /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, -#endif }; for (int i = 0; i < ok_params.size(); i++) { @@ -4313,15 +4349,15 @@ TEST_P(OpConverterTest1, ConvertConv2D) { get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, ok_params[i].dilations); std::vector partial_input_shape; - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // The channel dim cannot have unknown size, fix that. partial_input_shape.resize(ok_params[i].input_dims.size(), -1); int channel_id = (ok_params[i].data_format == "NCHW") ? 1 : 3; partial_input_shape[channel_id] = ok_params[i].input_dims[channel_id]; } - AddTestTensor("input", ok_params[i].input_dims, tf_type, ok_params[i].input, - partial_input_shape); + AddTestTensor("input", ok_params[i].input_dims, tf_type_, + ok_params[i].input, partial_input_shape); AddTestWeights("weights", ok_params[i].filter_dims, ok_params[i].filter); @@ -4848,7 +4884,7 @@ TEST_P(OpConverterTest1, ConvertPool) { for (int nDim : test_nDims) { // Input is weights, should fail. Reset(); - NodeDef node_def = get_pool_nodedef(tf_type, nDim); + NodeDef node_def = get_pool_nodedef(tf_type_, nDim); AddTestWeights("input", {1, 1, 1, 2, 3}, {1, 2, 3, 4, 5, 6}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, @@ -4957,7 +4993,7 @@ TEST_P(OpConverterTest1, ConvertPool) { for (bool is_max_pooling : {true, false}) { Reset(); NodeDef node_def = - get_pool_nodedef(tf_type, nDim, ksize, strides, p.padding, + get_pool_nodedef(tf_type_, nDim, ksize, strides, p.padding, data_format, is_max_pooling); AddTestTensor("input", input_dims, input); TestOpConverter("my_pool", node_def, expected_output_dims, Status::OK(), @@ -5019,7 +5055,7 @@ TEST_F(OpConverterTest, ConvertTopK) { TEST_P(OpConverterTest3, ConvertGather) { // Get the NodeDef for GatherV2. Scope s = Scope::NewRootScope(); - auto params = ops::Placeholder(s.WithOpName("params"), tf_type); + auto params = ops::Placeholder(s.WithOpName("params"), tf_type_); auto indices = ops::Placeholder(s.WithOpName("indices"), DT_INT32); auto axis = ops::Placeholder(s.WithOpName("axis"), DT_INT32); auto gather = ops::GatherV2(s.WithOpName("my_gather"), params, indices, axis); @@ -5027,7 +5063,7 @@ TEST_P(OpConverterTest3, ConvertGather) { { // Axis is a tensor, should fail. Reset(); - AddTestTensor("params", {1, 1, 2, 3}, tf_type, {}); + AddTestTensor("params", {1, 1, 2, 3}, tf_type_, {}); AddTestTensor("indices", {1, 2}, DT_INT32, {}); AddTestTensor("axis", {1}, DT_INT32, {}); RunValidationAndConversion( @@ -5072,7 +5108,7 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2, 1, 1, 3}, /*expected_output=*/{4, 5, 6, 1, 2, 3}, /*params_is_tensor=*/true, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? 
Status{error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the" " batch dimension, at my_gather"} @@ -5085,7 +5121,7 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2, 1, 2, 1}, /*expected_output=*/{3, 1, 6, 4}, /*params_is_tensor=*/true, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "Indices must have a batch size of 1 when params" " is a tensor."} @@ -5099,7 +5135,7 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2, 1, 2}, /*expected_output=*/{2, 3, 5, 6}, /*params_is_tensor=*/false, - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "The input axis must be zero when params is a" " weight."} @@ -5112,13 +5148,13 @@ TEST_P(OpConverterTest3, ConvertGather) { /*expected_output_shape=*/{2}, /*expected_output=*/{2, 4}, /*params_is_tensor=*/true, - trt_mode == TrtTestMode::kImplicitBatch // conversion_status + trt_mode_ == TrtTestMode::kImplicitBatch // conversion_status ? Status{error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the " "batch dimension, at my_gather"} : Status::OK(), - Status::OK(), // runtime_status - trt_mode == TrtTestMode::kImplicitBatch // add_index_status + Status::OK(), // runtime_status + trt_mode_ == TrtTestMode::kImplicitBatch // add_index_status ? Status{error::INVALID_ARGUMENT, "Batch size doesn't match for tensor indices: " "Provided batch size does not match converter " @@ -5233,7 +5269,7 @@ TEST_P(OpConverterTest3, ConvertGather) { if (p.params_is_tensor) { AddTestTensor("params", p.params_shape, params_input); } else { - AddTestWeights("params", p.params_shape, params_input, tf_type); + AddTestWeights("params", p.params_shape, params_input, tf_type_); } AddTestTensor("indices", p.indices_shape, DT_INT32, p.indices, {}, p.add_index_status); @@ -5273,7 +5309,7 @@ TEST_P(OpConverterTest1, ConvertReduce) { { // Input is weights, should fail. Reset(); - const NodeDef node_def = CreateReduceOp(tf_type, false); + const NodeDef node_def = CreateReduceOp(tf_type_, false); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); AddTestWeights("axis", {1}, {1}); RunValidationAndConversion( @@ -5283,7 +5319,7 @@ TEST_P(OpConverterTest1, ConvertReduce) { { // Axis is weights, should fail. Reset(); - const NodeDef node_def = CreateReduceOp(tf_type, false); + const NodeDef node_def = CreateReduceOp(tf_type_, false); AddTestTensor("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); AddTestTensor("axis", {1}, DT_INT32, {1}); RunValidationAndConversion( @@ -5343,7 +5379,7 @@ TEST_P(OpConverterTest1, ConvertReduce) { for (auto p : params) { SCOPED_TRACE(StrCat(op.name, keep_dims ? "keep_dims" : "")); Reset(); - NodeDef node_def = op.get_node(tf_type, keep_dims); + NodeDef node_def = op.get_node(tf_type_, keep_dims); AddTestTensor("input", p.input_dims, p.input_values); AddTestWeights("axis", {static_cast(p.axis.size())}, @@ -5363,7 +5399,7 @@ TEST_P(OpConverterTest1, ConvertReduce) { int ax_positive = ax >= 0 ? ax : ax + rank; // Zero marks elements that we will remove later. expected_output_dims[ax_positive] = keep_dims ? 1 : 0; - if (trt_mode == TrtTestMode::kImplicitBatch && + if (trt_mode_ == TrtTestMode::kImplicitBatch && (ax == 0 || ax == -rank)) { p.conversion_status = errors::Unimplemented( "TensorRT does not allow manipulation of the batch " @@ -5399,7 +5435,7 @@ TEST_P(OpConverterTest1, ConvertUnary) { { // Input is weights, should fail. 
Reset(); - const NodeDef node_def = CreateUnaryOp(tf_type); + const NodeDef node_def = CreateUnaryOp(tf_type_); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, @@ -5455,7 +5491,7 @@ TEST_P(OpConverterTest1, ConvertUnary) { if (!op_map.count(op_name)) { FAIL() << "Unary op test map does not contain op " << op_name; } - NodeDef node_def = op_map[op_name].first(tf_type); + NodeDef node_def = op_map[op_name].first(tf_type_); // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for // now. Need to find a better way to express input and output types. @@ -5463,7 +5499,7 @@ TEST_P(OpConverterTest1, ConvertUnary) { // TODO(tfeher): improve tests by defining an expected output data type and // check that. Currently only the shape and values of the output are // checked. - DataType input_tf_type = op_name == "Cast" ? DT_HALF : tf_type; + DataType input_tf_type = op_name == "Cast" ? DT_HALF : tf_type_; std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; AddTestTensor("input", p.input_dims, input_tf_type, input_values); @@ -6030,7 +6066,7 @@ TEST_P(OpConverterTest2, ConvertPack) { /*axis=*/1, /*expected_output_dims=*/{1, 2, 2, 3}, /*expected_output=*/InitTestVector(12), - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "The input \"values_1\" for Pack must be a tensor, at " "my_pack"} @@ -6056,7 +6092,7 @@ TEST_P(OpConverterTest2, ConvertPack) { /*axis=*/-4, /*expected_output_dims=*/{2, 1, 2, 3}, /*expected_output=*/InitTestVector(12), - trt_mode == TrtTestMode::kImplicitBatch + trt_mode_ == TrtTestMode::kImplicitBatch ? Status{error::UNIMPLEMENTED, "TensorRT does not allow manipulation of the batch " "dimension, at my_pack"} @@ -6116,7 +6152,7 @@ TEST_P(OpConverterTest2, ConvertPack) { }, }; // Inputs have inconsistent shapes, should fail. - if (trt_mode != TrtTestMode::kDynamicShape) { + if (trt_mode_ != TrtTestMode::kDynamicShape) { params.push_back(TestParams{ /*input_shapes=*/{{1, 2, 3}, {1, 3, 2}}, /*partial_input_shapes=*/{{}, {}}, @@ -6136,7 +6172,7 @@ TEST_P(OpConverterTest2, ConvertPack) { // TODO(tfeher) Add dynamic shapes test once TRT handles shape error // decently } - if (trt_mode == TrtTestMode::kDynamicShape) { + if (trt_mode_ == TrtTestMode::kDynamicShape) { // Test with mixed dynamic / static shape input tensors params.push_back( TestParams{/*input_shapes=*/{{1, 2, 3}, {1, 2, 3}}, @@ -6152,14 +6188,14 @@ TEST_P(OpConverterTest2, ConvertPack) { const int num_inputs = p.input_shapes.size(); EXPECT_EQ(num_inputs, p.input_values.size()); - NodeDef node_def = GetPackNodeDef(tf_type, num_inputs, p.axis); + NodeDef node_def = GetPackNodeDef(tf_type_, num_inputs, p.axis); // Create inputs. for (int j = 0; j < num_inputs; ++j) { if (j == 1 && p.input_1_is_weight) { AddTestWeights(StrCat("values_", j), p.input_shapes[j], - p.input_values[j], tf_type); + p.input_values[j], tf_type_); } else { - AddTestTensor(StrCat("values_", j), p.input_shapes[j], tf_type, + AddTestTensor(StrCat("values_", j), p.input_shapes[j], tf_type_, p.input_values[j], p.partial_input_shapes[j]); } } @@ -6687,7 +6723,7 @@ TEST_P(OpConverterTest2, ConvertSquaredDifference) { { // Input is a weight, should fail. 
Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type_); AddTestWeights("x", {1, 2, 3}, {1, 2, 3, 4, 5, 6}); AddTestTensor("y", {1, 1, 2, 3}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, @@ -6714,7 +6750,7 @@ TEST_P(OpConverterTest2, ConvertSquaredDifference) { /*value_y=*/std::vector(7 * 5, 0), /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/common_input, - trt_mode == TrtTestMode::kDynamicShape + trt_mode_ == TrtTestMode::kDynamicShape ? Status::OK() : errors::InvalidArgument("Infeasible broadcast scheme"), errors::Internal( @@ -6740,7 +6776,7 @@ TEST_P(OpConverterTest2, ConvertSquaredDifference) { for (auto p : params) { Reset(); - NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type); + NodeDef node_def = GetSquaredDifferenceNodeDef(tf_type_); AddTestTensor("x", p.dims_x, p.value_x); AddTestTensor("y", p.dims_y, p.value_y); TestOpConverter("my_squared_diff", node_def, p.expected_output_dims, @@ -6776,9 +6812,7 @@ template void TestConvertResize(OpConverterTest* test) { typedef typename EnumToDataType::Type CType; - std::vector> params{ -// TODO(b/162442839): Enable the test parameters for TRT 7.1.3.x. -#if !IS_TRT_VERSION_GE(7, 1, 3, 0) + std::vector> params { { /*input_dims=*/{1, 2, 1}, // H, W, C /*output_resize_dims=*/{2, 3}, // H_out, W_out @@ -6790,7 +6824,6 @@ void TestConvertResize(OpConverterTest* test) { /*expected_bilinear_output_values=*/ CastTestVector({2.0f, 0.f, -1.0f, 2.0f, 0.f, -1.0f}), }, -#endif { /*input_dims=*/{1, 2, 1}, // H, W, C /*output_resize_dims=*/{2, 3}, // H_out, W_out @@ -6804,6 +6837,13 @@ void TestConvertResize(OpConverterTest* test) { } }; +// This use case is not supported as of TRT version 7.1 +#if IS_TRT_VERSION_GE(7, 1, 0, 0) + if (std::is_same::value) { + params.erase(params.begin()); + } +#endif + for (int i = 0; i < params.size(); ++i) { test->Reset(); // Create resize node. @@ -6846,7 +6886,7 @@ TEST_F(OpConverterTest, ConvertResize) { // First input is weight, should fail. Reset(); NodeDef node_def = - MakeResizeNodeDef("my_resize", DT_FLOAT, false); + MakeResizeNodeDef("my_resize", DT_FLOAT, true); AddTestWeights("input", {1, 2}, {1, 2}); AddTestWeights("size", {1, 2}, {1, 2}); RunValidationAndConversion( @@ -6858,7 +6898,7 @@ TEST_F(OpConverterTest, ConvertResize) { // output dimension is a tensor, should fail. Reset(); NodeDef node_def = - MakeResizeNodeDef("my_resize", DT_FLOAT, false); + MakeResizeNodeDef("my_resize", DT_FLOAT, true); AddTestTensor("input", {1, 2}); AddTestTensor("size", {1, 2}); RunValidationAndConversion( diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc index 1337a733f91..84f25d355ae 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc @@ -478,7 +478,7 @@ absl::Span GetInputsToDeterminateBatchSize( "Add", "AddV2", "Mul", - "Sub" + "Sub", "Div", "FloorDiv", "RealDiv", @@ -646,10 +646,12 @@ ClusterBatchSize GetClusterBatchSizeForNode( return cluster_batch_size; } + // As shape inference cannot provide any useful information about the batch + // size, we keep it as missing. 
if (!graph_properties || !graph_properties->HasInputProperties(node->name())) { VLOG(3) << "doesn't have input property"; - return cluster_batch_size.SetBatchSizeValue(-1); + return cluster_batch_size; } const std::vector& input_properties = @@ -660,7 +662,8 @@ ClusterBatchSize GetClusterBatchSizeForNode( const TensorShapeProto* leading_shape = optional_leading_shape.value(); DCHECK(!leading_shape->unknown_rank() && leading_shape->dim_size() >= 2); - return cluster_batch_size.SetBatchSizeValue(leading_shape->dim(0).size()); + VLOG(3) << "has batch size " << leading_shape->dim(0).size(); + return cluster_batch_size.SetBatchSize(leading_shape->dim(0).size()); } void AddSegmentForNode(const grappler::GraphProperties* graph_properties, @@ -668,12 +671,28 @@ void AddSegmentForNode(const grappler::GraphProperties* graph_properties, SimpleNode* node, const DeviceNameUtils::ParsedName& device_name, bool use_implicit_batch) { - segments->emplace_back( - node, + ClusterProperty property( GetClusterBatchSizeForNode(graph_properties, node == nullptr ? nullptr : node->tf_node(), use_implicit_batch), device_name); + segments->emplace_back(node, std::move(property)); +} + +bool OpBatchSizeExceedMaximumBatchSize( + const grappler::GraphProperties* graph_properties, const Node* node, + bool use_implicit_batch, absl::optional maximum_batch_size) { + ClusterBatchSize cluster_batch_size = + GetClusterBatchSizeForNode(graph_properties, node, use_implicit_batch); + // If the batch size is dynamic, then the negative dynamic batch size + // identifier shall never be larger than the positive max batch size. + if (cluster_batch_size.HasBatchSize() && maximum_batch_size.has_value() && + cluster_batch_size.GetBatchSize() > maximum_batch_size.value()) { + VLOG(2) << "OP batch size " << cluster_batch_size.GetBatchSize() + << " max_batch_size " << maximum_batch_size.value(); + return true; + } + return false; } } // namespace @@ -690,6 +709,10 @@ Status SegmentGraph(const Graph* tf_graph, "Explicit batch mode should allow dynamic non-batch dimensions"); } + if (options.use_implicit_batch && !options.maximum_batch_size.has_value()) { + return errors::Internal("Implicit batch mode requires maximum_batch_size"); + } + if (!options.allow_dynamic_non_batch_dim && !graph_properties) { return errors::Internal( "Need graph propertities to disallow dynamic non-batch dimensions"); @@ -768,6 +791,14 @@ Status SegmentGraph(const Graph* tf_graph, << "(Op type: " << node->tf_node()->type_string() << "), " << "(Op name: " << node->name() << ")"; exclude_node("Denylisted with the env var TF_TRT_OP_DENYLIST"); + } else if (OpBatchSizeExceedMaximumBatchSize( + graph_properties, node->tf_node(), + options.use_implicit_batch, options.maximum_batch_size)) { + LOG_WARNING_WITH_PREFIX + << "Implicit batch mode requires OP batch size not larger than " + << "the converter maximum batch size: " + << "(Op name: " << node->name() << ")"; + exclude_node("OP batch size too large"); } else { VLOG(2) << "Accepted as a TF-TRT candidate, " << "(Op type: " << node->tf_node()->type_string() << "), " @@ -819,9 +850,9 @@ Status SegmentGraph(const Graph* tf_graph, // step until no output edges can be further contracted. This is because // contracting an output edge may unblock new edges for contracting. 
    ClusterBatchSize expected_batch_size =
-        node_segments[node->id()].BatchSize();
+        node_segments[node->id()].Property().BatchSize();
     DeviceNameUtils::ParsedName expected_device_name =
-        node_segments[node->id()].DeviceName();
+        node_segments[node->id()].Property().DeviceName();
     VLOG(3) << "batch size " << expected_batch_size;
     while (true) {
       std::set contract_edges;
@@ -842,7 +873,7 @@ Status SegmentGraph(const Graph* tf_graph,
           continue;
         }
         // Out node must have compatible batch size.
-        ClusterBatchSize out_batch_size = out_cluster->BatchSize();
+        ClusterBatchSize out_batch_size = out_cluster->Property().BatchSize();
         ClusterBatchSize merged_batch_size = expected_batch_size;
         if (!merged_batch_size.MergeIfCompatible(out_batch_size)) {
           VLOG(3) << "... ... incompatible batch sizes "
@@ -852,7 +883,7 @@ Status SegmentGraph(const Graph* tf_graph,
         }
         const DeviceNameUtils::ParsedName& out_device_name =
-            out_cluster->DeviceName();
+            out_cluster->Property().DeviceName();
         absl::optional merged_device_name =
             MergeIfCompatible(expected_device_name, out_device_name);
         if (!merged_device_name.has_value()) {
@@ -898,11 +929,13 @@ Status SegmentGraph(const Graph* tf_graph,
       graph->RemoveEdge(r);
     }
   }
-  if (expected_batch_size != node_segments[node->id()].BatchSize()) {
+  if (expected_batch_size !=
+      node_segments[node->id()].Property().BatchSize()) {
     return errors::Internal(
         "expected batch size is not the same as the actual batch size");
   }
-  if (expected_device_name != node_segments[node->id()].DeviceName()) {
+  if (expected_device_name !=
+      node_segments[node->id()].Property().DeviceName()) {
     return errors::Internal(
         "expected device name is not the same as the actual device name");
   }
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.h b/tensorflow/compiler/tf2tensorrt/segment/segment.h
index 3f79983cfd2..bab6e089fa4 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.h
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include
 #include
+#include "absl/types/optional.h"
 #include "tensorflow/core/framework/graph.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
@@ -38,6 +39,9 @@ struct SegmentOptions {
   // Segment must contain at least this many nodes.
   int minimum_segment_size = 2;
   bool use_implicit_batch = true;
+  // The maximum batch size used to build the engines in the graph, when
+  // use_implicit_batch is true.
+  absl::optional maximum_batch_size = absl::nullopt;
   // When use_implicit_batch is false or when we are building dynamic engines,
   // we allow dynamic non-batch dimensions.
bool allow_dynamic_non_batch_dim = false; diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc index bf277328fe7..ee406c9743f 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc +++ b/tensorflow/compiler/tf2tensorrt/segment/segment_test.cc @@ -108,8 +108,9 @@ class SegmentTest : public ::testing::Test { segment_options_.allow_dynamic_non_batch_dim = true; } - void EnableImplicitBatchModeForStaticEngine() { + void EnableImplicitBatchModeForStaticEngine(int maximum_batch_size = 1000) { segment_options_.use_implicit_batch = true; + segment_options_.maximum_batch_size = maximum_batch_size; segment_options_.allow_dynamic_non_batch_dim = false; } @@ -487,7 +488,11 @@ TEST_F(SegmentTest, TwoChainsDiffBatchSizes) { const std::set all_nodes = {"const-scalar", "output-0", "output-1"}; EnableImplicitBatchModeForStaticEngine(); RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, - {{"output-0", "const-scalar"}}); + /*expected_segments=*/{{"output-0", "const-scalar"}}); + + EnableImplicitBatchModeForStaticEngine(1); + RunTest(&g, &static_graph_properties, all_nodes, all_nodes, all_nodes, + /*expected_segments=*/{}); } TEST_F(SegmentTest, SameRankImplicitBroadcastingStaticBatchSize) { diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.cc b/tensorflow/compiler/tf2tensorrt/segment/union_find.cc new file mode 100644 index 00000000000..9aa7783b637 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.cc @@ -0,0 +1,128 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/segment/union_find.h" + +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" +#include "tensorflow/core/lib/core/errors.h" + +#if GOOGLE_CUDA && GOOGLE_TENSORRT + +namespace tensorflow { +namespace tensorrt { +namespace segment { + +namespace { +template +inline bool CheckIfCompatible(const absl::optional& a, + const absl::optional& b) { + if (a.has_value() && b.has_value()) { + return *a == *b; + } + return true; +} + +template +inline bool UnifyValues(absl::optional& a, absl::optional& b) { + if (a.has_value()) { + b = a; + } else { + a = b; + } + return true; +} + +template +inline absl::optional MergeCompatible(const absl::optional& a, + const absl::optional& b) { + DCHECK(CheckIfCompatible(a, b)); + return a.has_value() ? 
a : b; +} + +} // namespace + +ClusterBatchSize::ClusterBatchSize() : batch_size_(absl::nullopt) {} + +bool ClusterBatchSize::operator==(const ClusterBatchSize& other) { + return batch_size_ == other.batch_size_; +} + +ClusterBatchSize& ClusterBatchSize::SetBatchSize(int batch_size) { + SetBatchSize(static_cast>(batch_size)); + return *this; +} + +ClusterBatchSize& ClusterBatchSize::SetBatchSize( + const absl::optional& batch_size) { + batch_size_ = MergeCompatible(batch_size_, batch_size); + return *this; +} + +bool ClusterBatchSize::HasBatchSize() const { return batch_size_.has_value(); } + +int ClusterBatchSize::GetBatchSize() const { + DCHECK(HasBatchSize()); + return batch_size_.value(); +} + +bool ClusterBatchSize::MergeIfCompatible(const ClusterBatchSize& other) { + if (!CheckIfCompatible(batch_size_, other.batch_size_)) { + return false; + } + SetBatchSize(other.batch_size_); + return true; +} + +string ClusterBatchSize::ToString() const { + string s; + absl::StrAppendFormat(&s, "batch_size=("); + if (HasBatchSize()) { + absl::StrAppendFormat(&s, "%d", GetBatchSize()); + } else { + absl::StrAppendFormat(&s, "?"); + } + absl::StrAppend(&s, ")"); + return s; +} + +ClusterProperty::ClusterProperty(const ClusterBatchSize& batch_size, + const DeviceNameUtils::ParsedName& device_name) + : batch_size_(batch_size), device_name_(device_name) {} + +Status ClusterProperty::Merge(const ClusterProperty& other) { + ClusterBatchSize merged_batch_size(batch_size_); + if (!merged_batch_size.MergeIfCompatible(other.batch_size_)) { + return errors::Internal( + "trying to merge clusters with incompatible batch sizes."); + } + + absl::optional merged_device_name = + MergeIfCompatible(device_name_, other.device_name_); + if (!merged_device_name.has_value()) { + return errors::Internal( + "trying to merge clusters with incompatible device assignment."); + } + + batch_size_ = std::move(merged_batch_size); + device_name_ = std::move(merged_device_name.value()); + return Status::OK(); +} + +} // namespace segment +} // namespace tensorrt +} // namespace tensorflow + +#endif // GOOGLE_CUDA && GOOGLE_TENSORRT diff --git a/tensorflow/compiler/tf2tensorrt/segment/union_find.h b/tensorflow/compiler/tf2tensorrt/segment/union_find.h index b91f5771ce5..c72ea1f7553 100644 --- a/tensorflow/compiler/tf2tensorrt/segment/union_find.h +++ b/tensorflow/compiler/tf2tensorrt/segment/union_find.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_SEGMENT_UNION_FIND_H_ -#include "absl/strings/str_format.h" #include "absl/types/optional.h" +#include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA && GOOGLE_TENSORRT @@ -28,110 +28,67 @@ namespace segment { // ClusterBatchSize is a data structure to record the batch size we have seen // for a cluster during segmentation. // -// When constructing clusters for implicit batch mode, we support the -// with both dynamic batch size and static batch size. We restrict nodes inside -// a cluster to either have dynamic batch size or have the same value for static -// batch size. For this reason, we use a field has_dynamic_batch_value_ to keep -// track of whether the cluster has any node with dynamic batch size. We use -// field static_batch_value_ to keep track of whether the cluster has any node -// with static batch size and what the value of the static batch size, if any. 
-// Examples: +// With the help of shape inference, all the dynamic batch sizes are converted +// to a negative integer number. +// If the number is -1, then nothing is known about the dynamic batch size. +// Ideally, we should not put nodes with -1 batch size into the same cluster, +// as they will likely have different batch sizes at runtime. However, we +// currently treat -1 as an equivalent class for simple implementation. We may +// need to revise this if it causes performance issues. +// If the number is strictly less than -1, then it represents a equivalent +// class. It is infered that all the nodes with the same equivalent class +// (strictly less than -1) shall have the same batch size at runtime. +// +// When constructing clusters for implicit batch mode, we support both +// dynamic batch sizes and static batch sizes. As all the nodes inside the same +// cluster shall have the same batch size at runtime, we restrict nodes inside a +// cluster to either have the same dynamic batch size equivalent class or the +// same static batch size value. +// // cluster: a = a1[1,3] + a1[1,3] -// ClusterBatchSize: has_dynamic_batch_size_ = false -// static_batch_value_ = {has value, 1} +// ClusterBatchSize: batch_size_ = 1 // // cluster: b = b1[-1,3] + b2[-1, 3] -// ClusterBatchSize: has_dynamic_batch_size_ = true -// static_batch_value_ = {has no value} +// ClusterBatchSize: batch_size_ = -1 // -// cluster: a = a1[1,3] + a1[1,3]; b = b1[-1,3] + b2[-1, 3] -// ClusterBatchSize: has_dynamic_batch_size_ = true -// static_batch_value_ = {has value, 1} +// cluster: c = c1[-2,3] + c2[-2, 3] +// ClusterBatchSize: batch_size_ = -2 // // When constructing cluster for explicit batch mode, all ClusterBatchSize is // irrelevant. // -// -absl::optional static_batch_value_; + class ClusterBatchSize { public: - ClusterBatchSize() - : has_dynamic_batch_value_(false), static_batch_value_(absl::nullopt) {} + ClusterBatchSize(); - bool operator==(const ClusterBatchSize& b) { - return HasDynamicBatchValue() == b.HasDynamicBatchValue() && - static_batch_value_ == b.static_batch_value_; - } + bool operator==(const ClusterBatchSize& other); + bool operator!=(const ClusterBatchSize& other) { return !(*this == other); } - bool operator!=(const ClusterBatchSize& b) { return !(*this == b); } + // Sets the batch size assuming that the object doesn't have a batch size yet: + // A non-negative input representing a static batch size value. + // A negative input representing a dynamic batch size equivalent class. + ClusterBatchSize& SetBatchSize(int batch_size); + bool HasBatchSize() const; + int GetBatchSize() const; - int GetStaticBatchValue() const { - DCHECK(HasStaticBatchValue()); - return static_batch_value_.value(); - } + // Merge `other` into the current ClusterBatchSize if the two are not + // conflicting. Two ClusterBatchSizes are conflicting iff they both have a + // value and their values are different. + bool MergeIfCompatible(const ClusterBatchSize& other); - // Sets the batch size value assuming that the object doesn't have a batch - // size value yet: - // a non-negative input value representing a known batch size. - // a negative input value representing a dynamic batch size. 
- ClusterBatchSize SetBatchSizeValue(int value) { - if (value < 0) { - has_dynamic_batch_value_ = true; - return *this; - } - static_batch_value_ = value; - return *this; - } - - bool MergeIfCompatible(const ClusterBatchSize& b) { - bool is_compatible = MergeIfCompatible(b.static_batch_value_); - if (!is_compatible) return false; - - if (!HasDynamicBatchValue() && b.HasDynamicBatchValue()) { - has_dynamic_batch_value_ = true; - } - - return true; - } - - // Returns a string for the batch size value. If the object has a static - // batch size value, return a string for the value. If the object has a - // dynamic size value, return -1. Otherwise, returns -2 to represent that - // a batch size hasn't been set yet. - string ToString() const { - string s; - absl::StrAppendFormat(&s, "batch_size=(%d,%d,", HasDynamicBatchValue(), - HasStaticBatchValue()); - if (HasStaticBatchValue()) { - absl::StrAppendFormat(&s, "%d", GetStaticBatchValue()); - } - absl::StrAppend(&s, ")"); - return s; - } + // Returns a string for the batch size. + // If the object has a static batch size, return a string representing a + // non-negative integer. + // If the object has a dynamic batch size, return a string representing a + // negative integer as an equivalent class. + // If the object doesn't have a batch size yet, return a "?" symbol string. + std::string ToString() const; private: - bool HasStaticBatchValue() const { return static_batch_value_.has_value(); } - bool HasDynamicBatchValue() const { return has_dynamic_batch_value_; } + ClusterBatchSize& SetBatchSize(const absl::optional& batch_size); - private: - bool MergeIfCompatible(const absl::optional& b) { - bool is_compatible = !HasStaticBatchValue() || !b.has_value() || - GetStaticBatchValue() == b.value(); - if (!is_compatible) { - return false; - } - if (!HasStaticBatchValue() && b.has_value()) { - static_batch_value_ = b; - } - return true; - } - - private: - // To track whether the cluster has any node with dynamic batch size. - bool has_dynamic_batch_value_; - // To track whether the cluster has any node with static batch size, and the - // unique value for static batch size. - absl::optional static_batch_value_; + absl::optional batch_size_; }; inline std::ostream& operator<<(std::ostream& os, @@ -139,89 +96,89 @@ inline std::ostream& operator<<(std::ostream& os, return os << batch_size.ToString(); } -// Represents a disjoint set of copyable values with type T. We use this data -// structure to construct clusters for TRTEngineOp. As such, this data structure -// has a field to record the batch size for the current cluster and merges the -// corresponding batch sizes when merging two clusters. Most of the methods in -// this class are side-effecting as they also compress the path from the object -// to the parent of its containing set. -template -class UnionFind { +// Represents the accumulated properties of a cluster during segmentation, +// including information about batch size and device assignment. Clusters shall +// have compatible properties in order to be merged together. +class ClusterProperty { public: - UnionFind() : size_(1), parent_(nullptr) {} - UnionFind(const T& v, ClusterBatchSize batch_size, - const DeviceNameUtils::ParsedName& device_name) - : size_(1), - cluster_batch_size_(batch_size), - cluster_device_name_(device_name), - parent_(nullptr), - value_(v) {} - - // Returns the number of elements in the cluster and compresses the path from - // this object to the root of the cluster. 
- int Size() { return FindRoot()->size_; } + ClusterProperty() {} + ClusterProperty(const ClusterBatchSize& batch_size, + const DeviceNameUtils::ParsedName& device_name); // Returns the batch size of the cluster and compresses the path from this // object to the root object. - ClusterBatchSize BatchSize() { return FindRoot()->cluster_batch_size_; } + const ClusterBatchSize& BatchSize() const { return batch_size_; } // Returns the device name of the cluster and compresses the path from this // object to the root object. - const DeviceNameUtils::ParsedName& DeviceName() { - return FindRoot()->cluster_device_name_; - } + const DeviceNameUtils::ParsedName& DeviceName() const { return device_name_; } - // Merges this cluster with 'other'. This cluster's size_ is updated to - // the size of the merged cluster; the size_ of 'other' becomes inaccessible - // as only the size_ of the root object is accessible. - Status Merge(UnionFind* other); - - // Retrieves the value for the root of the cluster. - T& ParentValue() { return FindRoot()->value_; } - - // Returns the value for the object. - T& Value() { return value_; } + Status Merge(const ClusterProperty& other); private: - // Returns the root object for the cluster and compresses the path from this + ClusterBatchSize batch_size_; + DeviceNameUtils::ParsedName device_name_; +}; + +// Represents a disjoint set of copyable value with type T and accumulated +// property of the values with type P. Most of the methods in this class are +// side-effecting as they also compress the path from the object to the parent +// of its containing set. +template +class UnionFind { + public: + UnionFind() : size_(1), parent_(nullptr) {} + UnionFind(const T& v, const P& p) + : size_(1), parent_(nullptr), value_(v), property_(p) {} + UnionFind(const T& v, P&& p) + : size_(1), parent_(nullptr), value_(v), property_(p) {} + + // Returns the number of elements in the set and compresses the path from + // this object to the root of the set. + int Size() { return FindRoot()->size_; } + + // Returns the accumulated property of all the elements in the set and + // compresses the path from this object to the root of the set. + const P& Property() { return FindRoot()->property_; } + + // Merges this set with 'other'. This updates the size_ and property_ of the + // set. The size_ and property_ of 'other' becomes inaccessible as only the + // size_ and property_ of the root of the set is accessible. + Status Merge(UnionFind* other); + + // Retrieves the value for the root of the set. + const T& ParentValue() { return FindRoot()->value_; } + + // Returns the value for the object. + const T& Value() const { return value_; } + + private: + // Returns the root object for the set and compresses the path from this // object to the root object. 
UnionFind* FindRoot(); int size_; - ClusterBatchSize cluster_batch_size_; - DeviceNameUtils::ParsedName cluster_device_name_; UnionFind* parent_; T value_; + P property_; }; -template -Status UnionFind::Merge(UnionFind* other) { +template +Status UnionFind::Merge(UnionFind* other) { UnionFind* a = FindRoot(); UnionFind* b = other->FindRoot(); if (a == b) return Status::OK(); - ClusterBatchSize batch_size = a->cluster_batch_size_; - if (!batch_size.MergeIfCompatible(other->cluster_batch_size_)) { - return errors::Internal( - "trying to merge clusters with incompatible batch sizes."); - } - - absl::optional device_name = - MergeIfCompatible(a->cluster_device_name_, other->cluster_device_name_); - if (!device_name.has_value()) { - return errors::Internal( - "trying to merge clusters with incompatible device assignment."); - } - - a->cluster_batch_size_ = batch_size; - a->cluster_device_name_ = *device_name; + P merged_property(a->property_); + TF_RETURN_IF_ERROR(merged_property.Merge(b->property_)); b->parent_ = a; a->size_ += b->size_; + a->property_ = std::move(merged_property); return Status::OK(); } -template -UnionFind* UnionFind::FindRoot() { +template +UnionFind* UnionFind::FindRoot() { if (!parent_) return this; // Path compression: update intermediate nodes to point to the root of the // equivalence class. diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index e9bcbcc6d83..5641339e7ef 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -1,4 +1,5 @@ -load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test", "tf_openmp_copts") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_cc_binary", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_openmp_copts") load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", @@ -7,10 +8,11 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_additional_tensor_coding_deps", "tf_proto_library", - "tf_proto_library_cc", ) load("//tensorflow/compiler/xla:xla.bzl", "xla_py_proto_library") -load("//tensorflow:tensorflow.bzl", "tf_portable_proto_library") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load("//tensorflow/compiler/xla/service/cpu:build_defs.bzl", "runtime_copts") @@ -40,6 +42,7 @@ package_group( "//tensorflow/...", "//tensorflow_models/...", "//third_party/mlperf/submissions/training/v0_7/models/...", + "//third_party/py/keras/...", ], ) @@ -78,19 +81,6 @@ tf_proto_library( visibility = ["//visibility:public"], ) -# A proto library that is minimal in size and dependencies for platforms like Android. 
-tf_portable_proto_library( - name = "portable_tf2xla_proto", - config_string = "allow_all:true", - header_outs = ["//tensorflow/compiler/tf2xla/tf2xla.proto.h"], - portable_deps = ["//tensorflow/core:portable_proto_lib"], - proto_deps = [ - ":tf2xla_proto", - "//tensorflow/core:protos_all", - ], - visibility = ["//visibility:public"], -) - xla_py_proto_library( name = "tf2xla_py", has_services = False, @@ -99,7 +89,7 @@ xla_py_proto_library( deps = [":tf2xla_proto"], ) -tf_proto_library_cc( +tf_proto_library( name = "host_compute_metadata_proto", srcs = ["host_compute_metadata.proto"], cc_api_version = 2, @@ -258,6 +248,7 @@ cc_library( "@com_google_absl//absl/synchronization", "//third_party/eigen3", "//tensorflow/core/framework:numeric_types", + "//tensorflow/core/platform:bfloat16", ] + tf_additional_tensor_coding_deps(), alwayslink = 1, ) @@ -303,14 +294,18 @@ cc_library( "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/compiler/xla/service/cpu:buffer_info_util", - "//tensorflow/compiler/xla/service/cpu:cpu_executable", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor:platform", - ], + ] + if_libtpu( + if_false = [ + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service/cpu:buffer_info_util", + "//tensorflow/compiler/xla/service/cpu:cpu_executable", + ], + if_true = [], + ), ) cc_library( @@ -334,6 +329,7 @@ cc_library( "xla_op_kernel.h", "xla_op_registry.h", ], + copts = tf_copts(), visibility = [":friends"], deps = [ ":common", @@ -349,10 +345,13 @@ cc_library( ":xla_helpers", ":xla_op_registry", ":xla_resource", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", - "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -370,11 +369,13 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", - ], + ] + if_libtpu( + if_false = [ + "//tensorflow/compiler/mlir:array_container_utils", + "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", + ], + if_true = [], + ), alwayslink = 1, ) @@ -448,8 +449,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:session_options", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/common_runtime:core_cpu_internal", + "//tensorflow/core/platform:stream_executor_no_cuda", ], alwayslink = 1, ) @@ -741,10 +742,10 @@ tf_cc_test( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:tensor_testutil", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/framework:tensor_testutil", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", ], @@ -818,9 +819,9 @@ cc_library( 
":frontend_attributes_util", ":functionalize_control_flow_util", ":tf2xla_util", - "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:union_find", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -846,9 +847,9 @@ cc_library( ":functionalize_control_flow_util", ":functionalize_while", ":tf2xla_util", - "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:union_find", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -876,14 +877,20 @@ cc_library( cc_library( name = "mlir_bridge_pass_registration", - srcs = [ - "mlir_bridge_pass_registration.cc", - ], - deps = [ - ":mlir_bridge_pass", - "//tensorflow/compiler/mlir:mlir_graph_optimization_pass_registration", - "//tensorflow/core:core_cpu", - ], + srcs = if_libtpu( + if_false = [ + "mlir_bridge_pass_registration.cc", + ], + if_true = [], + ), + deps = if_libtpu( + if_false = [ + ":mlir_bridge_pass", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass_registration", + "//tensorflow/core:core_cpu", + ], + if_true = [], + ), alwayslink = 1, ) @@ -934,9 +941,9 @@ cc_library( ":functionalize_cond", ":functionalize_control_flow_util", ":tf2xla_util", - "//tensorflow/compiler/jit:union_find", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:union_find", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD index 973aafe1ad8..45a099baabe 100644 --- a/tensorflow/compiler/tf2xla/cc/BUILD +++ b/tensorflow/compiler/tf2xla/cc/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrapper_cc") package( diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index 54abccb4cfc..452b102fade 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -25,9 +25,10 @@ limitations under the License. #include "absl/strings/match.h" #include "absl/strings/str_join.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" +#include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/graph_to_functiondef.h" diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 596fa8e8e38..2a3e35e0ffd 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -23,12 +23,12 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/functionalize_cond.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow_util.h" #include "tensorflow/compiler/tf2xla/functionalize_while.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_constructor.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc index dce5efe5557..79412c4abc8 100644 --- a/tensorflow/compiler/tf2xla/functionalize_while.cc +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -24,11 +24,11 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/types/optional.h" -#include "tensorflow/compiler/jit/union_find.h" #include "tensorflow/compiler/tf2xla/frontend_attributes_util.h" #include "tensorflow/compiler/tf2xla/functionalize_cond.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 26051c98cb7..7e1878682f2 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") package( @@ -108,6 +109,7 @@ tf_kernel_library( "stack_ops.cc", "stateful_random_ops.cc", "stateless_random_ops.cc", + "stateless_random_ops_v2.cc", "strided_slice_op.cc", "tensor_array_ops.cc", "tensor_list_ops.cc", @@ -187,6 +189,7 @@ tf_kernel_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:stateful_random_ops_header", + "//tensorflow/core/kernels:stateless_random_ops_v2_header", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 88d7525e5d5..39f4beed0f4 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -290,6 +290,21 @@ xla::XlaOp IgammacImpl(xla::XlaOp x, xla::XlaOp y, XLA_MAKE_BINARY(Igammac, IgammacImpl(lhs, rhs, broadcast_helper)); +xla::XlaOp PolygammaImpl(xla::XlaOp n, xla::XlaOp x, + const BCast& broadcast_helper) { + std::tie(n, x) = XlaBinaryOp::Broadcast(n, x, broadcast_helper); + return xla::Polygamma(n, x); +} + +XLA_MAKE_BINARY(Polygamma, PolygammaImpl(lhs, rhs, broadcast_helper)); + +xla::XlaOp ZetaImpl(xla::XlaOp x, xla::XlaOp q, const BCast& broadcast_helper) { + std::tie(x, q) = XlaBinaryOp::Broadcast(x, q, broadcast_helper); + return xla::Zeta(x, q); +} + +XLA_MAKE_BINARY(Zeta, ZetaImpl(lhs, rhs, broadcast_helper)); + #undef XLA_MAKE_BINARY class ApproximateEqualOp : public XlaOpKernel { diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc 
b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index 7e8d3d7002a..b461aa43153 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -186,7 +186,7 @@ class StatelessCategoricalOp : public CategoricalOp { REGISTER_XLA_OP(Name("StatelessMultinomial") .CompileTimeConstantInput("num_samples") - .TypeConstraint("T", {DT_FLOAT, DT_BFLOAT16}) + .TypeConstraint("T", {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}) .TypeConstraint("Tseed", DT_INT32), StatelessCategoricalOp); diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc index c1f60abc0d6..a62d15f7904 100644 --- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc @@ -35,15 +35,19 @@ class DataFormatDimMapOp : public XlaOpKernel { OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format)); string dst_format; OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format)); - OP_REQUIRES(context, src_format.size() == 4, - errors::InvalidArgument(absl::StrCat( - "Source format must of length 4, received src_format = ", - src_format))); + OP_REQUIRES(context, src_format.size() == 4 or src_format.size() == 5, + errors::InvalidArgument( + absl::StrCat("Source format must of length 4 or 5, " + "received src_format = ", + src_format))); OP_REQUIRES( - context, dst_format.size() == 4, + context, dst_format.size() == 4 or dst_format.size() == 5, errors::InvalidArgument(absl::StrCat( - "Destination format must of length 4, received dst_format = ", + "Destination format must of length 4 or 5, received dst_format = ", dst_format))); + for (int i = 0; i < src_format.size(); ++i) { + dst_idx_.push_back(-1); + } for (int i = 0; i < src_format.size(); ++i) { for (int j = 0; j < dst_format.size(); ++j) { if (dst_format[j] == src_format[i]) { @@ -61,9 +65,10 @@ class DataFormatDimMapOp : public XlaOpKernel { auto builder = context->builder(); xla::XlaOp dst_indices = xla::ConstantR1(builder, absl::Span(dst_idx_)); - xla::XlaOp four = xla::ConstantR0(builder, 4); + const int dims = dst_idx_.size(); + xla::XlaOp rank = xla::ConstantR0(builder, dims); xla::XlaOp src_indices = - (xla::ConvertElementType(context->Input(0), xla::S32) + four) % four; + (xla::ConvertElementType(context->Input(0), xla::S32) + rank) % rank; xla::XlaOp output = xla::TorchIndexSelect(dst_indices, src_indices, /*dim=*/0); context->SetOutput( @@ -71,7 +76,7 @@ class DataFormatDimMapOp : public XlaOpKernel { } private: - std::array dst_idx_ = {{-1, -1, -1, -1}}; + std::vector dst_idx_; TF_DISALLOW_COPY_AND_ASSIGN(DataFormatDimMapOp); }; diff --git a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc index 7ac38369eb4..ad94c1383f8 100644 --- a/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dequantize_op.cc @@ -63,36 +63,27 @@ class DequantizeOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { DataType input_type = ctx->input_type(0); - double minrange, maxrange; - - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(1, &minrange)); - OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(2, &maxrange)); - - float min_range = static_cast(minrange); - float max_range = static_cast(maxrange); - float full_range, half_range; + xla::XlaOp input = ctx->Input(0); + xla::XlaOp output = xla::ConvertElementType(input, xla::F32); + xla::XlaOp min_range = 
xla::ConvertElementType(ctx->Input(1), xla::F32); + xla::XlaOp max_range = xla::ConvertElementType(ctx->Input(2), xla::F32); + xla::XlaOp full_range; + xla::XlaOp half_range; if (input_type == DT_QINT8) { - full_range = get_fullrange(); - half_range = (full_range + 1.0f) / 2.0f; + full_range = ScalarLike(output, get_fullrange()); + half_range = + (full_range + ScalarLike(output, 1.0f)) / ScalarLike(output, 2.0f); } else { OP_REQUIRES(ctx, input_type == DT_QUINT8, errors::InvalidArgument( "Only support DT_QINT8 or DT_QUINT8, got ", input_type)); - full_range = get_fullrange(); - half_range = 0.0f; + full_range = ScalarLike(output, get_fullrange()); + half_range = ScalarLike(output, 0.0f); } - float scale_factor = (max_range - min_range) / full_range; + xla::XlaOp scale = (max_range - min_range) / full_range; - xla::XlaOp input = ctx->Input(0); - xla::XlaOp output; - - output = xla::ConvertElementType(input, xla::F32); - - auto scale = ScalarLike(output, scale_factor); - auto halfrange = ScalarLike(output, half_range); - output = xla::Add(xla::Mul(xla::Add(output, halfrange), scale), - ScalarLike(output, min_range)); + output = xla::Add(xla::Mul(xla::Add(output, half_range), scale), min_range); if (dtype_ == DT_BFLOAT16) { output = xla::ConvertElementType(output, xla::BF16); diff --git a/tensorflow/compiler/tf2xla/kernels/qr_op.cc b/tensorflow/compiler/tf2xla/kernels/qr_op.cc index 66ec40a946b..7aebb76071f 100644 --- a/tensorflow/compiler/tf2xla/kernels/qr_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/qr_op.cc @@ -41,7 +41,7 @@ class QROp : public XlaOpKernel { bool full_matrices_; }; -REGISTER_XLA_OP(Name("Qr").TypeConstraint("T", kFloatTypes), QROp); +REGISTER_XLA_OP(Name("Qr"), QROp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc index ce4a46b45c8..1b470bf58df 100644 --- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc @@ -182,6 +182,32 @@ class TensorScatterAddOp : public XlaOpKernel { } }; +class TensorScatterMaxOp : public XlaOpKernel { + public: + explicit TensorScatterMaxOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + CompileTensorScatter(context, + [](xla::XlaOp x, xla::XlaOp y, xla::XlaBuilder*) { + return xla::Max(x, y); + }); + } +}; + +class TensorScatterMinOp : public XlaOpKernel { + public: + explicit TensorScatterMinOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + CompileTensorScatter(context, + [](xla::XlaOp x, xla::XlaOp y, xla::XlaBuilder*) { + return xla::Min(x, y); + }); + } +}; + class TensorScatterSubOp : public XlaOpKernel { public: explicit TensorScatterSubOp(OpKernelConstruction* context) @@ -207,6 +233,8 @@ class TensorScatterUpdateOp : public XlaOpKernel { }; REGISTER_XLA_OP(Name("TensorScatterAdd"), TensorScatterAddOp); +REGISTER_XLA_OP(Name("TensorScatterMax"), TensorScatterMaxOp); +REGISTER_XLA_OP(Name("TensorScatterMin"), TensorScatterMinOp); REGISTER_XLA_OP(Name("TensorScatterSub"), TensorScatterSubOp); REGISTER_XLA_OP(Name("TensorScatterUpdate"), TensorScatterUpdateOp); diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 85917af6a65..75faa2eac81 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -15,6 +15,7 @@ 
limitations under the License. // XLA-specific Shape Ops. +#include "absl/strings/str_format.h" #include "tensorflow/compiler/tf2xla/kernels/shape_util.h" #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -24,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -65,6 +67,47 @@ class ShapeOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Shape").CompilationOnly().IsMetadataOp(), ShapeOp); +class XlaSetBoundOp : public XlaOpKernel { + public: + explicit XlaSetBoundOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* ctx) override { + const TensorShape input_shape = ctx->InputShape("input"); + const TensorShape bound_shape = ctx->InputShape("bound"); + + OP_REQUIRES( + ctx, + ctx->InputType("bound") == DT_INT32 && + ctx->InputType("input") == DT_INT32, + errors::InvalidArgument( + "XlaSetBound can only set bound for int32 scalar value: got", + input_shape.DebugString())); + + OP_REQUIRES( + ctx, input_shape.dims() == 0, + errors::InvalidArgument("XlaSetBound should only be used to set a " + "bound to the an int32 scalar value: got", + input_shape.DebugString())); + + OP_REQUIRES( + ctx, bound_shape.dims() == 0, + errors::InvalidArgument("XlaSetBound should only be used to set a " + "bound to the an int32 scalar value: got", + bound_shape.DebugString())); + int64 bound; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar("bound", &bound)); + + xla::XlaOp result = xla::CustomCall( + ctx->builder(), "SetBound", {ctx->Input("input")}, + ctx->InputXlaShape("input").ValueOrDie(), absl::StrFormat("%d", bound)); + ctx->SetOutput(0, result); + } +}; + +REGISTER_XLA_OP(Name("XlaSetBound").CompileTimeConstantInput("bound"), + XlaSetBoundOp); + class ShapeNOp : public XlaOpKernel { public: explicit ShapeNOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc index 46d4b70606e..a46cceddced 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/rng_alg.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/math/math_util.h" @@ -180,7 +181,7 @@ Status CompileImpl( } xla::Literal alg_literal; TF_RETURN_IF_ERROR(ctx->ConstantInput(alg_input_idx, &alg_literal)); - auto alg = alg_literal.Get({}); + Algorithm alg = Algorithm(alg_literal.Get({})); if (!(alg == RNG_ALG_THREEFRY || alg == RNG_ALG_PHILOX)) { return errors::InvalidArgument("Unsupported algorithm id: ", alg); } @@ -407,5 +408,80 @@ REGISTER_XLA_OP(Name("StatefulUniformFullInt") {DT_INT32, DT_UINT32, DT_INT64, DT_UINT64}), StatefulUniformFullIntOp); +xla::XlaOp IncreaseCounter(Algorithm const& alg, xla::XlaOp counter, + xla::XlaOp delta) { + // Multiplying 256 to be consistent with the CPU/GPU kernels + delta = delta * ConstantR0WithType(delta.builder(), xla::U64, 256); + if (alg == RNG_ALG_PHILOX) { + return xla::PhiloxIncreaseCounter(counter, delta); + } else { + return counter + delta; + } +} + +xla::XlaOp PadRight(xla::XlaOp a, int n) { + return xla::Pad(a, xla::ScalarLike(a, 0), + xla::MakeEdgePaddingConfig({{0, n}})); +} + +template +class RngSkipOp : public XlaOpKernel { + public: + explicit RngSkipOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} + + void Compile(XlaOpKernelContext* ctx) override { + const int state_input_idx = 0; + const int alg_input_idx = 1; + const int delta_input_idx = 2; + xla::XlaOp var; + TensorShape var_shape; + OP_REQUIRES_OK(ctx, + ctx->ReadVariableInput(state_input_idx, STATE_ELEMENT_DTYPE, + &var_shape, &var)); + xla::Literal alg_literal; + OP_REQUIRES_OK(ctx, ctx->ConstantInput(alg_input_idx, &alg_literal)); + Algorithm alg = Algorithm(alg_literal.Get({})); + OP_REQUIRES(ctx, alg == RNG_ALG_THREEFRY || alg == RNG_ALG_PHILOX, + errors::InvalidArgument("Unsupported algorithm id: ", alg)); + OP_REQUIRES_OK(ctx, CheckStateShape(alg, var_shape)); + if (read_old_value) { + auto counter_size = GetCounterSize(alg); + xla::XlaOp output = var; + if (RNG_MAX_COUNTER_SIZE > counter_size) { + // Because the size of `var` depends on the algorithm while we want the + // output to have a fixed size (to help shape inference), we fix the + // output size to be the maximal state size among algorithms, and right- + // pad it with zeros if var's size is smaller than that. 
+ output = PadRight(output, RNG_MAX_COUNTER_SIZE - counter_size); + } + ctx->SetOutput(0, output); + } + xla::XlaOp counter; + xla::XlaOp key; + std::tie(counter, key) = StateAndKeyFromVariable(alg, var); + xla::XlaOp delta = ctx->Input(delta_input_idx); + delta = BitcastConvertType(delta, xla::U64); + auto new_counter = IncreaseCounter(alg, counter, delta); + var = StateAndKeyToVariable(alg, new_counter, key); + xla::PrimitiveType state_element_type; + OP_REQUIRES_OK( + ctx, DataTypeToPrimitiveType(STATE_ELEMENT_DTYPE, &state_element_type)); + var = BitcastConvertType(var, state_element_type); + OP_REQUIRES_OK( + ctx, ctx->AssignVariable(state_input_idx, STATE_ELEMENT_DTYPE, var)); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(RngSkipOp); +}; + +REGISTER_XLA_OP(Name("RngSkip").CompileTimeConstantInput("algorithm"), + RngSkipOp<>); + +using RngReadAndSkipOp = RngSkipOp; + +REGISTER_XLA_OP(Name("RngReadAndSkip").CompileTimeConstantInput("alg"), + RngReadAndSkipOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index 13c3dbe489e..e606812bc4e 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -111,6 +111,8 @@ xla::XlaOp StatelessRngUniform(absl::string_view device_type_string, } } +namespace { + xla::XlaOp StatelessRngUniformFullInt(absl::string_view device_type_string, xla::XlaOp seeds, const xla::Shape& shape) { @@ -140,8 +142,6 @@ xla::XlaOp StatelessRngUniformFullInt(absl::string_view device_type_string, } } -namespace { - class StatelessRandomUniformOp : public XlaOpKernel { public: explicit StatelessRandomUniformOp(OpKernelConstruction* ctx) diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc new file mode 100644 index 00000000000..e46fec3c576 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc @@ -0,0 +1,485 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/kernels/stateless_random_ops_v2.h" + +#include + +#include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h" +#include "tensorflow/compiler/tf2xla/lib/random.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/prng.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/rng_alg.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/math/math_util.h" + +namespace tensorflow { + +namespace { + +inline xla::RandomAlgorithm AlgorithmToRandomAlgorithm(Algorithm const& alg) { + if (alg == RNG_ALG_PHILOX) { + return xla::RandomAlgorithm::RNG_PHILOX; + } + return xla::RandomAlgorithm::RNG_THREE_FRY; +} + +inline Algorithm RandomAlgorithmToAlgorithm(xla::RandomAlgorithm const& alg) { + if (alg == xla::RandomAlgorithm::RNG_PHILOX) { + return RNG_ALG_PHILOX; + } + return RNG_ALG_THREEFRY; +} + +xla::XlaOp GetCounter(xla::RandomAlgorithm const& alg, xla::XlaOp state) { + Algorithm alg_ = RandomAlgorithmToAlgorithm(alg); + return xla::Slice(state, {RNG_KEY_SIZE}, + {RNG_KEY_SIZE + GetCounterSize(alg_)}, {1}); +} + +xla::RngOutput BitGenerator(xla::RandomAlgorithm const& alg, xla::XlaOp key, + xla::XlaOp counter, const xla::Shape& shape) { + key = BitcastConvertType(key, xla::U64); + counter = BitcastConvertType(counter, xla::U64); + xla::XlaOp state = xla::ConcatInDim(key.builder(), {key, counter}, 0); + xla::XlaOp result = xla::RngBitGenerator(alg, state, shape); + auto new_counter = GetCounter(alg, xla::GetTupleElement(result, 0)); + new_counter = BitcastConvertType(new_counter, xla::S64); + return xla::RngOutput{/*value=*/xla::GetTupleElement(result, 1), + /*state=*/new_counter}; +} + +std::tuple GetKeyCounterAlg( + absl::string_view device_type_string, xla::XlaOp key) { + // The Philox algorithm may cause performance regression on other devices. + // Turn on the Philox algorithm for the CPU and GPU backends only. 
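(Not part of the patch.) A plain-C++ sketch of the device-based selection described by the comment above: Philox on the CPU/GPU backends, ThreeFry elsewhere. The device strings and enum values are assumptions for illustration; the real helper also derives and returns the key and counter as XLA ops.

```cpp
// Illustrative only: choose the RNG algorithm from the backend, mirroring the
// comment above (Philox on CPU/GPU, ThreeFry elsewhere, e.g. TPU).
#include <iostream>
#include <string>

enum Algorithm { kPhilox = 1, kThreeFry = 2 };  // Ids here are illustrative.

Algorithm ChooseAlgorithm(const std::string& device_type) {
  if (device_type == "XLA_CPU_JIT" || device_type == "XLA_GPU_JIT") {
    return kPhilox;   // Fast path: scrambled Philox key.
  }
  return kThreeFry;   // Conservative default with a fixed-size zero counter.
}

int main() {
  std::cout << ChooseAlgorithm("XLA_GPU_JIT") << "\n";  // 1
  std::cout << ChooseAlgorithm("XLA_TPU_JIT") << "\n";  // 2
}
```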
+ if (device_type_string == DEVICE_GPU_XLA_JIT || + device_type_string == DEVICE_CPU_XLA_JIT) { + auto counter_key = xla::ScramblePhiloxKey(key); + return std::make_tuple(counter_key.second, counter_key.first, + RNG_ALG_PHILOX); + } else { + auto counter_shape = + xla::ShapeUtil::MakeShape(xla::U64, {RNG_MAX_COUNTER_SIZE}); + auto counter = xla::Zeros(key.builder(), counter_shape); + return std::make_tuple(key, counter, RNG_ALG_THREEFRY); + } +} + +} // namespace + +xla::RngOutput StatelessRngUniformV2(xla::RandomAlgorithm const& alg, + xla::XlaOp key, xla::XlaOp counter, + const xla::Shape& shape, xla::XlaOp minval, + xla::XlaOp maxval) { + xla::XlaBuilder* builder = key.builder(); + xla::PrimitiveType type = shape.element_type(); + using std::placeholders::_1; + using std::placeholders::_2; + using std::placeholders::_3; + auto generator = std::bind(BitGenerator, alg, _1, _2, _3); + switch (type) { + case xla::F32: + case xla::F64: + return xla::UniformFloatingPointDistribution(key, counter, generator, + minval, maxval, shape); + case xla::S32: + case xla::S64: + case xla::U32: + case xla::U64: + return UniformIntDistribution(key, counter, generator, minval, maxval, + shape); + break; + default: + return {builder->ReportError(xla::Unimplemented( + "Types other than F32, S32, S64, U32 and U64 are not " + "implemented by " + "StatelessRngUniformV2; got %s", + xla::primitive_util::LowercasePrimitiveTypeName(type))), + counter}; + } +} + +namespace { + +xla::RngOutput StatelessRngUniformFullInt(xla::RandomAlgorithm const& alg, + xla::XlaOp key, xla::XlaOp counter, + const xla::Shape& shape) { + xla::XlaBuilder* builder = key.builder(); + + xla::PrimitiveType type = shape.element_type(); + xla::RngOutput output = BitGenerator(alg, key, counter, shape); + switch (type) { + case xla::U32: + case xla::U64: + return output; + case xla::S32: + case xla::S64: + return xla::RngOutput{BitcastConvertType(output.value, type), + output.state}; + default: + return { + builder->ReportError(xla::Unimplemented( + "Types other than U32, S32, U64 and S64 are not implemented by " + "StatelessRngUniformFullInt; got: %s", + xla::primitive_util::LowercasePrimitiveTypeName(type))), + output.state}; + } +} + +Status GetAlgorithm(XlaOpKernelContext* ctx, int alg_input_idx, + xla::RandomAlgorithm* alg) { + auto alg_shape = ctx->InputShape(alg_input_idx); + if (alg_shape.dims() != 0) { + return errors::InvalidArgument("algorithm must be of shape [], not ", + alg_shape.DebugString()); + } + xla::Literal alg_literal; + TF_RETURN_IF_ERROR(ctx->ConstantInput(alg_input_idx, &alg_literal)); + auto alg_ = Algorithm(alg_literal.Get({})); + *alg = AlgorithmToRandomAlgorithm(alg_); + return Status::OK(); +} + +xla::XlaOp MaybeSliceCounter(xla::RandomAlgorithm const& alg, + TensorShape const& counter_shape, + xla::XlaOp counter) { + auto input_counter_size = counter_shape.dim_size(0); + auto real_counter_size = GetCounterSize(RandomAlgorithmToAlgorithm(alg)); + if (input_counter_size > real_counter_size) { + counter = xla::Slice(counter, {0}, {real_counter_size}, {1}); + } + return counter; +} + +class StatelessRandomUniformOp : public XlaOpKernel { + public: + explicit StatelessRandomUniformOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + xla::XlaBuilder* builder = ctx->builder(); + + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int 
counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + DataType rng_dtype = dtype_ == DT_DOUBLE ? DT_DOUBLE : DT_FLOAT; + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(rng_dtype, shape, &xla_shape)); + xla::PrimitiveType rng_primitive_type = xla_shape.element_type(); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + + auto result = StatelessRngUniformV2( + alg, key, counter, xla_shape, + xla::ConstantR0WithType(builder, rng_primitive_type, 0.0), + xla::ConstantR0WithType(builder, rng_primitive_type, 1.0)); + auto uniform = MaybeConvertF32ToBF16(result.value, dtype_); + ctx->SetOutput(0, uniform); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomUniformV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}), + StatelessRandomUniformOp); + +class StatelessRandomUniformIntOp : public XlaOpKernel { + public: + explicit StatelessRandomUniformIntOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + const int minval_input_idx = 4; + const int maxval_input_idx = 5; + TensorShape minval_shape = ctx->InputShape(minval_input_idx); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(minval_shape), + errors::InvalidArgument("minval must be scalar, got shape ", + minval_shape.DebugString())); + TensorShape maxval_shape = ctx->InputShape(maxval_input_idx); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(maxval_shape), + errors::InvalidArgument("maxval must be scalar, got shape ", + maxval_shape.DebugString())); + + xla::XlaOp minval = ctx->Input(minval_input_idx); + xla::XlaOp maxval = ctx->Input(maxval_input_idx); + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape)); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = + StatelessRngUniformV2(alg, key, counter, xla_shape, minval, maxval); + ctx->SetOutput(0, result.value); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformIntOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomUniformIntV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64}), + StatelessRandomUniformIntOp); + +class StatelessRandomUniformFullIntOp : public XlaOpKernel { + public: + explicit StatelessRandomUniformFullIntOp(OpKernelConstruction* ctx) 
+ : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape, &xla_shape)); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = StatelessRngUniformFullInt(alg, key, counter, xla_shape); + ctx->SetOutput(0, result.value); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomUniformFullIntOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomUniformFullIntV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_INT32, DT_INT64, DT_UINT32, DT_UINT64}), + StatelessRandomUniformFullIntOp); + +class StatelessRandomNormalOp : public XlaOpKernel { + public: + explicit StatelessRandomNormalOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + DataType rng_dtype = dtype_ == DT_DOUBLE ? 
DT_DOUBLE : DT_FLOAT; + + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(rng_dtype, shape, &xla_shape)); + + using std::placeholders::_1; + using std::placeholders::_2; + using std::placeholders::_3; + auto generator = std::bind(BitGenerator, alg, _1, _2, _3); + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = xla::NormalFloatingPointDistribution(key, counter, generator, + xla_shape); + auto normal = MaybeConvertF32ToBF16(result.value, dtype_); + ctx->SetOutput(0, normal); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessRandomNormalOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomNormalV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}), + StatelessRandomNormalOp); + +class StatelessTruncatedNormalOp : public XlaOpKernel { + public: + explicit StatelessTruncatedNormalOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape shape; + OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); + + const int key_input_idx = 1; + const int counter_input_idx = 2; + const int alg_input_idx = 3; + xla::XlaOp key = ctx->Input(key_input_idx); + xla::XlaOp counter = ctx->Input(counter_input_idx); + + xla::RandomAlgorithm alg; + OP_REQUIRES_OK(ctx, GetAlgorithm(ctx, alg_input_idx, &alg)); + + auto counter_shape = ctx->InputShape(counter_input_idx); + OP_REQUIRES_OK(ctx, CheckKeyCounterShape(RandomAlgorithmToAlgorithm(alg), + ctx->InputShape(key_input_idx), + counter_shape)); + + xla::XlaBuilder* builder = ctx->builder(); + + DataType rng_dtype = dtype_ == DT_DOUBLE ? DT_DOUBLE : DT_FLOAT; + xla::Shape xla_shape; + OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(rng_dtype, shape, &xla_shape)); + + counter = MaybeSliceCounter(alg, counter_shape, counter); + auto result = StatelessRngUniformV2( + alg, key, counter, xla_shape, + xla::MinPositiveNormalValue(builder, xla_shape.element_type()), + xla::One(builder, xla_shape.element_type())); + xla::XlaOp truncated_normal = TruncatedNormal(result.value); + truncated_normal = MaybeConvertF32ToBF16(truncated_normal, dtype_); + ctx->SetOutput(0, truncated_normal); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(StatelessTruncatedNormalOp); +}; + +REGISTER_XLA_OP(Name("StatelessTruncatedNormalV2") + .CompileTimeConstantInput("shape") + .CompileTimeConstantInput("alg") + .TypeConstraint("dtype", + {DT_DOUBLE, DT_FLOAT, DT_BFLOAT16}), + StatelessTruncatedNormalOp); + +class GetKeyCounterAlgOp : public XlaOpKernel { + public: + explicit GetKeyCounterAlgOp(OpKernelConstruction* ctx) + : XlaOpKernel(ctx), + device_type_string_(ctx->device_type().type_string()) {} + + void Compile(XlaOpKernelContext* ctx) override { + TensorShape seed_shape = ctx->InputShape(0); + OP_REQUIRES(ctx, seed_shape == TensorShape({2}), + errors::InvalidArgument("seed must have shape [2], not ", + seed_shape.DebugString())); + xla::XlaOp seed = ctx->Input(0); + + xla::XlaBuilder* builder = seed.builder(); + xla::XlaOp seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {}); + xla::XlaOp seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {}); + xla::XlaOp key = ConvertElementType(seed0, xla::U64) | + ShiftLeft(ConvertElementType(seed1, xla::U64), + ConstantR0WithType(builder, xla::U64, 32)); + auto key_counter_alg = GetKeyCounterAlg(device_type_string_, key); + key = std::get<0>(key_counter_alg); 
+ auto counter = std::get<1>(key_counter_alg); + auto alg = std::get<2>(key_counter_alg); + key = xla::Reshape(key, {RNG_KEY_SIZE}); + ctx->SetOutput(0, key); + ctx->SetOutput(1, counter); + ctx->SetOutput(2, ConstantR0(builder, static_cast(alg))); + } + + private: + string device_type_string_; + + TF_DISALLOW_COPY_AND_ASSIGN(GetKeyCounterAlgOp); +}; + +REGISTER_XLA_OP(Name("StatelessRandomGetKeyCounterAlg"), GetKeyCounterAlgOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 268317d84fc..943d92982cb 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -26,11 +26,13 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/ops_util.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/mem.h" @@ -290,6 +292,83 @@ class StridedSliceGradOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("Index", &index_type_)); } + // When the begin / end is unknown, compile the gradient into dynamic update + // slice into a broadcasted 0s. + // + // Broadcasted 0 + // +----------------------+ + // | +----+ | + // |<-begin->|grad|<-end->| <== Dynamic update grad into 0s. + // | +----+ | + // +----------------------+ + void CompileAsDynamicUpdateSlice(XlaOpKernelContext* ctx, + const TensorShape& input_shape, + const xla::Literal& strides_literal) { + bool dummy = false; + Tensor strides_tensor; + PartialTensorShape processing_shape, final_shape; + absl::InlinedVector begin; + absl::InlinedVector end; + absl::InlinedVector strides; + + absl::InlinedVector output_to_sparse_mapping; + absl::InlinedVector output_to_processing_mapping; + + OP_REQUIRES_OK(ctx, LiteralToHostTensor(strides_literal, index_type_, + &strides_tensor)); + OP_REQUIRES_OK( + ctx, ValidateStridedSliceOp( + nullptr, nullptr, strides_tensor, input_shape, begin_mask_, + end_mask_, ellipsis_mask_, new_axis_mask_, shrink_axis_mask_, + &processing_shape, &final_shape, &dummy, &dummy, &dummy, + &begin, &end, &strides, &output_to_sparse_mapping, + &output_to_processing_mapping)); + for (int64 i = 0; i < processing_shape.dims(); ++i) { + OP_REQUIRES( + ctx, strides[i] == 1, + errors::InvalidArgument("Strides in strided slice grad have to be " + "one when inputs are not constant.")); + } + + auto zero = XlaHelpers::Zero(ctx->builder(), ctx->expected_output_dtype(0)); + zero = xla::Broadcast(zero, input_shape.dim_sizes()); + xla::XlaOp grad = ctx->Input(4); + xla::Shape grad_shape = ctx->InputXlaShape(4).ValueOrDie(); + // Undo any new/shrink axes. + VLOG(1) << "xla grad shape" << grad_shape; + VLOG(1) << "input_shape" << input_shape.DebugString(); + std::vector begins(processing_shape.dims(), + xla::Zero(ctx->builder(), xla::S32)); + for (int64 i = 0; i < grad_shape.rank(); ++i) { + // Use grad shape, which is known, to update unknown processing shape. 
+ // Grad shape is the output of the ValidateStridedSliceOp function in + // forward pass, thus we use output_to_processing_mapping. + if (output_to_processing_mapping[i] != -1) { + processing_shape.set_dim(output_to_processing_mapping[i], + grad_shape.dimensions(i)); + } + + // Similarly, use output_to_sparse_mapping to find out corresponding + // begin dim of the output, as indices for dynamic update slice. + int64 begin_dim = output_to_sparse_mapping[i]; + if (begin_dim != -1) { + auto begin_index = + xla::Slice(ctx->Input(1), {begin_dim}, {begin_dim + 1}, {1}); + auto begin_index_scalar = xla::Reshape( + xla::ShapeUtil::MakeScalarShape(xla::S32), begin_index); + begins[output_to_sparse_mapping[i]] = begin_index_scalar; + } + } + VLOG(1) << "processing_shape" << processing_shape.DebugString(); + TensorShape full_processing_shape; + OP_REQUIRES(ctx, processing_shape.AsTensorShape(&full_processing_shape), + errors::InvalidArgument( + "Processing shape ", processing_shape.DebugString(), + " can't be fully inferred from grad shape")); + grad = xla::Reshape(grad, full_processing_shape.dim_sizes()); + grad = xla::DynamicUpdateSlice(zero, grad, begins); + ctx->SetOutput(0, grad); + } void Compile(XlaOpKernelContext* ctx) override { TensorShape processing_shape, final_shape; absl::InlinedVector begin; @@ -298,12 +377,15 @@ class StridedSliceGradOp : public XlaOpKernel { TensorShape input_shape; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &input_shape)); - xla::Literal begin_literal, end_literal, strides_literal; - OP_REQUIRES_OK(ctx, ctx->ConstantInput(1, &begin_literal)); - OP_REQUIRES_OK(ctx, ctx->ConstantInput(2, &end_literal)); - OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal)); + bool begin_is_constant = ctx->ConstantInput(1, &begin_literal).ok(); + bool end_is_constant = ctx->ConstantInput(2, &end_literal).ok(); + OP_REQUIRES_OK(ctx, ctx->ConstantInput(3, &strides_literal)); + if (!(begin_is_constant && end_is_constant)) { + CompileAsDynamicUpdateSlice(ctx, input_shape, strides_literal); + return; + } Tensor begin_tensor, end_tensor, strides_tensor; OP_REQUIRES_OK( ctx, LiteralToHostTensor(begin_literal, index_type_, &begin_tensor)); @@ -446,7 +528,12 @@ class StridedSliceAssignOp : public XlaOpKernel { TensorShape lhs_shape; xla::XlaOp lhs; - OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); + if (ctx->input_type(0) == DT_RESOURCE) { + OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); + } else { + lhs_shape = ctx->InputShape(0); + lhs = ctx->Input(0); + } const TensorShape rhs_shape = ctx->InputShape(4); @@ -504,7 +591,11 @@ class StridedSliceAssignOp : public XlaOpKernel { lhs = xla::DynamicUpdateSlice(lhs, rhs, slice_begin); - OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); + if (ctx->input_type(0) == DT_RESOURCE) { + OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, dtype_, lhs)); + } else { + ctx->SetOutput(0, lhs); + } } private: @@ -520,5 +611,11 @@ REGISTER_XLA_OP(Name("ResourceStridedSliceAssign") .CompileTimeConstantInput("strides"), StridedSliceAssignOp); +REGISTER_XLA_OP(Name("TensorStridedSliceUpdate") + .CompileTimeConstantInput("begin") + .CompileTimeConstantInput("end") + .CompileTimeConstantInput("strides"), + StridedSliceAssignOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc index 8b481d55a80..555905ebe6b 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace { @@ -38,16 +39,28 @@ class XlaReduceOp : public XlaOpKernel { context, dims_set.size() == dimensions_to_reduce_.size(), errors::InvalidArgument("Duplicate dimension in dimensions_to_reduce " "argument to XlaReduce")); + if (context->HasAttr("N")) { // variadic reduce + use_tuples_ = true; + OP_REQUIRES_OK(context, context->GetAttr("N", &n_)); + } else { + use_tuples_ = false; + n_ = 1; + } } void Compile(XlaOpKernelContext* context) override { - const TensorShape input_shape = context->InputShape("input"); - const TensorShape init_value_shape = context->InputShape("init_value"); + OP_REQUIRES(context, n_ * 2 == context->num_inputs(), + errors::InvalidArgument("Expected ", n_ * 2, " inputs but got ", + context->num_inputs())); + + const TensorShape input_shape = context->InputShape(0); + const TensorShape init_value_shape = context->InputShape(n_); const DataType dtype = context->input_type(0); const int rank = input_shape.dims(); OP_REQUIRES(context, TensorShapeUtils::IsScalar(init_value_shape), - errors::InvalidArgument("init_value must be a scalar")); + errors::InvalidArgument("init_value must be a scalar but got ", + init_value_shape.DebugString())); auto dim_in_range = [rank](int64 dim) { return dim >= 0 && dim < rank; }; OP_REQUIRES(context, @@ -67,35 +80,58 @@ class XlaReduceOp : public XlaOpKernel { compile_options.always_return_tuple = false; compile_options.is_entry_computation = false; XlaCompiler::CompilationResult reducer; - OP_REQUIRES_OK(context, context->compiler()->CompileFunction( - compile_options, *reducer_, - {reducer_arg, reducer_arg}, &reducer)); + OP_REQUIRES_OK( + context, + context->compiler()->CompileFunction( + compile_options, *reducer_, + std::vector(n_ * 2, reducer_arg), &reducer)); - xla::Shape scalar_shape; - OP_REQUIRES_OK(context, - TensorShapeToXLAShape(dtype, TensorShape(), &scalar_shape)); + xla::Shape expected_shape; + OP_REQUIRES_OK( + context, TensorShapeToXLAShape(dtype, TensorShape(), &expected_shape)); + if (use_tuples_) { + expected_shape = xla::ShapeUtil::MakeTupleShape( + std::vector(n_, expected_shape)); + } OP_REQUIRES( context, - xla::ShapeUtil::Compatible(reducer.xla_output_shape, scalar_shape), + xla::ShapeUtil::Compatible(reducer.xla_output_shape, expected_shape), errors::InvalidArgument( "Invalid output shape of XlaReduce reducer. 
Expected ", - xla::ShapeUtil::HumanString(scalar_shape), " got ", + xla::ShapeUtil::HumanString(expected_shape), " got ", xla::ShapeUtil::HumanString(reducer.xla_output_shape))); + std::vector inputs; + std::vector inits; + inputs.reserve(n_); + inits.reserve(n_); + for (int i = 0; i < n_; i++) { + inputs.emplace_back(context->Input(i)); + inits.emplace_back(context->Input(n_ + i)); + } xla::XlaOp output = - xla::Reduce(context->Input("input"), context->Input("init_value"), - *reducer.computation, dimensions_to_reduce_); - context->SetOutput(0, output); + xla::Reduce(context->builder(), inputs, inits, *reducer.computation, + dimensions_to_reduce_); + if (use_tuples_) { + for (int i = 0; i < n_; i++) { + context->SetOutput(i, xla::GetTupleElement(output, i)); + } + } else { + context->SetOutput(0, output); + } } private: const NameAttrList* reducer_; std::vector dimensions_to_reduce_; + bool use_tuples_; + int n_; TF_DISALLOW_COPY_AND_ASSIGN(XlaReduceOp); }; REGISTER_XLA_OP(Name("XlaReduce"), XlaReduceOp); +REGISTER_XLA_OP(Name("XlaVariadicReduce"), XlaReduceOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index 531679d3905..703f6c2eb72 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -1,5 +1,8 @@ # Utilities for building XLA computations. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = ["//tensorflow/compiler/tf2xla:friends"], licenses = ["notice"], # Apache 2.0 diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index eefef26dc24..b46429ef0d1 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -38,7 +38,7 @@ auto* mlir_bridge_gauge_v2 = monitoring::Gauge::New( // encapsulated graph to a particular device. 
Status MlirBridgePass::Run(const ConfigProto& config_proto, mlir::ModuleOp module) { - if (!config_proto.experimental().enable_mlir_bridge()) { + if (!IsEnabled(config_proto)) { VLOG(0) << "Skipping MLIR TPU Bridge, session flag not enabled"; mlir_bridge_gauge_v2->GetCell()->Set(false); return Status::OK(); diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h index f7541e634d4..bbddeb6a967 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.h +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.h @@ -30,7 +30,10 @@ class MlirBridgePass : public MlirOptimizationPass { llvm::StringRef name() const override { return "bridge"; } bool IsEnabled(const ConfigProto& config_proto) const override { - return config_proto.experimental().enable_mlir_bridge(); + return config_proto.experimental().enable_mlir_bridge() || + tensorflow::GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge == + tensorflow::ConfigProto::Experimental:: + MLIR_BRIDGE_ROLLOUT_ENABLED; } // This should be used as a thin mapper around mlir::ModulePass::runOnModule @@ -47,7 +50,9 @@ class MlirBridgeV1CompatPass : public MlirV1CompatOptimizationPass { bool IsEnabled(const ConfigProto& config_proto) const override { return config_proto.experimental().enable_mlir_bridge() || - tensorflow::GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge; + GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge == + tensorflow::ConfigProto::Experimental:: + MLIR_BRIDGE_ROLLOUT_ENABLED; } // This should be used as a thin mapper around mlir::ModulePass::runOnModule diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index db1a6929934..ac4d1f28803 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -90,18 +90,6 @@ Status ConvertOutputInfo(const tf2xla::Config& config, return ParseOutputArrayInfo(array_names, &specs->outputs); } -static void RegisterDialects() { - static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - mlir::registerDialect(); - return true; - }(); - (void)init_once; -} - } // namespace Status ConvertGraphDefToXlaViaMlir( @@ -150,9 +138,7 @@ Status ConvertGraphDefToXlaViaMlir( } } - RegisterDialects(); mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); TF_ASSIGN_OR_RETURN( mlir::OwningModuleRef module, ConvertGraphdefToMlir(pruned_graph_def, debug_info, specs, &context)); @@ -175,7 +161,7 @@ Status ConvertGraphDefToXlaViaMlir( return ConvertMLIRToXlaComputation(*module, /*device_type=*/"XLA_CPU_JIT", computation, /*use_tuple_args=*/false, - /*always_return_tuple=*/true); + /*return_tuple=*/true); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/ops/BUILD b/tensorflow/compiler/tf2xla/ops/BUILD index b116a09dd02..50ff1ea5d16 100644 --- a/tensorflow/compiler/tf2xla/ops/BUILD +++ b/tensorflow/compiler/tf2xla/ops/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_custom_op_library", diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index f4b9e9654d2..471cc029a59 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -291,6 +291,16 @@ dimension_numbers: a serialized xla::DotDimensionNumbers proto. precision_config: a serialized xla::PrecisionConfig proto. 
)doc"); +REGISTER_OP("XlaSetBound") + .Input("input: int32") + .Input("bound: int32") + .Output("output: int32") + .SetShapeFn(shape_inference::UnknownShape) + .Doc( + R"doc(Set a bound for the given input value as a hint to Xla compiler, + returns the same value. +)doc"); + REGISTER_OP("XlaDynamicSlice") .Input("input: T") .Input("start_indices: Tindices") @@ -465,6 +475,60 @@ reducer: a reducer function to apply dimensions_to_reduce: dimension numbers over which to reduce )doc"); +REGISTER_OP("XlaVariadicReduce") + .Input("input: N * T") + .Input("init_value: N * T") + .Attr("N: int >= 1") + .Attr("T: numbertype") + .Attr("dimensions_to_reduce: list(int)") + .Attr("reducer: func") + .Output("output: N * T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + int n; + TF_RETURN_IF_ERROR(c->GetAttr("N", &n)); + for (int i = 0; i < n; i++) { + for (int j = 0; j < n; j++) { + c->MergeInput(i, c->input(j)); + } + } + if (c->RankKnown(c->input(0))) { + int rank = c->Rank(c->input(0)); + std::vector dimensions_to_reduce; + TF_RETURN_IF_ERROR( + c->GetAttr("dimensions_to_reduce", &dimensions_to_reduce)); + std::set dims_set(dimensions_to_reduce.begin(), + dimensions_to_reduce.end()); + auto dim_in_range = [rank](int64 dim) { + return dim >= 0 && dim < rank; + }; + const int dimensions_to_reduce_size = dimensions_to_reduce.size(); + if (rank < dimensions_to_reduce_size || + dims_set.size() != dimensions_to_reduce.size() || + !absl::c_all_of(dimensions_to_reduce, dim_in_range)) { + return errors::InvalidArgument( + "Invalid dimensions_to_reduce argument to XlaVariadicReduce"); + } + for (int i = 0; i < n; i++) { + c->set_output( + i, c->UnknownShapeOfRank(rank - dimensions_to_reduce.size())); + } + } else { + for (int i = 0; i < n; i++) { + c->set_output(i, c->input(i)); + } + } + return Status::OK(); + }) + .Doc(R"doc( +Wraps the variadic XLA Reduce operator, documented at + https://www.tensorflow.org/performance/xla/operation_semantics#variadic_reduce. + +input: the input tensor(s) +init_value: scalar initial value(s) for the reduction +reducer: a reducer function to apply +dimensions_to_reduce: dimension numbers over which to reduce +)doc"); + REGISTER_OP("XlaReduceWindow") .Input("input: T") .Input("init_value: T") @@ -728,7 +792,7 @@ REGISTER_OP("XlaGather") .Input("slice_sizes: Tindices") .Attr("dimension_numbers: string") .Attr("indices_are_sorted: bool") - .Attr("T: numbertype") + .Attr("T: {numbertype, bool}") .Attr("Tindices: {int32, int64}") .Output("output: T") .SetShapeFn(shape_inference::UnknownShape) @@ -749,10 +813,10 @@ REGISTER_OP("XlaScatter") .Attr("update_computation: func") .Attr("dimension_numbers: string") .Attr("indices_are_sorted: bool") - .Attr("T: numbertype") + .Attr("T: {numbertype, bool}") .Attr("Tindices: {int32, int64}") .Output("output: T") - .SetShapeFn(UnchangedRank) + .SetShapeFn(shape_inference::UnchangedShape) .Doc(R"doc( Wraps the XLA Scatter operator documented at https://www.tensorflow.org/xla/operation_semantics#scatter. 
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 846dafa2570..2e5667bc02f 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -206,6 +206,8 @@ igamma = _broadcasting_binary_op(math_ops.igamma) igamma_grad_a = _broadcasting_binary_op(gen_math_ops.igamma_grad_a) random_gamma_grad = _broadcasting_binary_op(gen_random_ops.random_gamma_grad) igammac = _broadcasting_binary_op(math_ops.igammac) +polygamma = _broadcasting_binary_op(math_ops.polygamma) +zeta = _broadcasting_binary_op(math_ops.zeta) def _binary_op(fn): @@ -338,6 +340,7 @@ def random_uniform(minval, maxval, dims, name=None): recv = gen_xla_ops.xla_recv reduce = gen_xla_ops.xla_reduce +variadic_reduce = gen_xla_ops.xla_variadic_reduce def reduce_window(operand, @@ -387,6 +390,14 @@ def reduce_window(operand, replica_id = gen_xla_ops.xla_replica_id +# Set a static bound for the given input value as a hint to Xla compiler, +# returns the same value. +# Usage: +# def f(t, p): +# p = xla.set_bound(p, 3) # Tells xla the constraint that p <= 3. +# return t[:p] # xla knows the bound of the slice is 3. +set_bound = gen_xla_ops.xla_set_bound + def reshape(x, new_sizes, dimensions=None, name=None): if dimensions is not None: diff --git a/tensorflow/compiler/tf2xla/resource_operation_table.cc b/tensorflow/compiler/tf2xla/resource_operation_table.cc index 2db431c0413..860c3a40424 100644 --- a/tensorflow/compiler/tf2xla/resource_operation_table.cc +++ b/tensorflow/compiler/tf2xla/resource_operation_table.cc @@ -83,6 +83,8 @@ CreateResourceOpInfoMap() { add("ResourceScatterSub" , kReadWrite, kVariable); add("ResourceScatterUpdate" , kReadWrite, kVariable); add("ResourceStridedSliceAssign" , kReadWrite, kVariable); + add("RngReadAndSkip" , kReadWrite, kVariable); + add("RngSkip" , kReadWrite, kVariable); add("StatefulStandardNormalV2" , kReadWrite, kVariable); add("StatefulTruncatedNormal" , kReadWrite, kVariable); add("StatefulUniform" , kReadWrite, kVariable); diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index f8319cd446a..5c8cfdde9e4 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -57,6 +56,11 @@ limitations under the License. 
#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/util/dump_graph.h" +#ifndef LIBTPU_ON_GCE +#include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" +#include "tensorflow/compiler/mlir/utils/array_container_utils.h" +#endif + namespace tensorflow { namespace { @@ -623,8 +627,28 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { graph_optimizer_options.inline_with_single_device_body_placer = true; graph_optimizer_options.ignore_noinline = is_inside_mustcompile; - optimizer.Optimize(flib_runtime_, flib_runtime_->env(), - /*device=*/nullptr, &graph, graph_optimizer_options); + { + GraphShapeInfo shape_info; + InferShapes(graph.get(), /*arg_shapes=*/{}, + flib_runtime_->GetFunctionLibraryDefinition(), &shape_info) + .IgnoreError(); + auto node_name_index = graph->BuildNodeNameIndex(); + std::unordered_map> shape_map; + for (const auto& node_shape_info : shape_info) { + const string& node_name = node_shape_info.first; + const std::vector& output_shapes = node_shape_info.second; + const auto& node_iter = node_name_index.find(node_name); + if (node_iter != node_name_index.end()) { + auto& partial_shapes = shape_map[node_name]; + for (const auto& inferred_shape : output_shapes) { + partial_shapes.push_back(inferred_shape.shape); + } + } + } + graph_optimizer_options.shape_map = &shape_map; + optimizer.Optimize(flib_runtime_, flib_runtime_->env(), + /*device=*/nullptr, &graph, graph_optimizer_options); + } // Run shape inference on the graph and optimize the graph again. GraphShapeInfo shape_info; @@ -729,18 +753,32 @@ Status XlaCompiler::CompileFunction( } VLOG(1) << "===================================================="; - if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { +#ifdef LIBTPU_ON_GCE + if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge == + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED) { + VLOG(1) << "MLIR is not supported in this environment."; + } + TF_RETURN_IF_ERROR( + CompileGraph(options, function_id, std::move(graph), args, result)); +#else + if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge == + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED) { VLOG(1) << "Using MLIR bridge"; GraphDebugInfo debug_info; + std::vector control_rets; + for (const auto* control_ret_node : fbody->control_ret_nodes) { + control_rets.push_back(control_ret_node->name()); + } TF_RETURN_IF_ERROR(CompileGraphToXlaHlo( - std::move(*graph), {args.data(), args.size()}, - options_.device_type.type_string(), options.use_tuple_arg, + std::move(*graph), mlir::SpanToArrayRef(args), + control_rets, options_.device_type.type_string(), options.use_tuple_arg, *options_.flib_def, debug_info, options_.shape_representation_fn, result)); } else { TF_RETURN_IF_ERROR( CompileGraph(options, function_id, std::move(graph), args, result)); } +#endif VLOG(1) << "===================================================="; cache_[{function_id, arg_vector}] = *result; @@ -1143,7 +1181,11 @@ Status ValidateGraph(const Graph* graph, return errors::InvalidArgument(absl::StrCat( "Detected unsupported operations when trying to compile graph ", name, " on ", device_type.type_string(), ": ", node->def().op(), " (", - s.error_message(), ")", FormatNodeForError(*node))); + s.error_message(), ")", FormatNodeForError(*node), + "One approach is to outside compile the unsupported ops to run on " + "CPUs by enabling soft placement " + "`tf.config.set_soft_device_placement(True)`." 
+ " This has a potential performance penalty.")); } return Status::OK(); }; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index b0d93cde846..762700eaea8 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -129,8 +129,6 @@ class XlaCompiler { // Resource updates are converted into input / output of xla. The two // buffers are aliased with other if this option is true. - // - // Currently only supports TPU. bool alias_resource_update = false; }; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index e37f4659185..ac6d065e775 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -134,6 +134,13 @@ XlaOpRegistry::~XlaOpRegistry() = default; result.first->second.op_filter = op_filter; } +/* static */ bool XlaOpRegistry::IsCompilationDevice( + const string& device_name) { + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + return registry.backends_.find(device_name) != registry.backends_.end(); +} + /* static */ bool XlaOpRegistry::GetCompilationDevice( const string& device_name, const DeviceRegistration** registration) { XlaOpRegistry& registry = Instance(); @@ -365,6 +372,19 @@ std::vector XlaOpRegistry::DeviceKernels( return ops; } +/*static*/ const std::unordered_set* +XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { + XlaOpRegistry& registry = Instance(); + mutex_lock lock(registry.mutex_); + auto it = registry.ops_.find(op); + static auto empty_set = new std::unordered_set; + if (it == registry.ops_.end() || it->second.empty()) { + return empty_set; + } else { + return &it->second.front()->compile_time_constant_inputs; + } +} + /* static */ Status XlaOpRegistry::CompileTimeConstantInputs( const NodeDef& node_def, const OpKernel* op_kernel, const OpDef* op_def, std::vector* result) { @@ -385,21 +405,10 @@ std::vector XlaOpRegistry::DeviceKernels( compile_time_constant_inputs_from_attr.end())); compile_time_constant_inputs = &compile_time_constant_inputs_from_attr; } else { - const string& op = node_def.op(); - - XlaOpRegistry& registry = Instance(); - mutex_lock lock(registry.mutex_); - auto it = registry.ops_.find(op); - if (it == registry.ops_.end() || it->second.empty()) { + compile_time_constant_inputs = + CompileTimeConstantInputArgNames(node_def.op()); + if (compile_time_constant_inputs->empty()) { return Status::OK(); - } else { - // The test in IsCompatible ensures that if there are multiple matching - // registrations for this op name, they all have the same value of - // compile_time_constant_inputs, so only the first match is returned. - // - // TODO(sanjoy): This can probably be a std::vector. - compile_time_constant_inputs = - &it->second.front()->compile_time_constant_inputs; } } diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index af720fb4bb9..36657208a28 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -153,6 +153,10 @@ class XlaOpRegistry { static void RegisterCompilationDevice(const string& device_name, const DeviceRegistration& registration); + // Returns whether the device name is for the JIT device used exclusively for + // TF2XLA conversion. 
+ static bool IsCompilationDevice(const string& device_name); + // Returns the JIT device name associated with 'device_name', setting // 'jit_device_name', 'requires_jit', and 'enabled_jit_by_default', if they // are not null. Returns false and leaves the outputs unchanged if no matching @@ -198,6 +202,11 @@ class XlaOpRegistry { /*op_def=*/nullptr, result); } + // Return names of arguments for a given op which are supposed to be + // constants. + static const std::unordered_set* + CompileTimeConstantInputArgNames(const string& op); + // Returns true if `op` is a "metadata" op, one that only looks at the shapes // of its operands and not their values. static bool IsMetadataOp(const string& op); diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 35fa6a617f0..831da22e033 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -1,8 +1,9 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "tf_cc_test") load( "//tensorflow/core/platform:build_config.bzl", - "tf_proto_library_cc", - "tf_proto_library_py", + "tf_proto_library", ) package( @@ -55,20 +56,14 @@ filegroup( visibility = [":friends"], ) -tf_proto_library_cc( +tf_proto_library( name = "xla_data_proto", srcs = ["xla_data.proto"], cc_api_version = 2, visibility = ["//visibility:public"], ) -tf_proto_library_py( - name = "xla_data_proto", # bzl adds a _py suffix - srcs = ["xla_data.proto"], - visibility = ["//visibility:public"], -) - -tf_proto_library_cc( +tf_proto_library( name = "xla_proto", srcs = ["xla.proto"], cc_api_version = 2, @@ -79,16 +74,6 @@ tf_proto_library_cc( visibility = ["//visibility:public"], ) -tf_proto_library_py( - name = "xla_proto", # bzl adds a _py suffix - srcs = ["xla.proto"], - visibility = ["//visibility:public"], - deps = [ - ":xla_data_proto_py", - "//tensorflow/compiler/xla/service:hlo_proto_py", - ], -) - cc_library( name = "bit_cast", hdrs = ["bit_cast.h"], @@ -292,6 +277,7 @@ tf_cc_test( ":types", ":util", "//tensorflow/core:test_main", + "//tensorflow/core/platform:bfloat16", ], ) @@ -335,7 +321,7 @@ cc_library( ":xla_data_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:inlined_vector", @@ -541,7 +527,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":types", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/strings", ], ) @@ -678,8 +664,8 @@ cc_library( ":statusor", ":types", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", "//tensorflow/core:test", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/strings", ], ) @@ -969,6 +955,11 @@ tf_cc_test( ], ) +cc_library( + name = "union_find", + hdrs = ["union_find.h"], +) + # ----------------------------------------------------------------------------- # This is a headers target that extra XLA devices can use to prevent circular dependencies. Devices that are compiled as separate shared objects can also use it to prevent linking of library code. 
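// Editor's illustrative sketch (not part of the patch): querying the two
// XlaOpRegistry helpers added above. The device name "XLA_CPU_JIT" and the
// claim that "Fill" marks its "dims" input as compile-time constant are my
// assumptions about existing registrations, not something this patch adds.
#include <string>
#include <unordered_set>

#include "tensorflow/compiler/tf2xla/xla_op_registry.h"

void InspectXlaOpRegistry() {
  // True only for JIT compilation devices registered with the op registry.
  const bool is_jit_device =
      tensorflow::XlaOpRegistry::IsCompilationDevice("XLA_CPU_JIT");

  // Names of input arguments that must be compile-time constants for an op;
  // an empty set is returned for unknown ops.
  const std::unordered_set<std::string>* const_inputs =
      tensorflow::XlaOpRegistry::CompileTimeConstantInputArgNames("Fill");

  (void)is_jit_device;
  (void)const_inputs;
}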
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index a51970bb168..409cf37762b 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -1,6 +1,8 @@ # Description: # XLA client libraries. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( @@ -128,7 +130,7 @@ cc_library( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:source_map_util", "//tensorflow/compiler/xla/service:stream_pool", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", @@ -148,7 +150,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:compile_only_service", "//tensorflow/compiler/xla/service:compiler", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", ], @@ -172,7 +174,7 @@ cc_library( "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:optional", @@ -255,6 +257,7 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_matchers", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index eb09e9c8867..92d222f32b2 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -1,5 +1,7 @@ # Common computation builders for XLA. 
+load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow/compiler/xla/tests:build_defs.bzl", "generate_backend_suites", "xla_test") package( @@ -305,6 +307,20 @@ xla_test( "//tensorflow/compiler/xla/tests:test_macros_header", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "//tensorflow/core/platform:tensor_float_32_utils", + ], +) + +cc_library( + name = "lu_decomposition", + srcs = ["lu_decomposition.cc"], + hdrs = ["lu_decomposition.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", ], ) @@ -345,6 +361,9 @@ cc_library( hdrs = ["sorting.h"], deps = [ ":comparators", + ":constants", + ":loops", + ":slicing", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", @@ -576,6 +595,7 @@ cc_library( ":loops", ":math", ":matrix", + ":qr", ":slicing", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", diff --git a/tensorflow/compiler/xla/client/lib/logdet.cc b/tensorflow/compiler/xla/client/lib/logdet.cc index 8f37c393922..18cd0870f2a 100644 --- a/tensorflow/compiler/xla/client/lib/logdet.cc +++ b/tensorflow/compiler/xla/client/lib/logdet.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/loops.h" #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/qr.h" #include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" @@ -33,13 +34,46 @@ limitations under the License. namespace xla { -// let G = root(A) be the Cholesky root of the matrix A -// log(det(A)) = 2*sum(log(vecdiag(G))) +// log(det(A)) = sum(log(vecdiag(QR(A).r))), since R is triangular and Q is +// orthonormal XlaOp LogDet(XlaOp a) { - XlaOp cholesky = Cholesky(a, /*bool lower=*/true); + return a.builder()->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape a_shape, a.builder()->GetShape(a)); + // Compute the number of Householder transformations required on 'a' by + // determining the number of rows in 'a' that are already triangular. The + // determinant of Q is -1 ^ (number of Householder transfomations) + auto rows = Iota(a.builder(), ShapeUtil::ChangeElementType(a_shape, S32), + a_shape.rank() - 2); + auto cols = Iota(a.builder(), ShapeUtil::ChangeElementType(a_shape, S32), + a_shape.rank() - 1); + auto in_lower_triangle = Lt(cols, rows); + auto is_zero = Eq(a, ScalarLike(a, 0)); + auto num_zeros_in_triangle_per_row = Einsum( + ConvertElementType(And(in_lower_triangle, is_zero), S32), "...a->..."); + TF_ASSIGN_OR_RETURN(auto row_shape, + a.builder()->GetShape(num_zeros_in_triangle_per_row)); + rows = Iota(a.builder(), row_shape, row_shape.rank() - 1); + auto num_triangle_rows = + Einsum(ConvertElementType(Eq(rows, num_zeros_in_triangle_per_row), S32), + "...a->..."); + auto num_rows = + ScalarLike(num_triangle_rows, a_shape.dimensions(a_shape.rank() - 2)); - return ScalarLike(a, 2) * - Einsum(Log(cholesky), "...aa->...", xla::PrecisionConfig::HIGHEST); + TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, true)); + // Get the and log of the determinant based on the values along the diagonal + // of R. 
+ auto log_abs_det = Einsum(Log(Abs(qr.r)), "...aa->..."); + auto sign_diag = Reduce( + Sign(Einsum(qr.r, "...aa->...a")), + One(a.builder(), a_shape.element_type()), + CreateScalarMultiplyComputation(a_shape.element_type(), a.builder()), + {a_shape.rank() - 2}); + return sign_diag * log_abs_det * + Select(ConvertElementType(Rem(num_rows - num_triangle_rows, + ScalarLike(num_triangle_rows, 2)), + PRED), + ScalarLike(sign_diag, -1.0), ScalarLike(sign_diag, 1.0)); + }); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/logdet_test.cc b/tensorflow/compiler/xla/client/lib/logdet_test.cc index 54af41f77f6..319d819ed98 100644 --- a/tensorflow/compiler/xla/client/lib/logdet_test.cc +++ b/tensorflow/compiler/xla/client/lib/logdet_test.cc @@ -51,6 +51,26 @@ XLA_TEST_F(LogDetTest, Simple) { xla::ErrorSpec(1e-4)); } +XLA_TEST_F(LogDetTest, SimpleTriangle) { + xla::XlaBuilder builder(TestName()); + + xla::Array2D a_vals({ + {4, 6, 8, 10}, + {4, -39, 62, 73}, + {0, 0, -146, 166}, + {4, 6, 8, 320}, + }); + + float expected = -15.9131355f; + + xla::XlaOp a; + auto a_data = CreateR2Parameter(a_vals, 0, "a", &builder, &a); + xla::LogDet(a); + + ComputeAndCompareR0(&builder, expected, {a_data.get()}, + xla::ErrorSpec(1e-4)); +} + XLA_TEST_F(LogDetTest, SimpleBatched) { xla::XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/client/lib/lu_decomposition.cc b/tensorflow/compiler/xla/client/lib/lu_decomposition.cc new file mode 100644 index 00000000000..2920b6f56b5 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/lu_decomposition.cc @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/client/lib/lu_decomposition.h" + +#include + +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +LuDecompositionResult LuDecomposition(XlaOp a) { + XlaBuilder* builder = a.builder(); + XlaOp result = builder->ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); + const int ndims = a_shape.rank(); + TF_RET_CHECK(ndims >= 2); + const int64 m = ShapeUtil::GetDimension(a_shape, -2); + const int64 n = ShapeUtil::GetDimension(a_shape, -1); + const int num_batch_dims = a_shape.dimensions().size() - 2; + const std::vector batch_dims( + a_shape.dimensions().begin(), + a_shape.dimensions().begin() + num_batch_dims); + + std::vector pivot_dims = batch_dims; + pivot_dims.push_back(std::min(m, n)); + std::vector perm_dims = batch_dims; + perm_dims.push_back(m); + Shape lu_shape = ShapeUtil::MakeTupleShape( + {a_shape, ShapeUtil::MakeShape(S32, pivot_dims), + ShapeUtil::MakeShape(S32, perm_dims)}); + // The TPU compiler has a rewrite pass that lowers an LuDecomposition + // CustomCall. 
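// Editor's note (illustrative, not part of the patch): the identity behind the
// QR-based LogDet above. With A = Q R from a Householder QR,
//   det(A) = det(Q) * det(R),  where det(R) = prod(diag(R)) since R is
//   triangular, and det(Q) = (-1)^h with h the number of non-trivial
//   Householder reflections.
// Hence log|det(A)| = sum(log|diag(R)|) and
//   sign(det(A)) = (-1)^h * prod(sign(diag(R))).
// As I read the code, num_triangle_rows counts rows that are already in
// triangular form (which need no reflection), and the Select over
// Rem(num_rows - num_triangle_rows, 2) supplies the (-1)^h factor, so the
// returned value combines the determinant's sign with log|det(A)|.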
+ // TODO(phawkins): upgrade LU decomposition to a first-class HLO operator + // and implement it on other backends. + return CustomCall(a.builder(), "LuDecomposition", {a}, lu_shape); + }); + return LuDecompositionResult{GetTupleElement(result, 0), + GetTupleElement(result, 1), + GetTupleElement(result, 2)}; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/lu_decomposition.h b/tensorflow/compiler/xla/client/lib/lu_decomposition.h new file mode 100644 index 00000000000..3f5703510a3 --- /dev/null +++ b/tensorflow/compiler/xla/client/lib/lu_decomposition.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LU_DECOMPOSITION_H_ +#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LU_DECOMPOSITION_H_ + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +// Computes the LU decomposition with partial pivoting of a batch of matrices. +// +// Given a (batched) matrix a with shape [..., m, n], computes the matrix +// decomposition A = P @ L @ U where P is a permutation matrix, L is a +// lower-triangular matrix with unit diagonal entries, and U is an +// upper-triangular matrix. +// +// L and U are returned as a single matrix [..., m, n] containing both L and U +// packed in the same array. The unit diagonal of L is not represented +// explicitly. +// +// The permutation matrix P is returned in two forms, both as `pivots`, which is +// an s32[..., min(m, n)] array that describes a sequence of row-swaps in the +// style of LAPACK's xGETRF API, and `permutation`, which is a s32[..., m] array +// which gives the permutation to apply to the rows. We return both +// representations because they are each useful for different purposes; `pivots` +// is useful for computing the sign of a determinant, whereas `permutation` can +// be used via a Gather operation to permute the rows of a matrix. +// +// This method is only implemented on TPU at the moment. +// TODO(b/168208200): the implementation only supports F32 arrays. Handle the +// complex case. +struct LuDecompositionResult { + // The LU decomposition, with both L and U packed into an array with shape + // [..., m, n]. + XlaOp lu; + // An array of shape s32[..., min(m, n)] containing the pivot rows. + XlaOp pivots; + // An array of shape s32[..., m], containing an another representation of the + // pivots as a permutation. 
+ XlaOp permutation; +}; + +LuDecompositionResult LuDecomposition(XlaOp a); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_LU_DECOMPOSITION_H_ diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index cd9f88a74ce..76cc6f0159b 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -203,7 +203,7 @@ static XlaOp ErfcImpl32(XlaOp x) { // Precondition: abs(x) <= 1. Otherwise, use ErfcImpl. // // This follows Cephes's f32 implementation of erf. -static XlaOp ErfImpl32(XlaOp x) { +static XlaOp ErfImpl32Cephes(XlaOp x) { // Coefficients for by erf(f32), from Cephes. // // erf(x) = x P(x^2), 0 < x < 1 @@ -291,11 +291,31 @@ XlaOp Erfc(XlaOp x) { // (not surprising!), so upcast to f32 in this case. return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl32(x), - ScalarLike(x, 1) - ErfImpl32(x)); + ScalarLike(x, 1) - ErfImpl32Cephes(x)); }); }); } +// Compute a polynomial approximation of the error function. +// This is the same approximation used by Eigen. +static XlaOp ErfImpl32(XlaOp x) { + static const std::array kAlpha{ + -2.72614225801306e-10f, 2.77068142495902e-08f, -2.10102402082508e-06f, + -5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f, + -1.60960333262415e-02f, + }; + + static const std::array kBeta{ + -1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f, + -7.37332916720468e-03f, -1.42647390514189e-02f, + }; + + x = Clamp(ScalarLike(x, -4.f), x, ScalarLike(x, 4.f)); + auto x2 = x * x; + return x * EvaluatePolynomial(x2, kAlpha) / + EvaluatePolynomial(x2, kBeta); +} + XlaOp Erf(XlaOp x) { auto& b = *x.builder(); return b.ReportErrorOrReturn([&]() -> StatusOr { @@ -310,10 +330,8 @@ XlaOp Erf(XlaOp x) { } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. - return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { - return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl32(x), - ScalarLike(x, 1) - ErfcImpl32(x)); - }); + return DoWithUpcastToF32(x, {BF16, F16}, + [](XlaOp x) { return ErfImpl32(x); }); }); } @@ -1832,4 +1850,139 @@ XlaOp RegularizedIncompleteBeta(XlaOp a, XlaOp b, XlaOp x) { }); } +XlaOp Polygamma(XlaOp n, XlaOp x) { + auto& builder = *x.builder(); + auto doit = [](XlaOp n, XlaOp x, PrimitiveType type) -> XlaOp { + XlaOp n_plus_one = n + ScalarLike(n, 1.); + XlaOp sign = + (ScalarLike(n, 2.) * Rem(n, ScalarLike(n, 2.)) - ScalarLike(n, 1.)); + + const double nan = std::numeric_limits::quiet_NaN(); + + XlaOp output = Select(Eq(n, ScalarLike(n, 0.)), Digamma(x), + sign * Exp(Lgamma(n_plus_one)) * Zeta(n_plus_one, x)); + // Check that n is a natural number. 
+ output = Select(Or(Ne(n, Floor(n)), Lt(n, ScalarLike(n, 0.))), + ScalarLike(n, nan), output); + return output; + }; + return builder.ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto n_shape, builder.GetShape(n)); + TF_ASSIGN_OR_RETURN(auto x_shape, builder.GetShape(x)); + if (n_shape != x_shape) { + return InvalidArgument( + "Arguments to Polygamma must have equal shapes and types; " + "got %s and %s", + n_shape.ToString(), x_shape.ToString()); + } + TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Zeta", x)); + bool needs_upcast = + n_shape.element_type() == F16 || x_shape.element_type() == BF16; + + if (needs_upcast) { + n = ConvertElementType(n, F32); + x = ConvertElementType(x, F32); + } + XlaOp result = doit(n, x, n_shape.element_type()); + if (needs_upcast) { + result = ConvertElementType(result, n_shape.element_type()); + } + return result; + }); +} + +XlaOp Zeta(XlaOp x, XlaOp q) { + auto& builder = *x.builder(); + auto doit = [&builder](XlaOp x, XlaOp q, PrimitiveType type) -> XlaOp { + // (2k) ! / B_{2k}, where B_{2k} are the Bernoulli numbers. + // These are ordered in reverse. + static const std::array kZetaCoeffs{ + -7.1661652561756670113e18, + 1.8152105401943546773e17, + -4.5979787224074726105e15, + 1.1646782814350067249e14, + -2.950130727918164224e12, + 7.47242496e10, + -1.8924375803183791606e9, + 47900160.0, + -1209600.0, + 30240.0, + -720.0, + 12.0, + }; + + // For speed we'll always use 9 iterations for the initial series estimate, + // and a 12 term expansion for the Euler-Maclaurin formula. + + XlaOp a = q; + XlaOp neg_power = ScalarLike(a, 0.); + XlaOp initial_sum = Pow(q, Neg(x)); + for (int i = 0; i < 9; ++i) { + a = a + ScalarLike(a, 1.); + neg_power = Pow(a, Neg(x)); + initial_sum = initial_sum + neg_power; + } + a = a + ScalarLike(a, 1.); + neg_power = Pow(a, Neg(x)); + XlaOp s = initial_sum + neg_power * a / (x - ScalarLike(a, 1.)); + XlaOp a_inverse_square = Reciprocal(Square(a)); + XlaOp horner_sum = ScalarLike(a, 0.); + XlaOp factor = ScalarLike(a, 1.); + // Use Horner's rule for this. + // Note this differs from Cephes which does a 'naive' polynomial evaluation. + // Using Horner's rule allows to avoid some NaN's and Infs from happening, + // resulting in more numerically stable code. + for (int i = 0; i < 11; ++i) { + factor = + (x - ScalarLike(x, 22 - 2 * i)) * (x - ScalarLike(x, 21 - 2 * i)); + horner_sum = factor * a_inverse_square * + (horner_sum + ScalarLike(a, 1. / kZetaCoeffs[i])); + } + s = s + neg_power * + (ScalarLike(neg_power, 0.5) + + x / a * (ScalarLike(a, 1. / kZetaCoeffs[11]) + horner_sum)); + + const double nan = std::numeric_limits::quiet_NaN(); + const double inf = std::numeric_limits::infinity(); + // Use the initial zeta sum without the correction term coming + // from Euler-Maclaurin if it is accurate enough. + XlaOp output = + Select(Lt(Abs(neg_power), Abs(initial_sum) * Epsilon(&builder, type)), + initial_sum, s); + // This is the harmonic series. + output = Select(Eq(x, ScalarLike(x, 1.)), ScalarLike(x, inf), output); + // Function is not defined for x < 1. + output = Select(Lt(x, ScalarLike(x, 1.)), ScalarLike(x, nan), output); + // If q <= 0, then when q is an integer or x is not an integer, this is + // NaN. 
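// Editor's note (illustrative, not part of the patch): the identities the new
// helpers above rely on, as I read the code. Polygamma uses
//   psi^(n)(x) = (-1)^(n+1) * n! * zeta(n + 1, x)   for integer n >= 1,
// dispatching n == 0 to Digamma and mapping non-natural n to NaN. Zeta(x, q)
// is the Hurwitz zeta function
//   zeta(x, q) = sum_{k >= 0} 1 / (q + k)^x,
// evaluated as a truncated direct sum plus an Euler-Maclaurin tail correction
// whose coefficients are the (2k)!/B_{2k} values in kZetaCoeffs.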
+ XlaOp domain_error = And(Le(q, ScalarLike(x, 0.)), Ne(x, Floor(x))); + XlaOp negative_integer_q = And(Le(q, ScalarLike(x, 0.)), Eq(q, Floor(q))); + output = Select(negative_integer_q, ScalarLike(x, inf), output); + output = Select(domain_error, ScalarLike(x, nan), output); + return output; + }; + return builder.ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(auto x_shape, builder.GetShape(x)); + TF_ASSIGN_OR_RETURN(auto q_shape, builder.GetShape(q)); + if (x_shape != q_shape) { + return InvalidArgument( + "Arguments to Zeta must have equal shapes and types; got %s and %s", + x_shape.ToString(), q_shape.ToString()); + } + TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Zeta", x)); + bool needs_upcast = + x_shape.element_type() == F16 || x_shape.element_type() == BF16; + + if (needs_upcast) { + x = ConvertElementType(x, F32); + q = ConvertElementType(q, F32); + } + XlaOp result = doit(x, q, x_shape.element_type()); + if (needs_upcast) { + result = ConvertElementType(result, x_shape.element_type()); + } + return result; + }); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h index f03348c0a57..e6b5ac992cc 100644 --- a/tensorflow/compiler/xla/client/lib/math.h +++ b/tensorflow/compiler/xla/client/lib/math.h @@ -72,6 +72,12 @@ XlaOp RandomGammaGrad(XlaOp a, XlaOp x); // Computes an approximation of the complementary incomplete gamma function. XlaOp Igammac(XlaOp a, XlaOp x); +// Computes the Polygamma of two arguments. +XlaOp Polygamma(XlaOp n, XlaOp x); + +// Computes the Riemann zeta function of two arguments. +XlaOp Zeta(XlaOp x, XlaOp q); + // Rounds the given number to even when the number is equidistant between two // integers. XlaOp RoundToEven(XlaOp x); diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc index ec1cc7e0487..dbb73602801 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.cc +++ b/tensorflow/compiler/xla/client/lib/matrix.cc @@ -395,7 +395,6 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, } DotDimensionNumbers dnums; - std::vector lhs_outer_dims; auto is_batch_dim = [&](int64 d) { return x_map.contains(d) && y_map.contains(d) && output_map.contains(d); }; @@ -408,11 +407,13 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, }; absl::InlinedVector rhs_outer_dims; + absl::InlinedVector lhs_outer_dims; absl::InlinedVector rhs_delete_dims; absl::InlinedVector lhs_delete_dims; for (int64 i = 0; i < x_rank; ++i) { auto dim_name = x_config[i]; const int64 rhs_dim = rhs_dimension_number(dim_name); + if (is_batch_dim(dim_name)) { if (x_shape.dimensions(i) == y_shape.dimensions(rhs_dim)) { dnums.add_lhs_batch_dimensions(i); @@ -448,30 +449,34 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, } absl::c_sort(rhs_outer_dims); - absl::InlinedVector output_transpose_dims; - absl::InlinedVector output_reduce_dims; - auto output_dimension_number = [&](int64 d) { + + auto output_dimension_number = [&](int64 d) -> absl::optional { auto pos = absl::c_find(output_config, d); if (pos == output_config.end()) { - const int64 dim = - output_transpose_dims.size() + output_reduce_dims.size(); - output_reduce_dims.push_back(dim); - } else { - output_transpose_dims.push_back(pos - output_config.begin()); + return absl::nullopt; } + return pos - output_config.begin(); }; for (auto d : dnums.lhs_batch_dimensions()) { - output_dimension_number(x_config[d]); + 
output_transpose_dims.push_back(*output_dimension_number(x_config[d])); } for (auto d : lhs_outer_dims) { - output_dimension_number(x_config[d]); + if (auto output_dim = output_dimension_number(x_config[d])) { + output_transpose_dims.push_back(*output_dim); + continue; + } + lhs_delete_dims.push_back(d); } for (auto d : rhs_outer_dims) { - output_dimension_number(y_config[d]); + if (auto output_dim = output_dimension_number(y_config[d])) { + output_transpose_dims.push_back(*output_dim); + continue; + } + rhs_delete_dims.push_back(d); } const int64 transpose_rank = output_transpose_dims.size(); @@ -482,29 +487,31 @@ xla::XlaOp Einsum(xla::XlaOp x, absl::Span x_config, xla::XlaOp y, // Remove ones that where broadcasted from the x and the y shape and adjust // the dimension numbers that are more minor than those dimensions. + absl::c_sort(lhs_delete_dims); DeleteDimsFromContainer(lhs_delete_dims, &x_shape, dnums.mutable_lhs_batch_dimensions(), dnums.mutable_lhs_contracting_dimensions()); + + absl::c_sort(rhs_delete_dims); DeleteDimsFromContainer(rhs_delete_dims, &y_shape, dnums.mutable_rhs_batch_dimensions(), dnums.mutable_rhs_contracting_dimensions()); if (!lhs_delete_dims.empty()) { - x = Reshape(x, x_shape.dimensions()); + x = Reduce(x, ScalarLike(x, 0), + CreateScalarAddComputation(x_shape.element_type(), builder), + lhs_delete_dims); } if (!rhs_delete_dims.empty()) { - y = Reshape(y, y_shape.dimensions()); + y = Reduce(y, ScalarLike(y, 0), + CreateScalarAddComputation(y_shape.element_type(), builder), + rhs_delete_dims); } PrecisionConfig precision_proto; precision_proto.add_operand_precision(precision); precision_proto.add_operand_precision(precision); auto dot = DotGeneral(x, y, dnums, &precision_proto); - if (!output_reduce_dims.empty()) { - dot = Reduce(dot, ScalarLike(dot, 0), - CreateScalarAddComputation(x_shape.element_type(), builder), - output_reduce_dims); - } dot = Transpose(dot, transpose_dims); if (transpose_rank == output_rank) { return dot; diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index cc5639f1be1..60086773d18 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -487,6 +487,10 @@ std::pair BoxMullerTransform(XlaOp x0, XlaOp x1) { } // namespace +XlaOp PhiloxIncreaseCounter(XlaOp counter, XlaOp delta) { + return Uint128ToOp(Uint128AddUint64(Uint128FromOp(counter), delta)); +} + RngOutput ThreeFryBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape) { PrimitiveType type = shape.element_type(); diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h index 107fd884de3..20ad223403d 100644 --- a/tensorflow/compiler/xla/client/lib/prng.h +++ b/tensorflow/compiler/xla/client/lib/prng.h @@ -89,6 +89,9 @@ RngOutput NormalFloatingPointDistribution(XlaOp key, XlaOp initial_state, xla::XlaOp ConcatScalars(xla::XlaBuilder* builder, absl::Span scalars); +// Increases Philox counter (an uint128) by a delta (an uint64). 
+xla::XlaOp PhiloxIncreaseCounter(xla::XlaOp counter, xla::XlaOp delta); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_ diff --git a/tensorflow/compiler/xla/client/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc index b2eecbac309..88a17f94a24 100644 --- a/tensorflow/compiler/xla/client/lib/qr.cc +++ b/tensorflow/compiler/xla/client/lib/qr.cc @@ -35,301 +35,7 @@ namespace xla { namespace { -std::vector ConcatVectors(absl::Span xs, - absl::Span ys) { - std::vector output(xs.size() + ys.size()); - std::copy(xs.begin(), xs.end(), output.begin()); - std::copy(ys.begin(), ys.end(), output.begin() + xs.size()); - return output; -} -// Computes a Householder reflection of the form: -// H = I - tau v v.T. -// such that -// H . ( x1 ) = ( x1 ) -// ( x2 ) = ( x2 ) -// ( ... ) = ( ... ) -// ( xk ) = ( beta ) -// ( ... ) ( 0 ) -// ( ... ) ( 0 ) -// Unlike the usual formulation, we allow the caller to supply 'k' rather than -// only providing the relevant part of 'x' to maintain XLA's static shape -// invariant. In addition, the implementation supports batching. -// Pseudo-code, without batching: -// alpha = x[k] -// x_copy = np.copy(x) -// x_copy[:k+1] = 0 -// xnorm = norm2(x_copy) -// if xnorm == 0: -// beta = alpha -// tau = 0 -// v = np.zeros_like(x) -// else: -// beta = - np.sign(alpha) * dlapy2(alpha, xnorm) -// tau = (beta - alpha) / beta -// v = x / (alpha - beta) -// v[k] = 1 -// return (v, tau, beta) -// TODO(phawkins): LAPACK's xLARFG implementation has code for handling -// overflows in the norm/beta calculations. Perhaps do the same here. -Status House(XlaOp x, XlaOp k, absl::Span batch_dims, - const int64 m, XlaOp* v, XlaOp* tau, XlaOp* beta) { - XlaBuilder* const builder = x.builder(); - TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); - const PrimitiveType type = x_shape.element_type(); - - std::vector batch_dim_ids(batch_dims.size()); - std::iota(batch_dim_ids.begin(), batch_dim_ids.end(), 0); - const int64 minor_dim = batch_dims.size(); - - XlaOp zero = ScalarLike(x, 0.0); - XlaOp one = ScalarLike(x, 1.0); - - // alpha = x[k] - XlaOp alpha = Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims); - - // Compute x[k+1:] (padded with zeros in elements 0..k) - XlaOp iota = Iota(builder, S32, m); - XlaOp x_after_k = Mul(x, ConvertElementType(Gt(iota, k), type), - /*broadcast_dimensions=*/{minor_dim}); - - // sigma = np.dot(x[k+1:], x[k+1:]) - auto sigma = Reduce(x_after_k * x_after_k, zero, - CreateScalarAddComputation(type, builder), {minor_dim}); - // mu = np.sqrt(x[k]*x[k] + sigma) - auto mu = Sqrt(Square(alpha) + sigma); - - auto sigma_is_zero = Eq(sigma, zero); - - *beta = Select(sigma_is_zero, alpha, Select(Lt(alpha, zero), one, -one) * mu); - *tau = Select(sigma_is_zero, Broadcast(zero, batch_dims), - (*beta - alpha) / *beta); - auto divisor = - Select(sigma_is_zero, Broadcast(one, batch_dims), alpha - *beta); - - auto e_k = Broadcast(ConvertElementType(Eq(iota, k), type), - std::vector(batch_dims.size(), 1)); - - // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor - // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor. - *v = e_k + Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids); - return Status::OK(); -} - -// Householder QR decomposition. Algorithm 5.2.1 from Golub and Van -// Loan "Matrix Computations", 4th Edition. This is an unblocked implementation -// used as an inner routine of the blocked implementation. 
-// Algorithm is adapted slightly so the shapes inside the loop are static, at -// the cost of some redundant computation. Since this is used as an inner block -// kernel, accumulates the Householder transformations (vs, taus) rather than -// the matrix q. -// Equivalent Python code, without batching: -// def qr(a): -// m = a.shape[0] -// n = a.shape[1] -// vs = np.zeros([m, n]) -// taus = np.zeros([n]) -// for j in xrange(min(m, n)): -// v, tau, beta = house(a[:, j], j) -// # Unusually, we apply the Householder transformation to the entirety of -// # a, wasting FLOPs to maintain the static shape invariant that XLA -// # requires. For columns that precede j this has no effect. -// a[:, :] -= tau * np.dot(v[:, np.newaxis], -// np.dot(v[np.newaxis, :], a[:, :])) -// # Form column j explicitly rather than relying on the precision of the -// # Householder update. -// a[j, j] = beta -// a[j+1:, j] = np.zeros([m - j - 1], dtype=a.dtype) -// vs[:, j] = v -// taus[j] = tau -// return (q, vs, taus) -struct QRBlockResult { - // The factored R value - XlaOp r; - - // Representation of the Householder matrices I - beta v v.T - XlaOp taus; // Shape: [..., n] - XlaOp vs; // Shape: [..., m, n] -}; -StatusOr QRBlock(XlaOp a, PrecisionConfig::Precision precision) { - XlaBuilder* builder = a.builder(); - TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); - const int num_dims = a_shape.rank(); - if (num_dims < 2) { - return InvalidArgument("Argument to QR must have rank >= 2; got shape %s", - a_shape.ToString()); - } - PrimitiveType type = a_shape.element_type(); - - const int64 m = ShapeUtil::GetDimension(a_shape, -2); - const int64 n = ShapeUtil::GetDimension(a_shape, -1); - - const int64 num_batch_dims = num_dims - 2; - std::vector batch_dims(num_batch_dims); - for (int i = 0; i < num_batch_dims; ++i) { - batch_dims[i] = ShapeUtil::GetDimension(a_shape, i); - } - - std::vector batch_dim_indices(num_batch_dims); - std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); - - auto qr_body_fn = [&](XlaOp j, absl::Span values, - XlaBuilder* builder) -> StatusOr> { - auto a = values[0]; - auto vs = values[1]; - auto taus = values[2]; - - // v, beta = house(a[:, j], j) - auto x = DynamicSliceInMinorDims(a, {j}, {1}); - XlaOp v, tau, beta; - TF_RETURN_IF_ERROR(House(Collapse(x, {num_dims - 2, num_dims - 1}), j, - batch_dims, m, &v, &tau, &beta)); - - std::vector shape = batch_dims; - shape.push_back(1); - shape.push_back(m); - auto v_broadcast = Reshape(v, shape); - // a[:, :] -= tau * np.dot(v[:, np.newaxis], - // np.dot(v[np.newaxis, :], a[:, :])) - auto vva = BatchDot(v_broadcast, a, precision); - vva = BatchDot(v_broadcast, true, vva, false, precision); - a = a - Mul(tau, vva, - /*broadcast_dimensions=*/batch_dim_indices); - - // It is more precise to populate column 'k' explicitly, rather than - // computing it implicitly by applying the Householder transformation. 
- // a[k,k] = beta - // a[k+1:,k] = np.zeros([m-k-1], dtype=a.dtype) - auto iota = Reshape(Iota(a.builder(), S32, m), {m, 1}); - auto predecessor_mask = ConvertElementType(Lt(iota, j), type); - auto mask = Broadcast(ConvertElementType(Eq(iota, j), type), - std::vector(batch_dims.size(), 1)); - auto new_x = Mul(x, predecessor_mask, - /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) + - Mul(beta, mask, /*broadcast_dimensions=*/batch_dim_indices); - // Update a[:,j] - std::vector dim_ids(num_dims); - std::iota(dim_ids.begin(), dim_ids.end(), 0); - new_x = BroadcastInDim(new_x, ConcatVectors(batch_dims, {m, n}), - /*broadcast_dimensions=*/dim_ids); - const int64 minor_dim = batch_dims.size(); - auto iota_mn = Iota( - builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), - minor_dim + 1); - a = Select(Eq(iota_mn, j), new_x, a); - - // vs[:, j] = v - std::vector vs_broadcast_dims(batch_dims.size() + 1); - std::iota(vs_broadcast_dims.begin(), vs_broadcast_dims.end(), 0); - auto vs_zeros = ZerosLike(vs); - auto vs_update = Select( - Eq(iota_mn, j), - Add(vs_zeros, v, /*broadcast_dimensions=*/vs_broadcast_dims), vs_zeros); - vs = vs + vs_update; - - // taus[j] = tau - std::vector tau_broadcast_dims(batch_dims.size()); - std::iota(tau_broadcast_dims.begin(), tau_broadcast_dims.end(), 0); - - auto iota_n = - Iota(builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {n})), - minor_dim); - auto taus_zeros = ZerosLike(taus); - auto taus_update = Select( - Eq(iota_n, j), - Add(taus_zeros, tau, /*broadcast_dimensions=*/tau_broadcast_dims), - taus_zeros); - taus = taus + taus_update; - return std::vector{a, vs, taus}; - }; - - auto vs = Zeros( - builder, ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n}))); - auto taus = Zeros(builder, - ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {n}))); - - TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(std::min(m, n), S32, qr_body_fn, - {a, vs, taus}, "qr", builder)); - - QRBlockResult result; - result.r = values[0]; - result.vs = values[1]; - result.taus = values[2]; - return result; -} - -// Computes W and Y such that I-WY is equivalent to the sequence of Householder -// transformations given by vs and taus. -// Golub and van Loan, "Matrix Computations", algorithm 5.1.2. -// Y = np.zeros([m, n]) -// W = np.zeros([m, n]) -// Y[:, 0] = vs[:, 0] -// W[:, 0] = -taus[0] * vs[:, 0] -// for j in xrange(1, n): -// v = vs[:, j] -// z = -taus[j] * v - taus[j] * np.dot(W, np.dot(Y.T, v)) -// W[:, j] = z -// Y[:, j] = v -// return W -// There is no need to return Y since at termination of the loop it is equal to -// vs. -StatusOr ComputeWYRepresentation(PrimitiveType type, - absl::Span batch_dims, - XlaOp vs, XlaOp taus, int64 m, int64 n, - PrecisionConfig::Precision precision) { - std::vector batch_dim_indices(batch_dims.size()); - std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); - int64 n_index = batch_dims.size() + 1; - - auto body_fn = [&](XlaOp j, absl::Span values, - XlaBuilder* builder) -> StatusOr> { - // w has shape [..., m, n] - auto w = values[0]; - const auto vs = values[1]; - const auto taus = values[2]; - - // Want j values in range [1, ... n). 
- j = j + ConstantR0(builder, 1); - // vs has shape [..., m, 1] - auto v = DynamicSliceInMinorDims(vs, {j}, {1}); - // beta has shape [..., 1] - auto beta = DynamicSliceInMinorDims(taus, {j}, {1}); - - auto iota_mn = Iota( - builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), - n_index); - - // y has shape [..., m, n] - auto y = Select(Ge(iota_mn, j), ZerosLike(vs), vs); - - // yv has shape [..., n, 1] - auto yv = BatchDot(y, true, v, false, precision); - // wyv has shape [..., m, 1] - auto wyv = BatchDot(w, yv, precision); - - auto z = Mul( - -beta, v + wyv, - /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index})); - - w = DynamicUpdateSliceInMinorDims(w, z, {j}); - - return std::vector{w, vs, taus}; - }; - - XlaBuilder* builder = vs.builder(); - auto w = Zeros(builder, - ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {m, n}))); - auto v = SliceInMinorDims(vs, {0}, {1}); - auto beta = SliceInMinorDims(taus, {0}, {1}); - auto bv = - Mul(-beta, v, - /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {n_index})); - w = UpdateSliceInMinorDims(w, bv, {0}); - - TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(n - 1, S32, body_fn, - {w, vs, taus}, "wy", builder)); - return values[0]; -} } // namespace @@ -340,14 +46,12 @@ StatusOr ComputeWYRepresentation(PrimitiveType type, // q = np.eye(m) // for i in xrange(0, min(m, n), block_size): // k = min(block_size, min(m, n) - s) -// (a, vs, taus) = qr(a[i:, i:i+k]) -// y = vs -// w = ComputeWYRepresentation(vs, taus, m-i, k) -// a[i:, i+r:] += np.dot(y, np.dot(w.T, a[i:, i+k:])) -// q[:, i:] += np.dot(q[:, i:], np.dot(w, y.T)) +// (a, taus) = qr(a[i:, i:i+k]) +// y = np.eye(m, n) + np.tril(a, -1) +// t = CompactWYRepresentation(vs, taus, m-i, k) +// a[i:, i+k:] += (y @ t.T) @ (y.T @ a[i:, i+k:]) +// q[:, i:] += (q[:, i:] @ y) @ (y @ t.T).T // return (q, a) -// TODO(phawkins): consider using UT transformations (in the form I - V U V') -// rather than WY transformations. StatusOr QRDecomposition( XlaOp a, bool full_matrices, int64 block_size, PrecisionConfig::Precision precision) { @@ -358,8 +62,6 @@ StatusOr QRDecomposition( return InvalidArgument("Arguments to QR must have rank >= 2: got shape %s", a_shape.ToString()); } - PrimitiveType type = a_shape.element_type(); - const int64 m = ShapeUtil::GetDimension(a_shape, -2); const int64 n = ShapeUtil::GetDimension(a_shape, -1); const int64 p = std::min(m, n); @@ -369,53 +71,21 @@ StatusOr QRDecomposition( block_size); } - const int64 num_batch_dims = num_dims - 2; - std::vector batch_dims(num_batch_dims); - for (int i = 0; i < num_batch_dims; ++i) { - batch_dims[i] = ShapeUtil::GetDimension(a_shape, i); - } + Shape q_shape = a_shape; + q_shape.mutable_dimensions().back() = m; - auto q = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims); - for (int64 i = 0; i < p; i += block_size) { - int64 k = std::min(block_size, p - i); - - auto a_block = SliceInMinorDims(a, {i, i}, {m, i + k}); - TF_ASSIGN_OR_RETURN(auto qr_block, QRBlock(a_block, precision)); - - a = UpdateSliceInMinorDims(a, qr_block.r, {i, i}); - - // Compute the I-WY block representation of a product of Householder - // matrices. 
- TF_ASSIGN_OR_RETURN( - auto w, ComputeWYRepresentation(type, batch_dims, qr_block.vs, - qr_block.taus, m - i, k, precision)); - auto y = qr_block.vs; - - // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:])) - auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n}); - auto a_update = BatchDot(w, true, a_panel, false, precision); - a_update = BatchDot(y, a_update, precision); - a_panel = a_panel + a_update; - a = UpdateSliceInMinorDims(a, a_panel, {i, i + k}); - - // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T)) - auto q_panel = SliceInMinorDims(q, {0, i}, {m, m}); - auto q_update = BatchDot(q_panel, w, precision); - q_update = BatchDot(q_update, false, y, true, precision); - q_panel = q_panel + q_update; - q = UpdateSliceInMinorDims(q, q_panel, {0, i}); - } - QRDecompositionResult result; + Shape qr_shape = ShapeUtil::MakeTupleShape({q_shape, a_shape}); + auto qr = CustomCall(a.builder(), "QrDecomposition", {a}, qr_shape); + auto q = GetTupleElement(qr, 0); + auto r = GetTupleElement(qr, 1); // full_matrices is false when only a partial result in needed. Slice to the // needed dimensions here. if (!full_matrices) { q = SliceInMinorDims(q, {0, 0}, {m, p}); - a = SliceInMinorDims(a, {0, 0}, {p, n}); + r = SliceInMinorDims(r, {0, 0}, {p, n}); } - result.q = q; - result.r = a; - return result; + return QRDecompositionResult{q, r}; } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/qr_test.cc b/tensorflow/compiler/xla/client/lib/qr_test.cc index a61f243e126..f1d2e4ddb1c 100644 --- a/tensorflow/compiler/xla/client/lib/qr_test.cc +++ b/tensorflow/compiler/xla/client/lib/qr_test.cc @@ -27,12 +27,15 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" namespace { using QrTest = xla::ClientLibraryTestBase; XLA_TEST_F(QrTest, Simple) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); xla::XlaBuilder builder(TestName()); xla::Array2D a_vals({ @@ -61,6 +64,8 @@ XLA_TEST_F(QrTest, Simple) { } XLA_TEST_F(QrTest, ZeroDiagonal) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); xla::XlaBuilder builder(TestName()); xla::Array2D a_vals({ @@ -88,6 +93,8 @@ XLA_TEST_F(QrTest, ZeroDiagonal) { } XLA_TEST_F(QrTest, SimpleBatched) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); xla::XlaBuilder builder(TestName()); xla::Array3D a_vals({ diff --git a/tensorflow/compiler/xla/client/lib/slicing_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc index 8e2e713c45c..10e27285f02 100644 --- a/tensorflow/compiler/xla/client/lib/slicing_test.cc +++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc @@ -206,10 +206,12 @@ XLA_TEST_F(SlicingTest, DoubleEmptyIndexSelect) { xla::XlaOp input, index; Literal l(ShapeUtil::MakeShape(F32, {0, 1, 2, 0})); Literal i(ShapeUtil::MakeShape(S32, {0})); - auto input_data = - CreateParameterAndTransferLiteral(0, l, "input", &builder, &input); - auto index_data = - CreateParameterAndTransferLiteral(1, i, "index", &builder, &index); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, + CreateParameterAndTransferLiteral(0, l, "input", &builder, &input)); + TF_ASSERT_OK_AND_ASSIGN( + auto index_data, + CreateParameterAndTransferLiteral(1, i, "index", &builder, &index)); TorchIndexSelect(input, index, 0); 
ComputeAndCompareLiteral(&builder, l, {input_data.get(), index_data.get()}); } @@ -219,8 +221,9 @@ XLA_TEST_F(SlicingTest, EmptyIndexSelectNonZero) { xla::XlaOp input, index; Literal l(ShapeUtil::MakeShape(F32, {0, 2})); - auto input_data = - CreateParameterAndTransferLiteral(0, l, "input", &builder, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, + CreateParameterAndTransferLiteral(0, l, "input", &builder, &input)); auto index_data = CreateR1Parameter({0, 0, 0}, 1, "index", &builder, &index); TorchIndexSelect(input, index, 0); diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc index 750237c2000..abb0054558f 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.cc +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -16,6 +16,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/sorting.h" #include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/loops.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" @@ -27,6 +30,20 @@ XlaOp TopK(XlaOp input, int64 k) { return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input)); int last_dim = input_shape.dimensions_size() - 1; + int64 last_dim_size = input_shape.dimensions(last_dim); + // TODO(b/148796364): tune these constants for better performance. + const int64 kPerPartitionSize = 8192; // 2^13 + const int64 kLastDimSizeThreshold = 524288; // 2^19 + const int64 kMinNumPartitions = 8; + const int64 kMinimalK = 1000; + if ((k >= kMinimalK) && (k < kPerPartitionSize) && + (kPerPartitionSize / k > 2) && last_dim_size >= kLastDimSizeThreshold) { + int64 num_partitions = + CeilOfRatio(last_dim_size - k, kPerPartitionSize - k); + if (num_partitions >= kMinNumPartitions) { + return TopKWithPartitions(input, k, num_partitions); + } + } Shape iota_shape = ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions())); @@ -80,30 +97,35 @@ XlaOp TopKWithPartitions(XlaOp input, int64 k, int64 num_partitions) { } } - XlaOp values, indices; - for (int64 partition = 0; partition < num_partitions; partition++) { - std::vector start_indices(input_shape.dimensions_size(), 0); - std::vector limit_indices(input_dims.begin(), input_dims.end()); - std::vector strides(input_shape.dimensions_size(), 1); - start_indices[last_dim] = partition * per_partition_size; - limit_indices[last_dim] = - std::min((partition + 1) * per_partition_size, last_dim_size); - // Slice value and indices for this partition.. - XlaOp sliced_input = Slice(input, start_indices, limit_indices, strides); + auto topk_body_fn = + [&](XlaOp partition, absl::Span values_and_indices, + XlaBuilder* builder) -> StatusOr> { + auto values = values_and_indices[0]; + auto indices = values_and_indices[1]; + auto input = values_and_indices[2]; + auto iota_s32 = values_and_indices[3]; + + // Slice value and indices for this partition. + XlaOp start = Mul(Add(partition, ConstantR0(builder, 1)), + ConstantR0(builder, per_partition_size)); + XlaOp sliced_input = + DynamicSliceInMinorDims(input, {start}, {per_partition_size}); XlaOp sliced_indices = - Slice(iota_s32, start_indices, limit_indices, strides); + DynamicSliceInMinorDims(iota_s32, {start}, {per_partition_size}); // Concat with previous results. 
- if (partition > 0) { - sliced_input = ConcatInDim(builder, {values, sliced_input}, last_dim); - sliced_indices = - ConcatInDim(builder, {indices, sliced_indices}, last_dim); - } + sliced_input = ConcatInDim(builder, {values, sliced_input}, last_dim); + sliced_indices = + ConcatInDim(builder, {indices, sliced_indices}, last_dim); // Sort this slice XlaOp sort_result = Sort({sliced_input, sliced_indices}, CreateScalarGtComputation({input_shape.element_type(), S32}, sliced_indices.builder()), - last_dim, /*is_stable=*/true); + last_dim, true); + + std::vector start_indices(input_shape.dimensions_size(), 0); + std::vector limit_indices(input_dims.begin(), input_dims.end()); + std::vector strides(input_shape.dimensions_size(), 1); // Slice topk. start_indices[last_dim] = 0; limit_indices[last_dim] = k; @@ -111,8 +133,42 @@ XlaOp TopKWithPartitions(XlaOp input, int64 k, int64 num_partitions) { limit_indices, strides); indices = Slice(GetTupleElement(sort_result, 1), start_indices, limit_indices, strides); - } - return Tuple(builder, {values, indices}); + return std::vector{values, indices, input, iota_s32}; + }; + + // Get the values and indices for the first topk so that they can + // be passed to the while loop. + std::vector start_indices(input_shape.dimensions_size(), 0); + std::vector limit_indices(input_dims.begin(), input_dims.end()); + std::vector strides(input_shape.dimensions_size(), 1); + start_indices[last_dim] = 0; + limit_indices[last_dim] = per_partition_size; + // Slice value and indices for the first partition. + XlaOp sliced_input = Slice(input, start_indices, limit_indices, strides); + XlaOp sliced_indices = + Slice(iota_s32, start_indices, limit_indices, strides); + // Sort this slice + XlaOp sort_result = + Sort({sliced_input, sliced_indices}, + CreateScalarGtComputation({input_shape.element_type(), S32}, + sliced_indices.builder()), + last_dim, /*is_stable=*/true); + + // Slice topk. + start_indices[last_dim] = 0; + limit_indices[last_dim] = k; + XlaOp values = Slice(GetTupleElement(sort_result, 0), start_indices, + limit_indices, strides); + XlaOp indices = Slice(GetTupleElement(sort_result, 1), start_indices, + limit_indices, strides); + + // Pass the result of the first TopK to the while loop and do + // num_partition - 1 iterations. 
+ TF_ASSIGN_OR_RETURN(auto values_and_indices, + ForEachIndex(num_partitions - 1, S32, topk_body_fn, + {values, indices, input, iota_s32}, + "topk_with_partition", builder)); + return Tuple(builder, {values_and_indices[0], values_and_indices[1]}); }); } diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index e01f6faf59e..e820d5bfe6f 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -118,6 +118,19 @@ XLA_TEST_F(SortingTest, TopK3From8Values5Partitions) { ComputeAndCompareR1(&builder, {7.0, 6.0, 5.0}, {}); } +XLA_TEST_F(SortingTest, DISABLED_TopKLargeInput) { + XlaBuilder builder(TestName()); + Array input({2, 1000000}); + input.FillRandom(1.0f, 2.0f); + auto x = + CreateConstantFromLiteral(LiteralUtil::CreateFromArray(input), &builder); + Array2D expected_array(2, 1000); + expected_array.Fill(2.0f); + xla::GetTupleElement(xla::TopK(x, 1000), 0); + ErrorSpec error_spec(10.0f, 10.0f); + ComputeAndCompareR2(&builder, expected_array, {}, error_spec); +} + XLA_TEST_F(SortingTest, TopK3From8Indices5Partitions) { XlaBuilder builder(TestName()); auto x_rev = diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 1389f548c5d..82a6128025f 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -267,9 +267,9 @@ StatusOr LocalExecutable::RunAsync( } static ShapedBuffer MaybeOwningShapeTreeToShapedBuffer( - Shape const& on_host_shape, const ShapeTree& tree, - se::Platform* platform, int device_ordinal) { - ShapedBuffer result(on_host_shape, tree.shape(), platform, device_ordinal); + const ShapeTree& tree, se::Platform* platform, + int device_ordinal) { + ShapedBuffer result(tree.shape(), platform, device_ordinal); auto it = tree.begin(); auto out_it = result.buffers().begin(); for (; it != tree.end(); ++it, ++out_it) { @@ -299,8 +299,8 @@ StatusOr LocalExecutable::RunAsync( shaped_buffer_ptrs.reserve(arguments.size()); for (size_t i = 0; i < arguments.size(); ++i) { shaped_buffers.push_back(MaybeOwningShapeTreeToShapedBuffer( - *argument_host_shapes[i], arguments[i].Buffers(), - backend_->platform(), stream->parent()->device_ordinal())); + arguments[i].Buffers(), backend_->platform(), + stream->parent()->device_ordinal())); shaped_buffer_ptrs.push_back(&shaped_buffers.back()); } diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 34d78f9d933..41212e69b2e 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -26,6 +26,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/match.h" +#include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/types/span.h" @@ -42,6 +43,7 @@ limitations under the License. 
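
// Illustrative sketch, not part of the patch: the state-threading shape of the
// ForEachIndex call above, as a toy eager combinator. The name ForEachIndexToy is
// invented; the real ForEachIndex in client/lib/loops.h builds an HLO While loop
// over XlaOps instead of running on the host. It shows why the TopK rewrite passes
// {values, indices, input, iota} in and gets the updated tuple back each iteration.
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

using State = std::vector<int64_t>;

State ForEachIndexToy(int64_t num_iterations,
                      const std::function<State(int64_t, const State&)>& body,
                      State initial_state) {
  State state = std::move(initial_state);
  for (int64_t i = 0; i < num_iterations; ++i) state = body(i, state);
  return state;
}

int main() {
  // Carry {running_sum, last_index} through the loop, the way the TopK body
  // carries {values, indices, input, iota_s32}.
  auto body = [](int64_t i, const State& s) -> State { return {s[0] + i, i}; };
  State result = ForEachIndexToy(/*num_iterations=*/5, body, /*initial_state=*/{0, -1});
  std::cout << result[0] << " " << result[1] << "\n";  // prints "10 4"
}
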
#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" namespace xla { @@ -117,14 +119,23 @@ HloComputationProto CreateReduceOr(int64 reducer_id, } return reducer; } + +bool InstrIsSetBound(const HloInstructionProto* instr_proto) { + HloOpcode opcode = StringToHloOpcode(instr_proto->opcode()).ValueOrDie(); + if (opcode == HloOpcode::kCustomCall && + instr_proto->custom_call_target() == "SetBound") { + return true; + } + return false; +} } // namespace namespace internal { -XlaOp XlaBuilderBuildFusion(XlaBuilder* builder, - absl::Span operands, - absl::string_view fusion_kind, - const XlaComputation& fused_computation) { +XlaOp XlaBuilderFriend::BuildFusion(XlaBuilder* builder, + absl::Span operands, + absl::string_view fusion_kind, + const XlaComputation& fused_computation) { return builder->ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; instr.set_fusion_kind(std::string(fusion_kind)); @@ -138,6 +149,21 @@ XlaOp XlaBuilderBuildFusion(XlaBuilder* builder, }); } +XlaOp XlaBuilderFriend::BuildBitcast(XlaBuilder* builder, XlaOp operand, + const Shape& shape) { + return builder->ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return builder->AddInstruction(std::move(instr), HloOpcode::kBitcast, + {operand}); + }); +} + +HloInstructionProto* XlaBuilderFriend::GetInstruction(XlaOp op) { + return &op.builder() + ->instructions_[op.builder()->handle_to_index_[op.handle_]]; +} + } // namespace internal XlaOp operator-(XlaOp x) { return Neg(x); } @@ -293,7 +319,6 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, // GetDimensionSize is always considered constant in XLA -- If a dynamic // dimension is presented, -1 is returned. break; - // Non functional ops. case HloOpcode::kRng: case HloOpcode::kAllReduce: @@ -306,6 +331,11 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle, // cannot be constant. We cannot set is_functional=false in other similar // cases since we're already relying on IsConstant to return true. case HloOpcode::kCustomCall: + if (instr.custom_call_target() == "SetBound") { + // Set bound is considered constant -- the bound is used as the value. + break; + } + TF_FALLTHROUGH_INTENDED; case HloOpcode::kWhile: // TODO(b/32495713): We aren't checking the condition and body // computations themselves. 
@@ -661,8 +691,10 @@ XlaOp XlaBuilder::BinaryOpNoBroadcast(HloOpcode binop, const Shape& shape, StatusOr XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, ComparisonDirection direction) { - return Compare(shape, lhs, rhs, direction, - Comparison::DefaultComparisonType(shape.element_type())); + TF_ASSIGN_OR_RETURN(auto operand_shape, GetShape(lhs)); + return Compare( + shape, lhs, rhs, direction, + Comparison::DefaultComparisonType(operand_shape.element_type())); } StatusOr XlaBuilder::Compare(const Shape& shape, XlaOp lhs, XlaOp rhs, @@ -1692,7 +1724,9 @@ XlaOp XlaBuilder::CustomCall( const string& call_target_name, absl::Span operands, const Shape& shape, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect) { + bool has_side_effect, + absl::Span>> + output_operand_aliasing) { return ReportErrorOrReturn([&]() -> StatusOr { if (absl::StartsWith(call_target_name, "$")) { return InvalidArgument( @@ -1724,7 +1758,8 @@ XlaOp XlaBuilder::CustomCall( } } return CustomCallInternal(call_target_name, operands, shape, opaque, - operand_shapes_with_layout, has_side_effect); + operand_shapes_with_layout, has_side_effect, + output_operand_aliasing); }); } @@ -1732,7 +1767,9 @@ StatusOr XlaBuilder::CustomCallInternal( const string& call_target_name, absl::Span operands, const Shape& shape, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect) { + bool has_side_effect, + absl::Span>> + output_operand_aliasing) { HloInstructionProto instr; *instr.mutable_shape() = shape.ToProto(); instr.set_custom_call_target(call_target_name); @@ -1744,6 +1781,16 @@ StatusOr XlaBuilder::CustomCallInternal( } } instr.set_custom_call_has_side_effect(has_side_effect); + for (const auto& pair : output_operand_aliasing) { + auto aliasing = instr.add_custom_call_output_operand_aliasing(); + aliasing->set_operand_index(pair.second.first); + for (int64 index : pair.second.second) { + aliasing->add_operand_shape_index(index); + } + for (int64 index : pair.first) { + aliasing->add_output_shape_index(index); + } + } return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands); } @@ -1751,7 +1798,9 @@ XlaOp XlaBuilder::CustomCall( const string& call_target_name, absl::Span operands, const XlaComputation& computation, const Shape& shape, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect) { + bool has_side_effect, + absl::Span>> + output_operand_aliasing) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; if (absl::StartsWith(call_target_name, "$")) { @@ -1789,6 +1838,16 @@ XlaOp XlaBuilder::CustomCall( } } AddCalledComputation(computation, &instr); + for (const auto& pair : output_operand_aliasing) { + auto aliasing = instr.add_custom_call_output_operand_aliasing(); + aliasing->set_operand_index(pair.second.first); + for (int64 index : pair.second.second) { + aliasing->add_operand_shape_index(index); + } + for (int64 index : pair.first) { + aliasing->add_output_shape_index(index); + } + } return AddInstruction(std::move(instr), HloOpcode::kCustomCall, operands); }); } @@ -3086,6 +3145,15 @@ StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { case HloOpcode::kConstant: SetInstructionAsConstant(new_instr, id, new_shape, false); break; + case HloOpcode::kCustomCall: + if (instr_proto->custom_call_target() == "SetBound") { + SetInstructionAsConstant(new_instr, id, new_shape, true); + break; + } else { + return InvalidArgument( + "Dynamic inferencing on custom call %s 
is not supported", + instr_proto->DebugString()); + } case HloOpcode::kParameter: SetInstructionAsConstant(new_instr, id, new_shape, true); break; @@ -3149,7 +3217,8 @@ StatusOr XlaBuilder::BuildDynamicInferenceGraph(XlaOp root_op) { TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(instr_proto->opcode())); if (next_operand >= instr_proto->operand_ids_size() || - opcode == HloOpcode::kGetDimensionSize) { + opcode == HloOpcode::kGetDimensionSize || + InstrIsSetBound(instr_proto)) { // No more operands to process, process self. int64 new_id = ++global_id; VLOG(3) << "new_id: " << new_id << "instr: " << instr_proto->name(); @@ -3235,26 +3304,33 @@ StatusOr XlaBuilder::BuildConstantSubGraph( LookUpInstructionByHandle(handle)); if (instr_proto->opcode() == - HloOpcodeString(HloOpcode::kGetDimensionSize)) { - // At this point, BuildConstantSubGraph should never encounter a - // GetDimensionSize with a dynamic dimension. IsConstant check would have - // failed at the beginning of this function. - // - // Replace GetDimensionSize with a Constant representing the static bound - // of the shape. - int64 dimension = instr_proto->dimensions(0); - int64 operand_handle = instr_proto->operand_ids(0); - TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, - LookUpInstructionByHandle(operand_handle)); + HloOpcodeString(HloOpcode::kGetDimensionSize) || + InstrIsSetBound(instr_proto)) { + int32 constant_value = -1; + if (instr_proto->opcode() == + HloOpcodeString(HloOpcode::kGetDimensionSize)) { + // At this point, BuildConstantSubGraph should never encounter a + // GetDimensionSize with a dynamic dimension. IsConstant check would + // have failed at the beginning of this function. + // + // Replace GetDimensionSize with a Constant representing the static + // bound of the shape. 
+ int64 dimension = instr_proto->dimensions(0); + int64 operand_handle = instr_proto->operand_ids(0); + TF_ASSIGN_OR_RETURN(const HloInstructionProto* operand_proto, + LookUpInstructionByHandle(operand_handle)); - int32 constant_dimension_size = -1; - if (!(operand_proto->shape().is_dynamic_dimension(dimension) && - dynamic_dimension_is_minus_one)) { - constant_dimension_size = - static_cast(operand_proto->shape().dimensions(dimension)); + if (!(operand_proto->shape().is_dynamic_dimension(dimension) && + dynamic_dimension_is_minus_one)) { + constant_value = + static_cast(operand_proto->shape().dimensions(dimension)); + } + } else { + TF_RET_CHECK( + absl::SimpleAtoi(instr_proto->backend_config(), &constant_value)); } - Literal literal = LiteralUtil::CreateR0(constant_dimension_size); + Literal literal = LiteralUtil::CreateR0(constant_value); HloInstructionProto const_instr; *const_instr.mutable_shape() = literal.shape().ToProto(); @@ -3286,6 +3362,9 @@ StatusOr XlaBuilder::BuildConstantSubGraph( if (instr_src->opcode() == HloOpcodeString(HloOpcode::kGetDimensionSize)) { continue; } + if (InstrIsSetBound(instr_src)) { + continue; + } auto* instr = entry.add_instructions(); *instr = *instr_src; @@ -3826,31 +3905,39 @@ XlaOp Call(XlaBuilder* builder, const XlaComputation& computation, return builder->Call(computation, operands); } -XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name, - absl::Span operands, const Shape& shape, - const string& opaque, bool has_side_effect) { +XlaOp CustomCall( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape, const string& opaque, + bool has_side_effect, + absl::Span>> + output_operand_aliasing) { return builder->CustomCall(call_target_name, operands, shape, opaque, /*operand_shapes_with_layout=*/absl::nullopt, - has_side_effect); + has_side_effect, output_operand_aliasing); } -XlaOp CustomCallWithComputation(XlaBuilder* builder, - const string& call_target_name, - absl::Span operands, - const XlaComputation& computation, - const Shape& shape, const string& opaque, - bool has_side_effect) { - return builder->CustomCall( - call_target_name, operands, computation, shape, opaque, - /*operand_shapes_with_layout=*/absl::nullopt, has_side_effect); +XlaOp CustomCallWithComputation( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const XlaComputation& computation, + const Shape& shape, const string& opaque, bool has_side_effect, + absl::Span>> + output_operand_aliasing) { + return builder->CustomCall(call_target_name, operands, computation, shape, + opaque, + /*operand_shapes_with_layout=*/absl::nullopt, + has_side_effect, output_operand_aliasing); } -XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name, - absl::Span operands, const Shape& shape, - absl::Span operand_shapes_with_layout, - const string& opaque, bool has_side_effect) { +XlaOp CustomCallWithLayout( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape, + absl::Span operand_shapes_with_layout, const string& opaque, + bool has_side_effect, + absl::Span>> + output_operand_aliasing) { return builder->CustomCall(call_target_name, operands, shape, opaque, - operand_shapes_with_layout, has_side_effect); + operand_shapes_with_layout, has_side_effect, + output_operand_aliasing); } XlaOp Complex(const XlaOp lhs, const XlaOp rhs, diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 
f841a1a75a0..f736ae1d470 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -47,13 +47,21 @@ namespace xla { class XlaBuilder; class XlaOp; +class HloInstruction; namespace internal { -XlaOp XlaBuilderBuildFusion(XlaBuilder* builder, - absl::Span operands, - absl::string_view fusion_kind, - const XlaComputation& fused_computation); +struct XlaBuilderFriend { + static XlaOp BuildFusion(XlaBuilder* builder, + absl::Span operands, + absl::string_view fusion_kind, + const XlaComputation& fused_computation); + + static XlaOp BuildBitcast(XlaBuilder* builder, XlaOp operand, + const Shape& shape); + + static HloInstructionProto* GetInstruction(XlaOp op); +}; } // namespace internal @@ -107,6 +115,7 @@ class XlaOp { friend class XlaBuilder; friend class MlirHloBuilder; + friend struct internal::XlaBuilderFriend; // < 0 means "invalid handle". int64 handle_; @@ -164,6 +173,15 @@ class XlaBuilder { // OpMetadata attached until a call to ClearOpMetadata. void SetOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); } + // Swaps the passed op metadata with the ones currently set. + // + // Returns the old op metadata. + OpMetadata SwapOpMetadata(OpMetadata metadata) { + OpMetadata old_metadata = std::move(metadata_); + metadata_ = std::move(metadata); + return old_metadata; + } + // Similar to SetOpMetadata, but only set the metadata for the next op. void SetOneShotOpMetadata(OpMetadata metadata) { metadata_ = std::move(metadata); @@ -584,7 +602,9 @@ class XlaBuilder { const string& call_target_name, absl::Span operands, const Shape& shape_with_layout, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect); + bool has_side_effect, + absl::Span>> + output_operand_aliasing); // Internal version of CustomCall without computation that doesn't do op // specific error handling and expects arguments to be legal. 
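
// Illustrative sketch, not part of the patch: the shape of the new
// output_operand_aliasing parameter threaded through the CustomCall overloads. Each
// entry says "this output sub-buffer (a ShapeIndex path) aliases that sub-buffer of
// operand N", which is exactly how the builder loops above copy it into the
// instruction proto. AliasRecord is an invented stand-in for that proto message.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using ShapeIndex = std::vector<int64_t>;  // path into a (possibly nested) tuple shape
using Aliasing =
    std::pair<ShapeIndex, std::pair<int64_t, ShapeIndex>>;  // {output, {operand#, operand index}}

struct AliasRecord {
  ShapeIndex output_shape_index;
  int64_t operand_number;
  ShapeIndex operand_shape_index;
};

std::vector<AliasRecord> Flatten(const std::vector<Aliasing>& output_operand_aliasing) {
  std::vector<AliasRecord> records;
  for (const Aliasing& pair : output_operand_aliasing) {
    records.push_back({pair.first, pair.second.first, pair.second.second});
  }
  return records;
}

int main() {
  // Output tuple element {0} reuses the (whole) buffer of operand 1.
  std::vector<Aliasing> aliasing = {{{0}, {1, {}}}};
  for (const AliasRecord& r : Flatten(aliasing)) {
    std::cout << "output path length " << r.output_shape_index.size()
              << " aliases operand " << r.operand_number << "\n";
  }
}
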
CustomCall @@ -593,14 +613,18 @@ class XlaBuilder { const string& call_target_name, absl::Span operands, const Shape& shape_with_layout, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect); + bool has_side_effect, + absl::Span>> + output_operand_aliasing); XlaOp CustomCall( const string& call_target_name, absl::Span operands, const XlaComputation& computation, const Shape& shape_with_layout, const string& opaque, absl::optional> operand_shapes_with_layout, - bool has_side_effect); + bool has_side_effect, + absl::Span>> + output_operand_aliasing); XlaOp Reduce(XlaOp operand, XlaOp init_value, const XlaComputation& computation, @@ -1049,18 +1073,25 @@ class XlaBuilder { const string& outfeed_config); friend XlaOp Call(XlaBuilder* builder, const XlaComputation& computation, absl::Span operands); - friend XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name, - absl::Span operands, const Shape& shape, - const string& opaque, bool has_side_effect); + friend XlaOp CustomCall( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape, + const string& opaque, bool has_side_effect, + absl::Span>> + output_operand_aliasing); friend XlaOp CustomCallWithComputation( XlaBuilder* builder, const string& call_target_name, absl::Span operands, const XlaComputation& computation, - const Shape& shape, const string& opaque, bool has_side_effect); + const Shape& shape, const string& opaque, bool has_side_effect, + absl::Span>> + output_operand_aliasing); friend XlaOp CustomCallWithLayout( XlaBuilder* builder, const string& call_target_name, absl::Span operands, const Shape& shape_with_layout, absl::Span operand_shapes_with_layout, const string& opaque, - bool has_side_effect); + bool has_side_effect, + absl::Span>> + output_operand_aliasing); friend XlaOp Complex(XlaOp real, XlaOp imag, absl::Span broadcast_dimensions); friend XlaOp Conj(XlaOp operand); @@ -1284,9 +1315,7 @@ class XlaBuilder { return LookUpInstructionByHandleInternal(op.handle()); } - friend XlaOp internal::XlaBuilderBuildFusion( - XlaBuilder* builder, absl::Span operands, - absl::string_view fusion_kind, const XlaComputation& fused_computation); + friend struct internal::XlaBuilderFriend; }; // RAII-style object: sets the current sharding assignment in builder on @@ -1339,6 +1368,25 @@ class XlaScopedFrontendAttributesAssignment { TF_DISALLOW_COPY_AND_ASSIGN(XlaScopedFrontendAttributesAssignment); }; + +// RAII-style object: sets the current op metadata in builder on construction, +// and sets back to the previous assignment on destruction. +class XlaScopedOpMetadataAssignment { + public: + XlaScopedOpMetadataAssignment(xla::XlaBuilder* builder, OpMetadata metadata) + : builder_(builder) { + saved_ = builder_->SwapOpMetadata(metadata); + } + + ~XlaScopedOpMetadataAssignment() { builder_->SwapOpMetadata(saved_); } + + private: + xla::XlaBuilder* const builder_; + OpMetadata saved_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaScopedOpMetadataAssignment); +}; + // Free functions for building XlaOps. The intention is that these will // become the public API for building XlaOps rather than calling methods on // XlaBuilder directly. @@ -1777,30 +1825,39 @@ XlaOp Call(XlaBuilder* builder, const XlaComputation& computation, // backend, a call instruction is emitted which targets a symbol with the name // |call_target_name|. |call_target_name| and |opaque| can arbitrary strings, // but |call_target_name| should be short as it may be used in labels. 
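
// Illustrative sketch, not part of the patch: the swap-on-construction /
// restore-on-destruction pattern behind SwapOpMetadata and
// XlaScopedOpMetadataAssignment above, written against a generic holder so it
// compiles stand-alone. ScopedSwap is an invented name.
#include <cassert>
#include <string>
#include <utility>

template <typename T>
class ScopedSwap {
 public:
  ScopedSwap(T* slot, T value)
      : slot_(slot), saved_(std::exchange(*slot, std::move(value))) {}
  ~ScopedSwap() { *slot_ = std::move(saved_); }

  ScopedSwap(const ScopedSwap&) = delete;
  ScopedSwap& operator=(const ScopedSwap&) = delete;

 private:
  T* slot_;
  T saved_;
};

int main() {
  std::string metadata = "outer";
  {
    ScopedSwap<std::string> scope(&metadata, "inner");
    assert(metadata == "inner");  // ops built in this scope would carry "inner"
  }
  assert(metadata == "outer");    // restored on scope exit, like the builder's metadata_
}
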
|opaque| -// can encode arbitrarily large amounts of information. -XlaOp CustomCall(XlaBuilder* builder, const string& call_target_name, - absl::Span operands, const Shape& shape, - const string& opaque = "", bool has_side_effect = false); +// can encode arbitrarily large amounts of information. |has_side_effect| +// specifies whether the instruction can have side effects. +// |output_operand_aliasing| specifies a list of output/operand buffer pairs +// that alias each other, where the output buffer is represented as a +// ShapeIndex, and the operand buffer is represented as the operand index and +// the ShapeIndex. +XlaOp CustomCall( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape, + const string& opaque = "", bool has_side_effect = false, + absl::Span>> + output_operand_aliasing = {}); // Overload which constructs a custom call that applies an Xla computation. -XlaOp CustomCallWithComputation(XlaBuilder* builder, - const string& call_target_name, - absl::Span operands, - const XlaComputation& computation, - const Shape& shape, const string& opaque = "", - bool has_side_effect = false); +XlaOp CustomCallWithComputation( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const XlaComputation& computation, + const Shape& shape, const string& opaque = "", bool has_side_effect = false, + absl::Span>> + output_operand_aliasing = {}); // Overload which constructs a custom call with fixed layouts. The operands will // have the layouts specified by |operand_shapes_with_layout| when provided to // external code, and the external code is expected to produce a result with the // layout specified by |shape_with_layout|. All shapes in |shape_with_layout| // and |operand_shapes_with_layout| must have layouts. -XlaOp CustomCallWithLayout(XlaBuilder* builder, const string& call_target_name, - absl::Span operands, - const Shape& shape_with_layout, - absl::Span operand_shapes_with_layout, - const string& opaque = "", - bool has_side_effect = false); +XlaOp CustomCallWithLayout( + XlaBuilder* builder, const string& call_target_name, + absl::Span operands, const Shape& shape_with_layout, + absl::Span operand_shapes_with_layout, + const string& opaque = "", bool has_side_effect = false, + absl::Span>> + output_operand_aliasing = {}); // The following methods enqueue element-wise binary arithmetic operations // onto the computation. The shapes of the operands have to match unless one diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 7011c946203..bfd13c8ddf5 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -19,6 +19,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -1203,5 +1205,16 @@ TEST_F(XlaBuilderTest, AddFrontendAttribute) { TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); ExpectInstructionsAttributesMatch(*module, expected); } + +TEST_F(XlaBuilderTest, ComparisonType) { + XlaBuilder b(TestName()); + (void)Le(ConstantR0(&b, 1), ConstantR0(&b, 2)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Compare(op::Constant(), op::Constant())); + EXPECT_EQ(Comparison::Type::kSigned, + DynCast(root)->type()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md index 51d666fba9a..45abd9b4c92 100644 --- a/tensorflow/compiler/xla/g3doc/index.md +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -121,8 +121,8 @@ example. ### AOT (Ahead-of-time) compilation for CPU with `tfcompile` -You can also use a standalone [`tfcompile`](./tfcompile) tool, -which converts TensorFlow graph into executable code (for x86-64 CPU only). +You can also use a standalone [`tfcompile`](./tfcompile.md) tool, which converts +TensorFlow graph into executable code (for x86-64 CPU only). ## Inspect compiled programs @@ -196,7 +196,7 @@ Apart from TensorFlow, XLA programs can be generated by: [XLA source](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla) on Github! 
- diff --git a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb index c0160f2766c..d7799093583 100644 --- a/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb +++ b/tensorflow/compiler/xla/g3doc/tutorials/autoclustering_xla.ipynb @@ -169,7 +169,7 @@ " model.set_weights(initial_weights)\n", "\n", "warmup(model, x_train, y_train, x_test, y_test)\n", - "%time train_model(model, x_train, y_train, x_test, y_test)\n", + "train_model(model, x_train, y_train, x_test, y_test)\n", "\n", "scores = model.evaluate(x_test, y_test, verbose=1)\n", "print('Test loss:', scores[0])\n", diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD index 5b3b75eb352..1ff96db8637 100644 --- a/tensorflow/compiler/xla/pjrt/BUILD +++ b/tensorflow/compiler/xla/pjrt/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") @@ -25,7 +26,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor", + "//tensorflow/core/platform:stream_executor", "@com_google_absl//absl/memory", "@com_google_absl//absl/synchronization", ], @@ -108,7 +109,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor", + "//tensorflow/core/platform:stream_executor", "//tensorflow/stream_executor:event", "@com_google_absl//absl/memory", "@com_google_absl//absl/synchronization", @@ -140,11 +141,12 @@ cc_library( "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:maybe_owning_device_memory", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service/gpu:gpu_executable_run_options", - "//tensorflow/core:allocator", "//tensorflow/core:lib", + "//tensorflow/core/framework:allocator", "//tensorflow/core/profiler/lib:connected_traceme", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/lib:traceme_encode", @@ -166,6 +168,47 @@ cc_library( ], ) +cc_library( + name = "tpu_client", + srcs = ["tpu_client.cc"], + hdrs = ["tpu_client.h"], + visibility = [ + "//learning/brain/research/jax:__subpackages__", + "//learning/deepmind/tensorflow/tensorfn:__subpackages__", + "//learning/pathways:__subpackages__", + "//tensorflow/compiler/xla:friends", + ], + deps = [ + ":local_device_state", + ":pjrt_client", + ":tracked_device_buffer", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/service:computation_placer", + "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/core:lib", + "//tensorflow/core/tpu:tpu_executor_dlsym_initializer", + "//tensorflow/core/tpu:tpu_on_demand_compiler", + "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:stream", + "//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/tpu:tpu_computation_placer", + "//tensorflow/stream_executor/tpu:tpu_executable_interface", + "//tensorflow/stream_executor/tpu:tpu_executor", + 
"//tensorflow/stream_executor/tpu:tpu_executor_interface", + "//tensorflow/stream_executor/tpu:tpu_platform_interface", + "//tensorflow/stream_executor/tpu:tpu_topology_external", + "//tensorflow/stream_executor/tpu:tpu_transfer_manager", + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/flags:flag", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", + ], +) + cc_library( name = "interpreter_device", srcs = ["interpreter_device.cc"], @@ -208,6 +251,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core/common_runtime:bfc_allocator", "//tensorflow/core/common_runtime/gpu:gpu_mem_allocator", + "//tensorflow/core:lib_internal", "//tensorflow/stream_executor:tf_allocator_adapter", ] + if_cuda(["@local_config_nccl//:nccl"]), ) diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.cc b/tensorflow/compiler/xla/pjrt/cpu_device.cc index e2543bda7df..c571ef2a4df 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/cpu_device.cc @@ -28,7 +28,7 @@ CpuDevice::CpuDevice(int id, : PjRtDevice(id, std::move(local_device_state), kCpuPlatformName, /*device_kind=*/kCpuPlatformName) {} -StatusOr> GetCpuClient(bool asynchronous) { +StatusOr> GetCpuClient(bool asynchronous) { TF_ASSIGN_OR_RETURN(se::Platform * platform, PlatformUtil::GetPlatform("Host")); if (platform->VisibleDeviceCount() <= 0) { @@ -56,7 +56,7 @@ StatusOr> GetCpuClient(bool asynchronous) { devices.push_back(std::move(device)); } - return std::make_shared( + return std::make_unique( kCpuPlatformName, client, std::move(devices), /*host_id=*/0, /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr, /*should_stage_host_to_device_transfers=*/false, diff --git a/tensorflow/compiler/xla/pjrt/cpu_device.h b/tensorflow/compiler/xla/pjrt/cpu_device.h index ad0079b1c4a..1036d8fedbb 100644 --- a/tensorflow/compiler/xla/pjrt/cpu_device.h +++ b/tensorflow/compiler/xla/pjrt/cpu_device.h @@ -28,7 +28,7 @@ class CpuDevice : public PjRtDevice { CpuDevice(int id, std::unique_ptr local_device_state); }; -StatusOr> GetCpuClient(bool asynchronous); +StatusOr> GetCpuClient(bool asynchronous); } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/distributed/BUILD b/tensorflow/compiler/xla/pjrt/distributed/BUILD index 175b4268dda..4cd6093dc48 100644 --- a/tensorflow/compiler/xla/pjrt/distributed/BUILD +++ b/tensorflow/compiler/xla/pjrt/distributed/BUILD @@ -1,4 +1,5 @@ -load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") load("//tensorflow:tensorflow.bzl", "tf_grpc_cc_dependency") load("//tensorflow:tensorflow.bzl", "tf_cc_test") @@ -6,7 +7,7 @@ licenses(["notice"]) package(default_visibility = ["//tensorflow:internal"]) -tf_proto_library_cc( +tf_proto_library( name = "protocol_proto", srcs = ["protocol.proto"], has_services = 1, diff --git a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc index 298c41c7f58..c56b41861b0 100644 --- a/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc +++ b/tensorflow/compiler/xla/pjrt/gpu_multistream_test.cc @@ -28,7 +28,7 @@ namespace { // computation wait for the inputs to be produced before executing. 
TEST(GpuMultiStream, Basics) { TF_ASSERT_OK_AND_ASSIGN( - std::shared_ptr client, + std::unique_ptr client, GetNvidiaGpuClient(/*asynchronous=*/true, GpuAllocatorConfig(), /*distributed_client=*/nullptr, /*node_id=*/0)); diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.cc b/tensorflow/compiler/xla/pjrt/interpreter_device.cc index c1149f2dbf9..376d8687892 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.cc +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.cc @@ -28,7 +28,7 @@ InterpreterDevice::InterpreterDevice( : PjRtDevice(id, std::move(local_device_state), kInterpreterPlatformName, /*device_kind=*/kInterpreterPlatformName) {} -StatusOr> GetInterpreterClient() { +StatusOr> GetInterpreterClient() { TF_ASSIGN_OR_RETURN(se::Platform * platform, PlatformUtil::GetPlatform("Interpreter")); if (platform->VisibleDeviceCount() != 1) { @@ -50,7 +50,7 @@ StatusOr> GetInterpreterClient() { absl::make_unique(0, std::move(device_state)); devices.push_back(std::move(device)); - return std::make_shared( + return std::make_unique( kInterpreterPlatformName, client, std::move(devices), /*host_id=*/0, /*allocator=*/nullptr, /*host_memory_allocator=*/nullptr, /*should_stage_host_to_device_transfers=*/false, diff --git a/tensorflow/compiler/xla/pjrt/interpreter_device.h b/tensorflow/compiler/xla/pjrt/interpreter_device.h index cf732f70124..4038d8dbf11 100644 --- a/tensorflow/compiler/xla/pjrt/interpreter_device.h +++ b/tensorflow/compiler/xla/pjrt/interpreter_device.h @@ -29,7 +29,7 @@ class InterpreterDevice : public PjRtDevice { std::unique_ptr local_device_state); }; -StatusOr> GetInterpreterClient(); +StatusOr> GetInterpreterClient(); } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc index 512ff81ef6e..df92921c39d 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/common_runtime/gpu/gpu_host_allocator.h" #include "tensorflow/core/common_runtime/gpu/gpu_mem_allocator.h" +#include "tensorflow/core/util/env_var.h" #include "tensorflow/stream_executor/tf_allocator_adapter.h" namespace xla { @@ -89,12 +90,20 @@ StatusOr> CreateBFCAllocator( CHECK_GT(local_devices.size(), 0); const se::Platform* platform = local_devices.front()->executor()->platform(); std::vector allocators; + bool enable_unified_memory; + Status status = tensorflow::ReadBoolFromEnvVar("TF_FORCE_UNIFIED_MEMORY", + false, &enable_unified_memory); + if (!status.ok()) { + LOG(ERROR) << "Unable to read TF_FORCE_UNIFIED_MEMORY: " + << status.error_message(); + } + for (auto& local_device : local_devices) { se::StreamExecutor* executor = local_device->executor(); int device_ordinal = executor->device_ordinal(); auto sub_allocator = absl::make_unique( executor, tensorflow::PlatformGpuId(device_ordinal), - /*use_unified_memory=*/false, + /*use_unified_memory=*/enable_unified_memory, /*alloc_visitors=*/std::vector(), /*free_visitors=*/std::vector()); @@ -104,7 +113,10 @@ StatusOr> CreateBFCAllocator( return Unavailable("Failed to query available memory from device %i", device_ordinal); } - size_t allocator_memory = free_memory * memory_fraction; + // To allow full GPU memory to be visible to the BFC allocator if using + // unified memory. + size_t allocator_memory = + enable_unified_memory ? 
total_memory : free_memory * memory_fraction; if (preallocate) { LOG(INFO) << "XLA backend allocating " << allocator_memory << " bytes on device " << device_ordinal @@ -289,7 +301,7 @@ GpuDevice::GpuDevice(int id, : PjRtDevice(id, std::move(local_device_state), kGpuPlatformName, std::move(device_kind), node_id) {} -StatusOr> GetNvidiaGpuClient( +StatusOr> GetNvidiaGpuClient( bool asynchronous, const GpuAllocatorConfig& allocator_config, std::shared_ptr distributed_client, int node_id) { TF_ASSIGN_OR_RETURN(LocalClient * xla_client, GetGpuXlaClient()); @@ -312,13 +324,12 @@ StatusOr> GetNvidiaGpuClient( devices = BuildLocalDevices(std::move(local_device_states)); } - std::shared_ptr pyclient = std::make_shared( + return std::unique_ptr(std::make_unique( "gpu", xla_client, std::move(devices), /*node_id=*/node_id, std::move(allocator), std::move(host_memory_allocator), /*should_stage_host_to_device_transfers=*/true, - /*gpu_run_options=*/std::move(gpu_run_options)); - return pyclient; + /*gpu_run_options=*/std::move(gpu_run_options))); } } // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h index 4f22a169bd8..f480a37429a 100644 --- a/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h +++ b/tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h @@ -53,7 +53,7 @@ struct GpuAllocatorConfig { // distributed_client may be nullptr in non-distributed settings. // distributed_client should not be Open()ed before calling this function. -StatusOr> GetNvidiaGpuClient( +StatusOr> GetNvidiaGpuClient( bool asynchronous, const GpuAllocatorConfig& allocator_config, std::shared_ptr distributed_client, int node_id); diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.cc b/tensorflow/compiler/xla/pjrt/pjrt_client.cc index 099c7729679..02ae37b71db 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.cc +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.cc @@ -90,6 +90,7 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/local_device_state.h" #include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" #include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -282,6 +283,11 @@ StatusOr> PjRtClient::GetParametersThatMustBeDonated( return parameters_to_donate; } +std::unique_ptr PjRtClient::GetHloCostAnalysis() { + return absl::make_unique( + client_->backend().compiler()->ShapeSizeBytesFunction()); +} + namespace { // Ensures that it is safe to deallocate any buffers that have been enqueued in @@ -894,6 +900,7 @@ void PjRtBuffer::WaitForOutstandingDonationHold() { StatusOr> PjRtBuffer::Release( bool wait_for_operations_to_complete) { + tensorflow::profiler::TraceMe trace_me("PjRtBuffer::Release"); std::shared_ptr device_buffer; TrackedDeviceBuffer::StreamAndEventContainer events; { @@ -1257,6 +1264,14 @@ StatusOr> PjRtBuffer::CopyToDevice( "CopyToDevice cannot accept the same source and destination devices"); } + // Copying across PjRtClients involves a copy through the host. 
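
// Illustrative sketch, not part of the patch: the host-staged copy that the new
// cross-client branch of PjRtBuffer::CopyToDevice performs (ToLiteral on the source
// client, then FromHostBuffer on the destination client). FakeDevice and the byte
// buffers are invented; the point is only that two runtimes with no shared transfer
// path still interoperate via a device-to-host copy followed by host-to-device.
#include <cassert>
#include <vector>

struct FakeDevice {
  std::vector<char> memory;  // pretend device memory

  void ToHost(std::vector<char>* host) const { *host = memory; }
  void FromHost(const std::vector<char>& host) { memory = host; }
};

// Direct d2d is only possible inside one runtime; across runtimes we go d2h + h2d.
void CopyAcrossClients(const FakeDevice& src, FakeDevice* dst) {
  std::vector<char> host_staging;
  src.ToHost(&host_staging);    // analogous to PjRtBuffer::ToLiteral()
  dst->FromHost(host_staging);  // analogous to FromHostBuffer(...) on the other client
}

int main() {
  FakeDevice gpu{{'x', 'l', 'a'}};
  FakeDevice tpu;
  CopyAcrossClients(gpu, &tpu);
  assert(tpu.memory == gpu.memory);
}
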
+ if (dst_device->client() != client_) { + TF_ASSIGN_OR_RETURN(std::shared_ptr literal, ToLiteral()); + return FromHostBuffer(literal->untyped_data(), literal->shape(), + HostBufferSemantics::kZeroCopy, nullptr, + dst_device->client(), dst_device); + } + TF_ASSIGN_OR_RETURN(LocalDeviceState * dst_local_device, dst_device->GetLocalDeviceState()); LocalDeviceState* transfer_local_device = diff --git a/tensorflow/compiler/xla/pjrt/pjrt_client.h b/tensorflow/compiler/xla/pjrt/pjrt_client.h index 1bed959e3e6..cb4ef9da85b 100644 --- a/tensorflow/compiler/xla/pjrt/pjrt_client.h +++ b/tensorflow/compiler/xla/pjrt/pjrt_client.h @@ -195,6 +195,9 @@ class PjRtClient { return absl::optional(); } + // Returns a backend-specific HLO cost analysis visitor. + virtual std::unique_ptr GetHloCostAnalysis(); + protected: friend class PjRtBuffer; virtual void EnqueueCrossHostReceive( @@ -560,8 +563,10 @@ class PjRtBuffer { return GetBufferWithHold(ScopedHold::kExternalReference); } - // Copies the buffer to device `dst_device`. Returns an error if the buffer is - // already on dst_device. + // Copies the buffer to device `dst_device`, performing a d2d transfer when + // `dst_device` is sharing the same Client, and performing a d2h and h2d copy + // if `dst_device` lives on a different Client. + // Returns an error if the buffer is already on dst_device. StatusOr> CopyToDevice(PjRtDevice* dst_device); // Copies the buffer to the remote device encoded in serialized_descriptor. @@ -695,7 +700,7 @@ struct ExecuteOptions { int32 launch_id = 0; // If non-null, an opaque context passed to an execution that may be used to // supply additional arguments to a derived class of PjRtExecutable. - ExecuteContext* context = nullptr; + const ExecuteContext* context = nullptr; }; // Represents a compiled computation that can be executed given handles to diff --git a/tensorflow/compiler/xla/pjrt/tpu_client.cc b/tensorflow/compiler/xla/pjrt/tpu_client.cc new file mode 100644 index 00000000000..b2af6e79980 --- /dev/null +++ b/tensorflow/compiler/xla/pjrt/tpu_client.cc @@ -0,0 +1,247 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/pjrt/tpu_client.h" + +#include +#include + +#include "absl/container/inlined_vector.h" +#include "absl/memory/memory.h" +#include "absl/status/status.h" +#include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/pjrt/local_device_state.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/pjrt/tracked_device_buffer.h" +#include "tensorflow/compiler/xla/service/shaped_buffer.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/lib/statusor.h" +#include "tensorflow/stream_executor/stream.h" +#include "tensorflow/stream_executor/tpu/tpu_computation_placer.h" +#include "tensorflow/stream_executor/tpu/tpu_executable_interface.h" +#include "tensorflow/stream_executor/tpu/tpu_executor_interface.h" +#include "tensorflow/stream_executor/tpu/tpu_platform_interface.h" +#include "tensorflow/stream_executor/tpu/tpu_stream.h" + +namespace tf_tpu = tensorflow::tpu; + +namespace xla { +namespace { + +class TpuDeviceState : public LocalDeviceState { + public: + TpuDeviceState(se::StreamExecutor* executor, LocalClient* client, + bool asynchronous); + + Status ThenMemcpyDeviceToDevice(se::Stream* transfer_stream, + se::Stream* dst_stream, + se::DeviceMemoryBase src_buffer, + se::DeviceMemoryBase dst_buffer) override; +}; + +TpuDeviceState::TpuDeviceState(se::StreamExecutor* executor, + LocalClient* client, bool asynchronous) + : LocalDeviceState(executor, client, LocalDeviceState::kAsynchronous, + asynchronous, + /*allow_event_reuse=*/false) {} + +Status TpuDeviceState::ThenMemcpyDeviceToDevice( + se::Stream* transfer_stream, se::Stream* dst_stream, + se::DeviceMemoryBase src_buffer, se::DeviceMemoryBase dst_buffer) { + auto* transfer_tpu_stream = tensorflow::down_cast( + transfer_stream->implementation()); + tf_tpu::TpuTopologyExternal topology = + tf_tpu::TpuPlatformInterface::GetRegisteredPlatform()->topology(); + // TODO(b/157179600): use device-to-device transfers when implemented instead + // of copying via host. 
+ if (topology.version() == kTpuV4) { + LOG(WARNING) + << "device-to-device transfers not yet implemented, copying via host"; + auto* dst_tpu_stream = + tensorflow::down_cast(dst_stream->implementation()); + TF_RET_CHECK(src_buffer.size() == dst_buffer.size()); + auto host_tmp = std::make_unique(src_buffer.size()); + TF_RETURN_IF_ERROR(transfer_tpu_stream->EnqueueTransferDeviceToHost( + src_buffer, host_tmp.get(), src_buffer.size())); + dst_stream->ThenWaitFor(transfer_stream); + TF_RETURN_IF_ERROR(dst_tpu_stream->EnqueueTransferHostToDevice( + dst_buffer, host_tmp.get(), dst_buffer.size())); + transfer_stream->ThenWaitFor(dst_stream); + char* tmp = host_tmp.release(); + dst_stream->ThenDoHostCallback([tmp] { delete[] tmp; }); + } else { + TF_RETURN_IF_ERROR(transfer_tpu_stream->EnqueueOnTpuDeviceSendRecvLocal( + src_buffer, dst_buffer)); + } + return Status::OK(); +} + +class PjRtTpuClient : public PjRtClient { + public: + PjRtTpuClient(LocalClient* client, + std::vector> devices, int host_id, + tf_tpu::TpuPlatformInterface* tpu_platform); + + StatusOr GetDefaultDeviceAssignment( + int num_replicas, int num_partitions) const override; + + bool EnqueueD2DTransfersOnSrcStream() const override { + return tpu_platform_->topology().version() == kTpuV4; + } + + StatusOr> ExecutableFingerprint( + const PjRtExecutable& executable) const override; + + private: + tf_tpu::TpuPlatformInterface* tpu_platform_; +}; + +PjRtTpuClient::PjRtTpuClient(LocalClient* client, + std::vector> devices, + int host_id, + tf_tpu::TpuPlatformInterface* tpu_platform) + : PjRtClient("tpu", client, std::move(devices), host_id, + /*allocator=*/nullptr, + /*host_memory_allocator=*/nullptr, + /*should_stage_host_to_device_transfers=*/false, + /*gpu_run_options=*/nullptr), + tpu_platform_(tpu_platform) {} + +StatusOr PjRtTpuClient::GetDefaultDeviceAssignment( + int num_replicas, int num_partitions) const { + tf_tpu::TpuPlatformInterface* platform = + tf_tpu::TpuPlatformInterface::GetRegisteredPlatform(); + tf_tpu::TpuHostLocationExternal host = platform->GetTpuHostLocation(); + int num_local_devices = host.Cores(kTensorCore).size(); + if (num_replicas * num_partitions <= num_local_devices) { + return tf_tpu::TpuComputationPlacer::AssignLocalDevices(host, num_replicas, + num_partitions); + } + // Fallback to default global device assignment if we can't run locally. 
+ return PjRtClient::GetDefaultDeviceAssignment(num_replicas, num_partitions); +} + +StatusOr> PjRtTpuClient::ExecutableFingerprint( + const PjRtExecutable& executable) const { + if (executable.client() != this) { + return InvalidArgument( + "Passed executable from different client (platform '%s') to " + "PjRtTpuClient::ExecutableFingerprint", + executable.client()->platform_name()); + } + if (executable.executables().size() > 1) { + LOG(INFO) << "ExecutableFingerprint not fully implemented for MPMD " + "executables, fingerprint may not be unique."; + } + xla::TpuExecutableInterface* tpu_executable = + tensorflow::down_cast( + executable.executables()[0]->executable()); + return absl::optional(tpu_executable->fingerprint()); +} + +StatusOr>> GetTpuDevices( + LocalClient* client, + std::vector> local_device_states) { + std::vector> devices; + tf_tpu::TpuTopologyExternal topology = + tf_tpu::TpuPlatformInterface::GetRegisteredPlatform()->topology(); + + std::map core_id_to_device_ordinal; + for (int i = 0; i < client->device_count(); ++i) { + se::StreamExecutor* executor = + client->backend().stream_executor(i).ValueOrDie(); + tf_tpu::TpuExecutorInterface* tpu_executor = + tensorflow::down_cast( + executor->implementation()); + core_id_to_device_ordinal[tpu_executor->GetCoreLocationExternal().Id()] = i; + } + + for (const tf_tpu::TpuCoreLocationExternal& core : + topology.cores(TpuCoreTypeEnum::kTensorCore)) { + auto it = core_id_to_device_ordinal.find(core.Id()); + int device_ordinal = + (it != core_id_to_device_ordinal.end()) ? it->second : -1; + int host_id = topology.IdForHost(core.host_coordinates()); + const tf_tpu::TpuDimensionsExternal coords = core.chip_coordinates(); + std::array coords_array = {coords.x, coords.y, coords.z}; + std::unique_ptr local_device_state; + if (device_ordinal >= 0) { + local_device_state = std::move(local_device_states[device_ordinal]); + } + auto device = absl::make_unique( + core, std::move(local_device_state), host_id, coords_array, + std::string(tf_tpu::TpuVersionEnumToString(topology.version()))); + devices.push_back(std::move(device)); + } + return devices; +} + +} // namespace + +StatusOr> GetTpuClient( + bool asynchronous, absl::Duration init_retry_timeout) { + tf_tpu::TpuPlatformInterface* platform = + tf_tpu::TpuPlatformInterface::GetRegisteredPlatform( + /*initialize_platform=*/true, /*num_tries=*/1); + if (platform == nullptr) { + return InvalidArgument("TpuPlatform is not available."); + } + // NOTE: We retry in a loop since some pod failures are transient (e.g. some + // RPCs may timeout waiting for other hosts to come up, but will succeed + // at a later point if retried). + auto start = absl::Now(); + // TODO(b/165870356): TpuPlatform::Initialized() always returns true! 
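
// Illustrative sketch, not part of the patch: the retry-until-deadline shape of the
// platform initialization loop below, with a std::function standing in for
// TpuPlatformInterface::Initialize. The absl time types match the real code; the
// SleepFor between attempts is my own addition (the real loop retries immediately),
// added only to keep the sketch from spinning.
#include <functional>
#include <iostream>

#include "absl/time/clock.h"
#include "absl/time/time.h"

bool InitializeWithRetry(const std::function<bool()>& try_init,
                         absl::Duration retry_timeout) {
  const absl::Time start = absl::Now();
  while (!try_init()) {
    if (absl::Now() - start >= retry_timeout) {
      return false;  // give up and surface the last error, as the patch does
    }
    absl::SleepFor(absl::Milliseconds(100));  // transient pod bring-up: wait and retry
  }
  return true;
}

int main() {
  int attempts = 0;
  // Succeeds on the third attempt, well inside the timeout.
  bool ok = InitializeWithRetry([&] { return ++attempts >= 3; }, absl::Seconds(30));
  std::cout << ok << " after " << attempts << " attempts\n";
}
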
+ auto status = platform->Initialize({}); + while (!platform->Initialized()) { + status = platform->Initialize({}); + if (!status.ok()) { + LOG(ERROR) << "Platform initialization failed: " << status; + if ((absl::Now() - start) >= init_retry_timeout) { + return status; + } + } + } + if (platform->VisibleDeviceCount() <= 0) { + return InvalidArgument("No TPU devices found."); + } + LocalClientOptions options; + options.set_platform(platform); + TF_ASSIGN_OR_RETURN(LocalClient * client, + ClientLibrary::GetOrCreateLocalClient(options)); + + std::vector> local_device_states; + local_device_states.reserve(client->device_count()); + for (int i = 0; i < client->device_count(); ++i) { + se::StreamExecutor* executor = + client->backend().stream_executor(i).ValueOrDie(); + local_device_states.push_back( + absl::make_unique(executor, client, asynchronous)); + } + + TF_ASSIGN_OR_RETURN(auto devices, + GetTpuDevices(client, std::move(local_device_states))); + int host_id = platform->GetTpuHostLocation().Id(); + + return std::shared_ptr(absl::make_unique( + client, std::move(devices), host_id, platform)); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/pjrt/tpu_client.h b/tensorflow/compiler/xla/pjrt/tpu_client.h new file mode 100644 index 00000000000..1a458c1480b --- /dev/null +++ b/tensorflow/compiler/xla/pjrt/tpu_client.h @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PJRT_TPU_CLIENT_H_ +#define TENSORFLOW_COMPILER_XLA_PJRT_TPU_CLIENT_H_ + +#include +#include +#include + +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/stream_executor/tpu/tpu_topology.h" + +namespace xla { + +class PjRtTpuDevice : public PjRtDevice { + public: + PjRtTpuDevice(const tensorflow::tpu::TpuCoreLocationExternal core, + std::unique_ptr local_device_state, + int host_id, const std::array& coords, + std::string device_kind) + : PjRtDevice(core.Id(), std::move(local_device_state), + /*platform_name=*/"tpu", std::move(device_kind), host_id), + core_(core), + coords_(coords) {} + + const std::array& coords() const { return coords_; } + int core_on_chip() const { return core_.index(); } + const tensorflow::tpu::TpuCoreLocationExternal core() const { return core_; } + + std::string DebugString() const override { + return absl::StrFormat("TPU_%i(host=%i,(%i,%i,%i,%i))", id(), host_id(), + coords_[0], coords_[1], coords_[2], core_.index()); + } + + private: + const tensorflow::tpu::TpuCoreLocationExternal core_; + const std::array coords_; +}; + +StatusOr> GetTpuClient( + bool asynchronous, + absl::Duration init_retry_timeout = absl::ZeroDuration()); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PJRT_TPU_CLIENT_H_ diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 046fadb405b..2db43727fbd 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow/core/platform:build_config.bzl", "pyx_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_py_test_deps") load("//tensorflow:tensorflow.bzl", "tf_cc_test") @@ -6,7 +7,10 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "pybind_extension") package( - default_visibility = ["//tensorflow:internal"], + default_visibility = [ + "//learning/pathways/data_parallel/jax:__subpackages__", + "//tensorflow:internal", + ], licenses = ["notice"], # Apache 2.0 ) @@ -24,6 +28,18 @@ pyx_library( srcs = ["custom_call_for_test.pyx"], ) +py_test( + name = "xla_client_backend_independent_test", + srcs = ["xla_client_backend_independent_test.py"], + python_version = "PY3", + tags = ["no_oss"], # TODO(phawkins): This test passes, but requires --config=monolithic. 
+ deps = [ + ":xla_client", + ":xla_extension", + "@absl_py//absl/testing:absltest", + ] + xla_py_test_deps(), +) + py_library( name = "xla_client_test", testonly = 1, @@ -264,6 +280,7 @@ cc_library( "//tensorflow/core/platform:status", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:optional", "@pybind11", ], @@ -284,6 +301,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/client/lib:lu_decomposition", "//tensorflow/compiler/xla/client/lib:math", "//tensorflow/compiler/xla/client/lib:qr", "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", @@ -428,6 +446,7 @@ pybind_extension( "//tensorflow/compiler/xla/pjrt:interpreter_device", "//tensorflow/compiler/xla/pjrt:nvidia_gpu_device", "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/pjrt:tpu_client", "//tensorflow/compiler/xla/pjrt:tracked_device_buffer", "//tensorflow/compiler/xla/pjrt/distributed", "//tensorflow/compiler/xla/pjrt/distributed:client", diff --git a/tensorflow/compiler/xla/python/dlpack.cc b/tensorflow/compiler/xla/python/dlpack.cc index 974816407ee..67afa25d23e 100644 --- a/tensorflow/compiler/xla/python/dlpack.cc +++ b/tensorflow/compiler/xla/python/dlpack.cc @@ -321,7 +321,8 @@ StatusOr> DLPackManagedTensorToBuffer( DLDataTypeToPrimitiveType(dlmt->dl_tensor.dtype)); std::vector minor_to_major; - if (dlmt->dl_tensor.strides && !absl::c_find(dimensions, 0)) { + if (dlmt->dl_tensor.strides && + absl::c_find(dimensions, 0) == dimensions.end()) { absl::Span strides( reinterpret_cast(dlmt->dl_tensor.strides), dlmt->dl_tensor.ndim); diff --git a/tensorflow/compiler/xla/python/jax_jit.cc b/tensorflow/compiler/xla/python/jax_jit.cc index 6594125d493..944b4c20a8a 100644 --- a/tensorflow/compiler/xla/python/jax_jit.cc +++ b/tensorflow/compiler/xla/python/jax_jit.cc @@ -28,11 +28,12 @@ limitations under the License. #include #include -#include #include +#include #include "absl/container/flat_hash_map.h" #include "absl/container/inlined_vector.h" +#include "absl/synchronization/notification.h" #include "absl/types/optional.h" #include "pybind11/cast.h" #include "pybind11/numpy.h" @@ -90,9 +91,7 @@ struct ArgSignature { template H AbslHashValue(H h, const ArgSignature& s) { h = H::combine(std::move(h), s.dtype); - if (!s.shape.empty()) { - h = H::combine_contiguous(std::move(h), &s.shape.front(), s.shape.size()); - } + h = H::combine_contiguous(std::move(h), s.shape.data(), s.shape.size()); return h; } @@ -123,17 +122,25 @@ struct CallSignature { std::vector static_args; // A PyTreeDef for each positional dynamic (i.e. not static) argument. std::vector dynamic_positional_args_treedef; - // Keyword arguments. Sorted by the interned keyword pointers. + // Keyword arguments. Sorted by the keyword name. std::vector keyword_args; // Shape and dtype for both the dynamic positional arguments and the keyword - // arguments (sorted by interned keyword pointers). + // arguments (sorted by keyword name). 
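
// Illustrative sketch, not part of the patch: the bug fixed in dlpack.cc above.
// absl::c_find returns an iterator; when that iterator is a raw pointer (as for a
// span or C array), negating it only tests pointer-null-ness, so the old
// `!absl::c_find(dimensions, 0)` never meant "0 is absent". Comparing against
// end(), as the new code does, is the correct test. Shown here with std::find.
#include <algorithm>
#include <iostream>
#include <iterator>

int main() {
  const long long dimensions[] = {2, 3, 4};  // no zero-sized dimension
  const long long* found =
      std::find(std::begin(dimensions), std::end(dimensions), 0LL);

  // Buggy form: a valid (non-null) pointer is always "truthy", so this prints 0
  // even though no zero dimension exists.
  std::cout << "!find(...):   " << (!found) << "\n";

  // Fixed form: "no zero dimension" is found == end; this prints 1.
  std::cout << "find == end:  " << (found == std::end(dimensions)) << "\n";
}
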
std::vector dynamic_args_signatures; + PjRtDevice* device; bool operator==(const CallSignature& other) const { - return std::tie(dynamic_positional_args_treedef, static_args, keyword_args, - dynamic_args_signatures) == - std::tie(other.dynamic_positional_args_treedef, other.static_args, - other.keyword_args, other.dynamic_args_signatures); + return std::tie(dynamic_positional_args_treedef, keyword_args, + dynamic_args_signatures, device) == + std::tie(other.dynamic_positional_args_treedef, + other.keyword_args, other.dynamic_args_signatures, + other.device) && + // `==` on py:objects is the Python `is`. We need equal. + std::equal(static_args.begin(), static_args.end(), + other.static_args.begin(), other.static_args.end(), + [](const py::object& a, const py::object& b) { + return a.equal(b); + }); } bool operator!=(const CallSignature& other) const { return !(*this == other); @@ -175,12 +182,13 @@ H AbslHashValue(H h, const CallSignature& s) { // TODO(jblespiau): We should either ban non-hashable objects from jit or we // should hash them by object identity. h = H::combine_contiguous(std::move(h), - &s.dynamic_positional_args_treedef.front(), + s.dynamic_positional_args_treedef.data(), s.dynamic_positional_args_treedef.size()); - h = H::combine_contiguous(std::move(h), &s.keyword_args.front(), + h = H::combine_contiguous(std::move(h), s.keyword_args.data(), s.keyword_args.size()); - h = H::combine_contiguous(std::move(h), &s.dynamic_args_signatures.front(), + h = H::combine_contiguous(std::move(h), s.dynamic_args_signatures.data(), s.dynamic_args_signatures.size()); + h = H::combine(std::move(h), s.device); return h; } @@ -188,7 +196,7 @@ std::string CallSignature::DebugString() const { std::vector static_args_str; static_args_str.reserve(static_args.size()); for (auto& static_arg : static_args) { - static_args_str.emplace_back(py::cast(static_arg.str())); + static_args_str.emplace_back(py::cast(py::str(static_arg))); } std::vector signature_str; @@ -222,27 +230,35 @@ std::string CallSignature::DebugString() const { struct CacheEntry { std::shared_ptr executable; - xla::PjRtDevice* device; PyTreeDef out_pytree_def; - // These are the objects required to create a `DeviceArray` object. - // We use Python types within the vector because this is what we will be - // returning to Python. No need to convert back and forth. - // We need py::object to maintain the objects alive. - std::vector out_avals; - std::vector out_lazy_exprs; + // Callables (one for each output) to call on each output to get the Python + // object (usually a DeviceArray) that we should return. + // TODO(jblespiau): The goal of the C++ codepath being to be fast, thus, we + // should not call into Python. It will be trivial to fix this when + // omnistaging is the only option & when DeviceArray and PyBuffer are merged). + std::vector handlers; + + // Ensures a single thread performs the compilation for a given executable. + // + // The first thread (holding the GIL) will create the CacheEntry associated to + // a signature and if the object has been insterted already, other threads + // will wait for the notification. + absl::Notification compilation_complete; + absl::optional compilation_error = absl::nullopt; + // Trivial computation will fallback to Python. + // Running a jax(pmap) will also fallback to Python. 
+ bool fall_back_to_python = false; }; // A `CompiledFunction` is associated to a `jax.jit(f)` and takes care of the // bookkeeping of the different signatures used and the dispatch of calls to -// the correct underlying `PyExecutable`. -// TODO(jblespiau): This class is thread-unsafe. Note that using a mutex for the -// full `Call` will lead to a deadlock because it goes back to Python which will -// release the GIL. +// the correct underlying `PyExecutable`. This class is thread-safe. class CompiledFunction { public: - CompiledFunction(py::function fun, py::function cache_miss_fun, - py::function python_f_jitted, bool jax_enable_x64, - bool jax_disable_jit, std::vector static_argnums); + CompiledFunction(py::function fun, py::function cache_miss, + py::function get_device, py::function get_jax_enable_x64, + py::function get_jax_disable_jit, + std::vector static_argnums); ~CompiledFunction(); // This function will: @@ -259,28 +275,22 @@ class CompiledFunction { return inspect->attr("signature")(fun_); } + int cache_size() const { return executables_.size(); } + private: - CacheEntry& GetCacheEntry(const py::args& args, const py::kwargs& kwargs, + // Returns nullptr if not present in the cache. + CacheEntry* GetCacheEntryIfPresent(const CallSignature& signature); + // Should never return nullptr. + CacheEntry* AddCacheEntry(const py::args& args, const py::kwargs& kwargs, const CallSignature& signature, - absl::optional cache_miss_return); - CacheEntry& SetAndReturnCacheEntry( - const py::args& args, const py::kwargs& kwargs, - const CallSignature& signature, - absl::optional cache_miss_return = absl::nullopt); - bool JitIsDisabled() { return GetDisableJit() || jax_disable_jit_; } + py::object out_and_fastpath_data); + bool JitIsDisabled() { return GetDisableJit() || jax_disable_jit_.value(); } + + bool always_fallback_to_python_ = false; const py::function fun_; // The Python function to jit. - // The Python function in charge of returning a `xla::PyExecutable` from - // the arguments passed to `jitted_f`. - const py::function cache_miss_fun_; - // A function to call as fallback. This is the result of calling the Python - // `jax.jit`. - // TODO(jblespiau): Delete this when the C++ codepath supports all features. - const py::function python_f_jitted_; - - // The value of the Python flag when the object was created. - const bool jax_enable_x64_; - const bool jax_disable_jit_; + // See JAX _cpp_jit in api.py for documentation. + const py::function cache_miss_; // We need to know the static arguments to remove them from the arguments // passed to the underlying PyExecutable. In sorted order. @@ -292,21 +302,43 @@ class CompiledFunction { // `CompiledFunction` is being instantiated from Python, the clients are not // yet available (done after GoogleInit). They will be during the first call // to `Call`. - std::shared_ptr pyclient_ = nullptr; + // A function taking no arguments and returning the default device and whether + // jax.jit has been committed to it. + const py::function get_jax_enable_x64_; + const py::function get_jax_disable_jit_; + const py::function get_device_; + + // The writing of the following is protected by the mutex. + absl::Mutex mu_; + // The value of the Python flag. The value will be computed only during the + // first object call, because GoogleInit must have been executed. 
+ absl::optional jax_enable_x64_ = absl::nullopt; + absl::optional jax_disable_jit_ = absl::nullopt; + + // The logic is the following: + // - if `device` or `backend` are not specified to `jax.jit`, we will use + // the input sticky buffer device, or `default_device_` if there is no + // such sticky buffer. + // - When one of `device` or `backend` is specified, this will determine + // the `default_device_` which will be used as the targeted device. In + // that case, we will always copy input buffers to this device. + std::shared_ptr default_pyclient_ = nullptr; + xla::ClientAndPtr default_pydevice_; xla::PjRtDevice* default_device_ = nullptr; + bool is_committed_; }; -CompiledFunction::CompiledFunction(py::function fun, - py::function cache_miss_fun, - py::function python_f_jitted, - bool jax_enable_x64, bool jax_disable_jit, +CompiledFunction::CompiledFunction(py::function fun, py::function cache_miss, + py::function get_device, + py::function get_jax_enable_x64, + py::function get_jax_disable_jit, std::vector static_argnums) : fun_(std::move(fun)), - cache_miss_fun_(std::move(cache_miss_fun)), - python_f_jitted_(std::move(python_f_jitted)), - jax_enable_x64_(jax_enable_x64), - jax_disable_jit_(jax_disable_jit), - static_argnums_(std::move(static_argnums)) { + cache_miss_(std::move(cache_miss)), + static_argnums_(std::move(static_argnums)), + get_jax_enable_x64_(get_jax_enable_x64), + get_jax_disable_jit_(get_jax_disable_jit), + get_device_(std::move(get_device)) { std::sort(static_argnums_.begin(), static_argnums_.end()); } @@ -318,6 +350,34 @@ CompiledFunction::~CompiledFunction() { namespace { +// The equivalent of the Python jax/lazy.py::is_trivial: +// return (type(lexpr.input) is ArrayVar and +// lexpr.dims == tuple(range(len(lexpr.shape)))) +// +// Expects *only* instances of `DeviceArray`. +bool HasTrivialLazyExpr(py::handle device_array) { + static const auto* lazy_module = + new py::module(py::module::import("jax.lazy")); + + auto lexpr = py::getattr(device_array, "_lazy_expr"); + auto input = py::getattr(lexpr, "input"); + if (!input.get_type().is(lazy_module->attr("ArrayVar"))) { + return false; + } + py::tuple dims = py::cast(lexpr.attr("dims")); + py::tuple shape = py::cast(lexpr.attr("shape")); + + for (int i = 0; i < shape.size(); ++i) { + if (dims[i].is_none()) { + return false; + } + if (py::cast(dims[i]) != i) { + return false; + } + } + return true; +} + // The resulting information of the parsing and conversion of the arguments. struct ParsedArgumentsAsBuffers { // The call signature will be filled during 2 steps: @@ -370,8 +430,10 @@ void FlattenArguments(const py::args& args, const py::kwargs& py_kwargs, // Keyword arguments. std::vector> kwargs(py_kwargs.begin(), py_kwargs.end()); - // We first intern the keys, then sort them (by pointer) and then create - // the signatures. + // We first intern the keys, then sort them (by name, as in the Python path) + // (see also PyTreeDef::Flatten) and then create the signatures. + // TODO(jblespiau): We should be able to sort the keys by interned-key + // pointers, but this requires the Python compilation to do the same. arguments.signature.keyword_args.resize(kwargs.size()); for (size_t i = 0; i < kwargs.size(); ++i) { // Intern the key if not already interned. 
@@ -388,7 +450,7 @@ void FlattenArguments(const py::args& args, const py::kwargs& py_kwargs, std::sort(kwargs.begin(), kwargs.end(), [](const std::pair& a, const std::pair& b) { - return a.first.ptr() < b.first.ptr(); + return a.first < b.first; }); for (size_t i = 0; i < kwargs.size(); ++i) { arguments.signature.keyword_args[i].key = kwargs[i].first; @@ -457,7 +519,7 @@ StatusOr> ScalarToBuffer( "%s", absl::StrCat( "Not supported: The C++ jax jit execution path, only accepts " "DeviceArray, Numpy arrays, or Python scalars. Got type ", - py::cast(scalar.get_type().str()))); + py::cast(py::str(scalar.get_type())))); } const py::dtype* DtypeTo32BitDtype(const py::dtype& dtype) { @@ -470,28 +532,37 @@ const py::dtype* DtypeTo32BitDtype(const py::dtype& dtype) { static const auto* complex64_dt = new py::dtype("complex64"); static const auto* complex128_dt = new py::dtype("complex128"); - if (dtype == *int64_dt) { + if (dtype.equal(*int64_dt)) { return int32_dt; } - if (dtype == *float64_dt) { + if (dtype.equal(*float64_dt)) { return float32_dt; } - if (dtype == *uint64_dt) { + if (dtype.equal(*uint64_dt)) { return uint32_dt; } - if (dtype == *complex128_dt) { + if (dtype.equal(*complex128_dt)) { return complex64_dt; } return nullptr; } +bool IsFloat0(py::array arg) { + static const auto* dtypes_module = + new py::module(py::module::import("jax.dtypes")); + static const auto* float0_dtype = + new py::handle(dtypes_module->attr("float0")); + return float0_dtype->is(arg.attr("dtype")); +} + // Converts flattened arguments contained in ParsedArgumentsAsBuffers in // place. If arguments are `DeviceArray`, they must all be on the same `Device`. // -// Returns `OkStatus()` on success. +// Returns `OkStatus()` on success. Returning an error should lead to calling +// the Python fallback. Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, - xla::PjRtDevice* default_device, + xla::PjRtDevice* default_device, bool is_committed, ParsedArgumentsAsBuffers& arguments) { std::vector& arg_buffers = arguments.arg_buffers; auto& keep_alive = arguments.keep_alive; @@ -505,44 +576,49 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, const auto& device_array = xla_module->attr("DeviceArray"); static const auto* numpy_module = new py::module(py::module::import("numpy")); - const auto& array = numpy_module->attr("array"); + const auto& np_array = numpy_module->attr("array"); - // TODO(phawkins): consider device stickiness. - // We first check whether any `DeviceArray` is present and whether they are - // attached to any specific device. See also + // When the jitted function is not committed, we first check whether any + // sticky `DeviceArray` is present and on which device they live. See also: // https://github.com/google/jax/pull/1884 // https://github.com/google/jax/pull/1916 for the rationale why the // computation follows the data locality. // It's also similar to PyTorch's behavior. xla::PjRtDevice* data_device = nullptr; - for (py::handle arg : arguments.flat_dynamic_args) { - if (py::isinstance(arg, device_array)) { - xla::PyBuffer* buffer; - try { - // This can fail, e.g. when device_buffer is a `DeviceConstant`. - buffer = py::cast(arg.attr("device_buffer")); - } catch (const py::cast_error& e) { - return InvalidArgument( - "%s", - absl::StrCat("[jaxjit] Unsupported subclass of `DeviceArray`: " - "`device_buffer` field is of type ", - py::cast( - arg.attr("device_buffer").get_type().str()), - " while a `PyBuffer` was expected." 
+ if (is_committed) { + data_device = default_device; + } else { + for (py::handle arg : arguments.flat_dynamic_args) { + // We specically only deal with DeviceArray (not ShardedDeviceArray). + // (Can happen in jit(pmap), e.g. "test_jit_nested_donate_ignored"). + if (arg.get_type().is(device_array)) { + xla::PyBuffer* buffer; + if (arg.attr("_device").is_none()) { // Skip non-sticky devices. + continue; + } + try { + // This can fail, e.g. when device_buffer is a `DeviceConstant`. + buffer = py::cast(arg.attr("device_buffer")); + } catch (const py::cast_error& e) { + return InvalidArgument( + "%s", + absl::StrCat("[jaxjit] Unsupported subclass of `DeviceArray`: " + "`device_buffer` field is of type ", + py::cast( + arg.attr("device_buffer").get_type().str()), + " while a `PyBuffer` was expected." - )); - } - xla::PjRtDevice* device = buffer->buffer()->device(); - if (data_device && (device != data_device)) { - return InvalidArgument( - "%s", - absl::StrCat( - "Arguments to a jit-compiled function must be colocated on the " - "same device. Arguments were found to be on the two following " - "different devices: ", - device->DebugString(), " and ", data_device->DebugString())); - } else { - data_device = device; + )); + } + xla::PjRtDevice* device = buffer->buffer()->device(); + if (data_device && (device != data_device)) { + throw std::invalid_argument(absl::StrCat( + "primitive arguments must be colocated on the same device (" + "C++ jax.jit). Arguments are on devices: ", + device->DebugString(), " and ", data_device->DebugString())); + } else { + data_device = device; + } } } } @@ -550,16 +626,31 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, // No `DeviceArray` were found default to `default_device`. data_device = default_device; } + CHECK(data_device); + arguments.signature.device = data_device; xla::PjRtClient* pjrt_client = data_device->client(); for (py::handle arg : arguments.flat_dynamic_args) { - // We do not support here d2d transparent transfers. - // We assumes all the `DeviceArray` are already on the correct and shared - // device. - if (py::isinstance(arg, device_array)) { - xla::PyBuffer* buffer = - py::cast(arg.attr("device_buffer")); - arg_buffers.push_back(buffer->buffer()); + if (arg.get_type().is(device_array)) { + if (!HasTrivialLazyExpr(arg)) { + return InvalidArgument( + "Non-trivial lazy expression not supported in C++. " + "Falling back to Python."); + } + + PyBuffer* buffer = py::cast(arg.attr("device_buffer")); + if (buffer->device().contents == data_device) { + arg_buffers.push_back(buffer->buffer()); + } else { + // source and target platforms are the same, but different device. + // Perform a device-to-device copy. + // buffers from different XLA backends are passed through the host. + std::unique_ptr copied_buffer = + ValueOrThrow(buffer->buffer()->CopyToDevice(data_device)); + arg_buffers.push_back(copied_buffer.get()); + keep_alive.emplace_back(std::move(copied_buffer)); + } + ArgSignature sig; sig.dtype = buffer->shape().element_type(); sig.shape.assign(buffer->shape().dimensions().begin(), @@ -570,12 +661,17 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, // TODO(jblespiau): Can we improve this call? Do we need the underlying // GlobalPyRefManager() and co? py::array numpy_array = py::cast(arg); + if (IsFloat0(numpy_array)) { + return InvalidArgument( + "float0 numpy arrays not supported in C++. 
" + "It will fallback to Python."); + } // If jax_enable_x64 is not set, we need to coerce 32 bits types. // Note that this is calling back to Python! if (!jax_enable_x64) { const py::dtype* to_dtype = DtypeTo32BitDtype(numpy_array.dtype()); if (to_dtype) { - numpy_array = array(numpy_array, to_dtype); + numpy_array = np_array(numpy_array, *to_dtype); } } std::unique_ptr buffer = @@ -587,6 +683,7 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, ArgSignature sig; sig.dtype = buffer->shape().element_type(); + sig.weak_type = false; sig.shape.assign(buffer->shape().dimensions().begin(), buffer->shape().dimensions().end()); arguments.signature.dynamic_args_signatures.push_back(sig); @@ -612,121 +709,149 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, } // namespace -CacheEntry& CompiledFunction::GetCacheEntry( - const py::args& args, const py::kwargs& kwargs, - const CallSignature& signature, - absl::optional cache_miss_return) { +CacheEntry* CompiledFunction::GetCacheEntryIfPresent( + const CallSignature& signature) { auto found_iterator = executables_.find(signature); if (found_iterator != executables_.end()) { // Cache hit! - return *(found_iterator->second); + if (!found_iterator->second->compilation_complete.HasBeenNotified()) { + py::gil_scoped_release gil_release; + found_iterator->second->compilation_complete.WaitForNotification(); + } + if (found_iterator->second->compilation_error) { + throw std::invalid_argument( + found_iterator->second->compilation_error.value().error_message()); + } + return found_iterator->second.get(); } - return SetAndReturnCacheEntry(args, kwargs, signature, cache_miss_return); + return nullptr; } -CacheEntry& CompiledFunction::SetAndReturnCacheEntry( - const py::args& args, const py::kwargs& kwargs, - const CallSignature& signature, - absl::optional cache_miss_return) { + +CacheEntry* CompiledFunction::AddCacheEntry(const py::args& args, + const py::kwargs& kwargs, + const CallSignature& signature, + py::object out_and_fastpath_data) { // We need to insert the element. auto result = executables_.emplace(signature, std::make_unique()); auto it = result.first; - + CacheEntry* cache_entry = it->second.get(); // CallSignatures in the cache own their keyword argument reference. result.first->first.IncRef(); - // Cache miss? Call the Python cache miss function. 
- py::tuple executable_and_pytree; - if (cache_miss_return) { - executable_and_pytree = cache_miss_return.value(); - } else { - executable_and_pytree = cache_miss_fun_(*args, **kwargs); + py::tuple tuple = py::cast(out_and_fastpath_data); + CHECK_EQ(tuple.size(), 2); + if (tuple[1].is_none()) { + cache_entry->fall_back_to_python = true; + cache_entry->compilation_complete.Notify(); + return cache_entry; } - if (executable_and_pytree.size() != 4) { - throw std::runtime_error( - "AssertionError: The cache miss function should return 4 " - "arguments."); + + py::tuple executable_handlers_out_tree = py::cast(tuple[1]); + CHECK_EQ(executable_handlers_out_tree.size(), 3); + + auto executable = py::cast>( + executable_handlers_out_tree[0]); + std::vector handlers; + for (const auto& handler : + py::cast(executable_handlers_out_tree[1])) { + handlers.push_back(py::cast(handler)); } - it->second->executable = py::cast>( - std::move(executable_and_pytree[0])); + auto out_tree = py::cast(executable_handlers_out_tree[2]); + + cache_entry->executable = std::move(executable); int num_devices = - it->second->executable->pjrt_executable().local_devices().size(); - if (num_devices != 1) { - throw std::runtime_error(absl::StrCat( - "Running on more than a single device is not currently supported." - "The underlying PjRtExecutable has ", - num_devices)); - } - it->second->device = - it->second->executable->pjrt_executable().local_devices()[0]; - it->second->out_pytree_def = py::cast(executable_and_pytree[1]); + cache_entry->executable->pjrt_executable().local_devices().size(); + // The presence of jit(pmap) is detected from Python. + CHECK_EQ(num_devices, 1); - py::list shaped_arrays = - py::reinterpret_borrow(executable_and_pytree[2]); - py::list lazy_expressions = - py::reinterpret_borrow(executable_and_pytree[3]); + cache_entry->handlers = std::move(handlers); + cache_entry->out_pytree_def = std::move(out_tree); - it->second->out_avals.reserve(shaped_arrays.size()); - it->second->out_lazy_exprs.reserve(lazy_expressions.size()); - - int num_outputs = shaped_arrays.size(); - for (int i = 0; i < num_outputs; ++i) { - py::object shaped_array = - py::reinterpret_borrow(shaped_arrays[i]); - py::object lazy_expr = - py::reinterpret_borrow(lazy_expressions[i]); - - it->second->out_avals.push_back(shaped_array); - it->second->out_lazy_exprs.push_back(lazy_expr); - } - - return *(it->second); + cache_entry->compilation_complete.Notify(); + return cache_entry; } py::object CompiledFunction::Call(py::args args, py::kwargs kwargs) { + if (always_fallback_to_python_) { + return py::cast(cache_miss_(*args, **kwargs))[0]; + } + // Delayed values are retrieved on the first call to `Call`. + if (!default_device_) { + // As we are calling Python code, that may release the GIL, we first hold + // mu_ before holding the GIL. + py::gil_scoped_release gil_release; + { + absl::MutexLock lock1(&mu_); + py::gil_scoped_acquire gil_aquire; + + jax_enable_x64_ = py::cast(get_jax_enable_x64_()); + jax_disable_jit_ = py::cast(get_jax_disable_jit_()); + if (!default_device_) { + py::object device_and_is_committed = get_device_(); + try { + default_pydevice_ = py::cast>( + device_and_is_committed.attr("default_device")); + } catch (const py::cast_error& e) { + // Pathways and Cloud TPU 2VM runtime. 
+ always_fallback_to_python_ = true; + return py::cast(cache_miss_(*args, **kwargs))[0]; + } + default_pyclient_ = default_pydevice_.client; + default_device_ = default_pydevice_.contents; + if (!default_device_) { // UPTC + always_fallback_to_python_ = true; + return py::cast(cache_miss_(*args, **kwargs))[0]; + } + is_committed_ = + py::cast(device_and_is_committed.attr("committed_to_device")); + } + } + } + CHECK(default_device_); if (JitIsDisabled()) { return fun_(*args, **kwargs); } ParsedArgumentsAsBuffers arguments; FlattenArguments(args, kwargs, static_argnums_, arguments); - absl::optional cache_miss_result = absl::nullopt; - if (!default_device_) { - cache_miss_result = cache_miss_fun_(*args, **kwargs); - auto executable = py::cast>( - cache_miss_result.value()[0]); - - pyclient_ = executable->client(); - default_device_ = executable->LocalDevices()[0].contents; - } - - // The C++ jit do not support Tracers arguments yet. The Python-based jit - // function will be called if any of the dynamic arguments is unsupported. - if (!ConvertArgsToBuffers(jax_enable_x64_, *pyclient_, default_device_, - arguments) + // The C++ jit does not support Tracer arguments yet. The Python-based + // jit function will be called if any of the dynamic arguments is unsupported. + if (!ConvertArgsToBuffers(jax_enable_x64_.value(), *default_pyclient_, + default_device_, is_committed_, arguments) .ok()) { - return python_f_jitted_(*args, **kwargs); + return py::cast(cache_miss_(*args, **kwargs))[0]; } - CacheEntry& cache_entry = - GetCacheEntry(args, kwargs, arguments.signature, cache_miss_result); + CacheEntry* cache_entry = GetCacheEntryIfPresent(arguments.signature); + if (!cache_entry) { + py::object out_and_fastpath_data = cache_miss_(*args, **kwargs); + cache_entry = GetCacheEntryIfPresent(arguments.signature); + if (!cache_entry) { + cache_entry = AddCacheEntry(args, kwargs, arguments.signature, + out_and_fastpath_data); + } + CHECK(cache_entry); + if (cache_entry->fall_back_to_python) { + return py::cast(out_and_fastpath_data)[0]; + } + // As we have already computed the result, we can return it. + // It's even *required*, e.g. if there are donated arguments, because + // otherwise the buffer which has already been donated would be invalid. 
+ return py::cast(out_and_fastpath_data)[0]; + } + CHECK(cache_entry); + if (cache_entry->fall_back_to_python) { + return py::cast(cache_miss_(*args, **kwargs))[0]; + } std::vector> outputs = - ValueOrThrow(cache_entry.executable->PjRtExecute(arguments.arg_buffers)); - - static const auto* xla_module = - new py::module(py::module::import("jax.interpreters.xla")); - const auto& device_array = xla_module->attr("DeviceArray"); - - const std::vector& out_avals = cache_entry.out_avals; - const std::vector& out_lazy_exprs = cache_entry.out_lazy_exprs; + ValueOrThrow(cache_entry->executable->PjRtExecute(arguments.arg_buffers)); + const std::vector& handlers = cache_entry->handlers; py::list flat_device_arrays; for (int i = 0; i < outputs.size(); ++i) { - flat_device_arrays.append(device_array( - /*aval=*/out_avals[i], /*device=*/outputs[i]->device(), - /*lazy_expr=*/out_lazy_exprs[i], - /*device_buffer=*/std::move(outputs[i]))); + flat_device_arrays.append(handlers[i](std::move(outputs[i]))); } - return cache_entry.out_pytree_def.Unflatten(flat_device_arrays); + return cache_entry->out_pytree_def.Unflatten(flat_device_arrays); } } // namespace @@ -743,17 +868,28 @@ void BuildJaxjitSubmodule(pybind11::module& m) { jitlib.def("get_disable_jit", &GetDisableJit); jitlib.def( "jit", - [](py::function fun, py::function cache_miss_fun, - py::function fallback_on_unsupported_argument, bool jax_enable_x64, - bool jax_disable_jit, + [](py::function fun, py::function cache_miss, py::function get_device, + py::function get_jax_enable_x64, py::function get_jax_disable_jit, std::vector static_argnums) -> std::unique_ptr { return std::make_unique( - std::move(fun), std::move(cache_miss_fun), - std::move(fallback_on_unsupported_argument), jax_enable_x64, - jax_disable_jit, std::move(static_argnums)); + std::move(fun), std::move(cache_miss), std::move(get_device), + std::move(get_jax_enable_x64), std::move(get_jax_disable_jit), + std::move(static_argnums)); }); // Only for testing purposes + cfun.def("_cache_size", &CompiledFunction::cache_size); + jitlib.def("_DtypeTo32BitDtype", [](const py::object obj) -> py::object { + py::dtype dtype = py::dtype::from_args(obj); + const py::dtype* res = DtypeTo32BitDtype(dtype); + if (res) { + return *res; + } else { + return py::none(); + } + }); + jitlib.def("_is_float0", &IsFloat0); + jitlib.def("_is_trivial", &HasTrivialLazyExpr); jitlib.def("_ScalarToBuffer", [](py::handle scalar, bool jax_enable_x64, std::shared_ptr client) { xla::PjRtClient* pjrt_client = client->pjrt_client(); diff --git a/tensorflow/compiler/xla/python/ops.cc b/tensorflow/compiler/xla/python/ops.cc index 3ac4709b160..04e68f9a563 100644 --- a/tensorflow/compiler/xla/python/ops.cc +++ b/tensorflow/compiler/xla/python/ops.cc @@ -23,6 +23,7 @@ limitations under the License. 
#include "pybind11/attr.h" #include "pybind11/pybind11.h" #include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/client/lib/lu_decomposition.h" #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/lib/qr.h" #include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" @@ -186,6 +187,13 @@ void BuildOpsSubmodule(py::module* m) { return std::make_pair(qr.q, qr.r); }, py::arg("operand"), py::arg("full_matrices")); + ops.def( + "LU", + [](XlaOp a) -> StatusOr> { + LuDecompositionResult lu = LuDecomposition(a); + return std::make_tuple(lu.lu, lu.pivots, lu.permutation); + }, + py::arg("operand")); ops.def( "Eigh", [](XlaOp a, bool lower, int64 max_iter, @@ -283,6 +291,7 @@ void BuildOpsSubmodule(py::module* m) { ops.def("RandomGammaGrad", &RandomGammaGrad, py::arg("a"), py::arg("x")); ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta, py::arg("a"), py::arg("b"), py::arg("x")); + ops.def("Zeta", &Zeta, py::arg("x"), py::arg("q")); #define BINARY_OP(op) \ ops.def( \ diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc index 6df11322564..07b915c640c 100644 --- a/tensorflow/compiler/xla/python/py_client.cc +++ b/tensorflow/compiler/xla/python/py_client.cc @@ -30,6 +30,8 @@ namespace xla { namespace py = pybind11; namespace pprof = tensorflow::tfprof::pprof; +PyClient::PyClient(std::unique_ptr pjrt_client) + : pjrt_client_(std::move(pjrt_client)) {} PyClient::PyClient(std::shared_ptr pjrt_client) : pjrt_client_(std::move(pjrt_client)) {} diff --git a/tensorflow/compiler/xla/python/py_client.h b/tensorflow/compiler/xla/python/py_client.h index f12a4ae4f0a..08249722d6c 100644 --- a/tensorflow/compiler/xla/python/py_client.h +++ b/tensorflow/compiler/xla/python/py_client.h @@ -88,6 +88,7 @@ ClientAndPtr WrapWithClient(std::shared_ptr client, T* contents) { // We use a wrapper class to add Python-specific functionality. 
class PyClient : public std::enable_shared_from_this { public: + explicit PyClient(std::unique_ptr pjrt_client); explicit PyClient(std::shared_ptr pjrt_client); PjRtClient* pjrt_client() const { return pjrt_client_.get(); } diff --git a/tensorflow/compiler/xla/python/tpu_driver/BUILD b/tensorflow/compiler/xla/python/tpu_driver/BUILD index 4725becdedf..bda1db6a466 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/BUILD @@ -1,24 +1,24 @@ -load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library") load("//tensorflow:tensorflow.bzl", "tf_grpc_cc_dependency") load( "//tensorflow/compiler/xla/python/tpu_driver:platform/external/tools.bzl", "external_deps", "go_grpc_library", - "go_proto_library", ) licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) -tf_proto_library_cc( +tf_proto_library( name = "tpu_driver_proto", srcs = ["tpu_driver.proto"], cc_api_version = 2, protodeps = [], ) -tf_proto_library_cc( +tf_proto_library( name = "tpu_service_proto", srcs = ["tpu_service.proto"], has_services = 1, @@ -77,6 +77,7 @@ cc_library( cc_library( name = "direct_tpu_driver", srcs = ["direct_tpu_driver.cc"], + compatible_with = [], deps = [ ":tpu_driver", "@com_google_absl//absl/strings:str_format", @@ -115,10 +116,22 @@ cc_library( alwayslink = 1, ) -go_proto_library( - name = "tpu_service_go_proto", - compatible_with = ["//buildenv/target:gce"], - deps = [":tpu_service_proto"], +cc_library( + name = "pod_tpu_driver", + srcs = ["pod_tpu_driver.cc"], + deps = [ + ":grpc_tpu_driver", + ":tpu_driver", + ":tpu_driver_proto_cc", + "@com_google_absl//absl/container:btree", + "@com_google_absl//absl/container:flat_hash_set", + "//tensorflow/compiler/xla/pjrt:semaphore", + "//tensorflow/compiler/xla/pjrt:worker_thread", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + tf_grpc_cc_dependency(), + ] + external_deps(), + alwayslink = 1, ) go_grpc_library( diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD index c460cc36f08..9d98d0cf654 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/client/BUILD @@ -1,3 +1,9 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "pybind_extension") package( @@ -11,6 +17,7 @@ cc_library( hdrs = [ "tpu_client.h", ], + compatible_with = [], deps = [ "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", @@ -24,11 +31,12 @@ cc_library( "//tensorflow/compiler/xla/python/tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:direct_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:grpc_tpu_driver", + "//tensorflow/compiler/xla/python/tpu_driver:pod_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:recording_tpu_driver", "//tensorflow/compiler/xla/python/tpu_driver:tpu_driver_proto_cc", "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:shaped_buffer", - "//tensorflow/core:allocator", + "//tensorflow/core/framework:allocator", "//tensorflow/core/platform:env", "//tensorflow/core/profiler/lib:traceme", "@com_google_absl//absl/memory", 
diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc index e4fb2cdfd41..0602d096aaa 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.cc @@ -588,7 +588,7 @@ PyTpuExecutable::ExecuteResult PyTpuExecutable::ExecuteHelper( static const absl::Duration kWarnExecutionDelay = absl::Seconds(10); // Delay before terminating a stalled execute call. -static const absl::Duration kMaxExecutionDelay = absl::Seconds(120); +static const absl::Duration kMaxExecutionDelay = absl::Minutes(60); Status WaitForExecuteEvent(tpu_driver::Event* event) { absl::optional opt_status; diff --git a/tensorflow/compiler/xla/python/tpu_driver/platform/external/tools.bzl b/tensorflow/compiler/xla/python/tpu_driver/platform/external/tools.bzl index 99b07b6c787..12a4390d317 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/platform/external/tools.bzl +++ b/tensorflow/compiler/xla/python/tpu_driver/platform/external/tools.bzl @@ -16,10 +16,6 @@ Build dependencies and utilities for the TPU driver interface. """ -def go_proto_library(**kwargs): - # A dummy macro placeholder for compatibility reason. - pass - def go_grpc_library(**kwargs): # A dummy macro placeholder for compatibility reason. pass diff --git a/tensorflow/compiler/xla/python/tpu_driver/pod_tpu_driver.cc b/tensorflow/compiler/xla/python/tpu_driver/pod_tpu_driver.cc new file mode 100644 index 00000000000..a5a6cbabb82 --- /dev/null +++ b/tensorflow/compiler/xla/python/tpu_driver/pod_tpu_driver.cc @@ -0,0 +1,977 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================= + +#include "absl/container/btree_map.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_split.h" +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/xla/pjrt/semaphore.h" +#include "tensorflow/compiler/xla/pjrt/worker_thread.h" +#include "tensorflow/compiler/xla/python/tpu_driver/grpc_tpu_driver.h" +#include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.h" +#include "tensorflow/compiler/xla/python/tpu_driver/tpu_driver.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" + +namespace tpu_driver { +namespace { + +#define CHECK_EXISTS_OR_RETURN(container, target_op_id, operation_id) \ + { \ + auto p = CheckHandleExists(container, target_op_id, operation_id); \ + if (p != nullptr) return p; \ + } + +using xla::Status; +using xla::WorkerThread; + +const char kPodTpuDriverPrefix[] = "grpc+pod://"; + +class PodTpuDriver; + +class PodEvent : public Event { + public: + explicit PodEvent(PodTpuDriver* driver, int64_t operation_id) + : driver_(driver), operation_id_(operation_id) {} + int64_t operation_id() const { return operation_id_; } + + xla::Status Await() override; + + absl::optional AwaitWithTimeout( + absl::Duration duration) override; + + void AddCallback(std::function callback) override; + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; +}; + +class ErrorEvent : public PodEvent { + public: + explicit ErrorEvent(PodTpuDriver* driver, int64_t operation_id, Status status) + : PodEvent(driver, operation_id) { + status_ = status; + } + + xla::Status Await() override { return status_; } + absl::optional AwaitWithTimeout( + absl::Duration duration) override { + return status_; + } + void AddCallback(std::function callback) override { + callback(status_); + } + + private: + Status status_; +}; + +class CombinedEvent : public PodEvent { + public: + explicit CombinedEvent(PodTpuDriver* driver, int64_t operation_id, + std::vector> events) + : PodEvent(driver, operation_id), events_(events) { + for (auto& event : events_) { + event->AddCallback([this](Status s) { IncrementAndCheckComplete(s); }); + } + } + + xla::Status Await() override { + for (auto& event : events_) { + TF_RETURN_IF_ERROR(event->Await()); + } + return Status::OK(); + } + + absl::optional AwaitWithTimeout( + absl::Duration duration) override { + for (auto& event : events_) { + auto start_time = absl::Now(); + auto status = event->AwaitWithTimeout(duration); + duration -= absl::Now() - start_time; + if (status == absl::nullopt) { + return absl::nullopt; + } else { + TF_RETURN_IF_ERROR(status.value()); + } + } + return Status::OK(); + } + + void AddCallback(std::function callback) + TF_LOCKS_EXCLUDED(mu_) override { + bool all_events_completed = false; + { + absl::MutexLock l(&mu_); + all_events_completed = events_completed_ == events_.size(); + } + if (all_events_completed) { + callback(event_status_); + } else { + absl::MutexLock l(&mu_); + callbacks_.push_back(std::move(callback)); + } + } + + private: + void IncrementAndCheckComplete(Status s) TF_LOCKS_EXCLUDED(mu_) { + std::vector> callbacks; + { + absl::MutexLock l(&mu_); + + event_status_ = s; + events_completed_++; + if (events_completed_ == events_.size()) { + // Copy callbacks to a temporary to be invoked outside the mutex. 
+ callbacks.assign(callbacks_.begin(), callbacks_.end()); + callbacks_.clear(); + } else { + return; + } + } + + for (const auto& callback : callbacks) { + callback(event_status_); + } + } + + absl::Mutex mu_; + std::vector> events_; + std::vector> callbacks_ ABSL_GUARDED_BY(mu_); + int64_t events_completed_ ABSL_GUARDED_BY(mu_) = 0; + Status event_status_; +}; + +class PodBufferHandle : public BufferHandle { + public: + explicit PodBufferHandle(PodTpuDriver* driver, int64_t operation_id, + int64_t size_in_bytes, + absl::optional shape, + int64_t core_id) + : driver_(driver), + operation_id_(operation_id), + size_in_bytes_(size_in_bytes), + shape_(shape), + event_(std::make_shared(driver_, operation_id_)), + core_id_(core_id) {} + + std::shared_ptr OnReady() override { return event_; } + int64_t size_in_bytes() override { return size_in_bytes_; } + absl::optional shape() override { return shape_; } + + int64_t operation_id() const { return operation_id_; } + int64_t core_id() const { return core_id_; } + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; + const int64_t size_in_bytes_; + const absl::optional shape_; + std::shared_ptr event_; + const int64_t core_id_; +}; + +class PodCompiledProgramHandle : public CompiledProgramHandle { + public: + explicit PodCompiledProgramHandle(PodTpuDriver* driver, int64_t operation_id) + : driver_(driver), + operation_id_(operation_id), + event_(std::make_shared(driver_, operation_id_)) {} + + std::shared_ptr OnReady() override { return event_; } + + xla::Status program_shape(xla::ProgramShapeProto* program_shape) override; + + int64_t operation_id() const { return operation_id_; } + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; + std::shared_ptr event_; +}; + +class PodLoadedProgramHandle : public LoadedProgramHandle { + public: + explicit PodLoadedProgramHandle(PodTpuDriver* driver, int64_t operation_id, + int64_t core_id) + : driver_(driver), + operation_id_(operation_id), + core_id_(core_id), + event_(std::make_shared(driver_, operation_id_)) {} + + std::shared_ptr OnReady() override { return event_; } + + int64_t operation_id() const { return operation_id_; } + int64_t core_id() const { return core_id_; } + + private: + PodTpuDriver* driver_; + const int64_t operation_id_; + const int64_t core_id_; + std::shared_ptr event_; +}; + +struct EventInFlight { + EventInFlight() + : underlying_event(nullptr), + create_fn(nullptr), + incomplete_deps(), + callbacks() {} + + std::shared_ptr underlying_event; + std::function(void)> create_fn; + + absl::flat_hash_set incomplete_deps; + std::vector> callbacks; +}; + +class PodTpuDriver : public TpuDriver { + public: + explicit PodTpuDriver(const TpuDriverConfig& config, + std::shared_ptr<::grpc::ChannelCredentials> creds) + : config_(config), + creds_(creds), + event_thread_(tensorflow::Env::Default(), "grpc_pod_event_thread") { + std::vector workers = absl::StrSplit( + absl::StripPrefix(config.worker(), kPodTpuDriverPrefix), ','); + + int worker_count = 0; + + // Flag for environments where local core # == all cores in TPU system #, + // which means that we are connecting to separate TPU systems or we are in + // a test environment. 
+ bool in_local_core_environment = false; + + for (const auto& worker : workers) { + TpuDriverConfig worker_config(config_); + *(worker_config.mutable_worker()) = absl::StrCat("grpc://", worker); + auto tpu_driver = + CreateGrpcTpuDriver(worker_config, creds_).ConsumeValueOrDie(); + + SystemInfo driver_info; + tpu_driver->QuerySystemInfo(&driver_info); + + if (driver_info.core_count() == driver_info.local_core_size()) { + drivers_.insert({worker_count, std::move(tpu_driver)}); + in_local_core_environment = true; + } else { + drivers_.insert({driver_info.host_id(), std::move(tpu_driver)}); + } + + worker_count++; + } + + absl::flat_hash_set> processed_chips; + + for (int driver_num = 0; driver_num < workers.size(); ++driver_num) { + SystemInfo driver_info; + drivers_[driver_num]->QuerySystemInfo(&driver_info); + + for (const auto& tpu_chip : driver_info.tpu_chip()) { + std::tuple coord{tpu_chip.chip_coord().x(), + tpu_chip.chip_coord().y(), + tpu_chip.chip_coord().z()}; + // We only want to add chips that we have not seen before if we are in a + // TPU pod slice, or we are only seeing local cores (e.g. we are + // connected to individual TPUs or we are in a test environment). + if (!processed_chips.contains(coord) || + driver_info.core_count() == driver_info.local_core_size()) { + *(pod_info_.add_tpu_chip()) = tpu_chip; + processed_chips.insert(coord); + } + } + + *(pod_info_.mutable_cpu()) = driver_info.cpu(); + } + + // Process all the unique chips that we have seen. + int core_count = 0; + for (auto& tpu_chip : *pod_info_.mutable_tpu_chip()) { + for (auto& tpu_core : *tpu_chip.mutable_core()) { + int current_core = tpu_core.id(); + if (in_local_core_environment) { + current_core = core_count; + } + + core_to_driver_.insert( + {current_core, drivers_[tpu_chip.host_id()].get()}); + core_to_driver_id_.insert({current_core, tpu_chip.host_id()}); + core_to_driver_core_.insert({current_core, tpu_core.id()}); + + tpu_core.set_id(current_core); + tpu_core.set_core_on_host_index(current_core); + *(pod_info_.add_local_core()) = tpu_core; + + core_count++; + } + + // We are setting host_id to zero because we want this to look like one + // host with many cores from the perspective of tpu_client.cc. + tpu_chip.set_host_id(0); + } + + pod_info_.set_chip_count(pod_info_.tpu_chip_size()); + pod_info_.set_core_count(pod_info_.local_core_size()); + + // We want this to look like one host with many TPU chips/cores connected. + pod_info_.set_host_count(1); + pod_info_.set_host_id(0); + } + + ~PodTpuDriver() override { + // TODO(frankchn): Unload all handles, and wait for all events to finish. 
+ } + + void QuerySystemInfo(SystemInfo* system_info) override { + *system_info = pod_info_; + } + + xla::Status Reset() override { + for (auto& driver : drivers_) { + TF_RETURN_IF_ERROR(driver.second->Reset()); + } + return xla::Status::OK(); + } + + std::unique_ptr Allocate( + int32_t core_id, MemoryRegion region, int64_t num_bytes, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + ScheduleRequest( + operation_id, + [this, core_id, region, num_bytes, + operation_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + underlying_buffers_.insert( + {operation_id, + core_to_driver_[core_id]->Allocate(core_to_driver_core_[core_id], + region, num_bytes, {})}); + return underlying_buffers_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique(this, operation_id, num_bytes, + absl::nullopt, core_id); + } + + std::unique_ptr Allocate( + int32_t core_id, MemoryRegion region, const xla::ShapeProto& shape, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + ScheduleRequest( + operation_id, + [this, core_id, region, shape, + operation_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + underlying_buffers_.insert( + {operation_id, + core_to_driver_[core_id]->Allocate(core_to_driver_core_[core_id], + region, shape, {})}); + return underlying_buffers_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique( + this, operation_id, ComputeBytesFromShape(shape), shape, core_id); + } + + std::unique_ptr AllocateTuple( + int32_t core_id, MemoryRegion region, + absl::Span children, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + std::vector children_ids; + for (int i = 0; i < children.size(); ++i) { + auto child_op_id = + static_cast(children[i])->operation_id(); + deps.insert(child_op_id); + children_ids.push_back(child_op_id); + } + + ScheduleRequest( + operation_id, + [this, core_id, region, children_ids, + operation_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) + -> std::shared_ptr { + std::vector child_buffers; + child_buffers.reserve(children_ids.size()); + for (int i = 0; i < children_ids.size(); ++i) { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, children_ids[i], + operation_id); + child_buffers.push_back(underlying_buffers_[children_ids[i]].get()); + } + + underlying_buffers_.insert( + {operation_id, + core_to_driver_[core_id]->AllocateTuple( + core_to_driver_core_[core_id], region, child_buffers, {})}); + return underlying_buffers_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique(this, operation_id, 0, + absl::nullopt, core_id); + } + + std::shared_ptr Deallocate( + std::unique_ptr handle, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(handle.get())->operation_id()); + + auto op_id = static_cast(handle.get())->operation_id(); + auto core_id = static_cast(handle.get())->core_id(); + + ScheduleRequest( + operation_id, + [this, operation_id, op_id, + core_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) -> std::shared_ptr { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, op_id, operation_id); + + auto buf_iter = underlying_buffers_.find(op_id); + auto underlying_hn = std::move(buf_iter->second); + underlying_buffers_.erase(buf_iter); + + return core_to_driver_[core_id]->Deallocate(std::move(underlying_hn), + {}); + }, + deps); + + return 
std::make_shared(this, operation_id); + } + + std::shared_ptr TransferToDevice( + const void* src, BufferHandle* dst, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(dst)->operation_id()); + + auto op_id = static_cast(dst)->operation_id(); + auto core_id = static_cast(dst)->core_id(); + + ScheduleRequest( + operation_id, + [this, src, operation_id, op_id, + core_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) -> std::shared_ptr { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, op_id, operation_id); + + auto buf_iter = underlying_buffers_.find(op_id); + return core_to_driver_[core_id]->TransferToDevice( + src, buf_iter->second.get(), {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr TransferFromDevice( + const BufferHandle* src, void* dst, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(src)->operation_id()); + + auto op_id = static_cast(src)->operation_id(); + auto core_id = static_cast(src)->core_id(); + + ScheduleRequest( + operation_id, + [this, dst, operation_id, op_id, + core_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) -> std::shared_ptr { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, op_id, operation_id); + auto buf_iter = underlying_buffers_.find(op_id); + return core_to_driver_[core_id]->TransferFromDevice( + buf_iter->second.get(), dst, {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr TransferFromDeviceToDevice( + const BufferHandle* src, BufferHandle* dst, + absl::Span wait_for) override { + auto src_core_id = static_cast(src)->core_id(); + auto dst_core_id = static_cast(dst)->core_id(); + + auto src_driver_id = core_to_driver_id_[src_core_id]; + auto dst_driver_id = core_to_driver_id_[dst_core_id]; + + if (src_driver_id == dst_driver_id) { + // They are in the same host, we can schedule it normally + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(src)->operation_id()); + deps.insert(static_cast(dst)->operation_id()); + + auto src_op_id = static_cast(src)->operation_id(); + auto dst_op_id = static_cast(dst)->operation_id(); + + ScheduleRequest( + operation_id, + [this, operation_id, src_op_id, dst_op_id, dst_core_id]() + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) -> std::shared_ptr { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, src_op_id, + operation_id); + CHECK_EXISTS_OR_RETURN(underlying_buffers_, dst_op_id, + operation_id); + + auto src_iter = underlying_buffers_.find(src_op_id); + auto dst_iter = underlying_buffers_.find(dst_op_id); + return core_to_driver_[dst_core_id]->TransferFromDeviceToDevice( + src_iter->second.get(), dst_iter->second.get(), {}); + }, + deps); + return std::make_shared(this, operation_id); + } else { + // src and dst are on different hosts, we have to bounce through us. 
+ auto dst_size = dst->size_in_bytes(); + char* host_buf = new char[dst_size]; + + auto src_event = TransferFromDevice(src, host_buf, wait_for); + auto dst_event = TransferToDevice(host_buf, dst, {src_event.get()}); + dst_event->AddCallback( + [src_event, host_buf](xla::Status status) { delete[] host_buf; }); + return dst_event; + } + } + + std::unique_ptr CompileProgram( + const xla::HloProto& source, int32_t num_replicas, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + + ScheduleRequest( + operation_id, + [this, operation_id, source, + num_replicas]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + auto cph_iterator = + underlying_cph_ + .insert( + {operation_id, + std::vector>()}) + .first; + + std::vector> collected_events; + for (int i = 0; i < drivers_.size(); ++i) { + auto current_cph = + drivers_[i]->CompileProgram(source, num_replicas, {}); + cph_iterator->second.push_back(std::move(current_cph)); + collected_events.push_back(cph_iterator->second[i]->OnReady()); + } + return std::make_shared(this, operation_id, + collected_events); + }, + deps); + + return absl::make_unique(this, operation_id); + } + + std::unique_ptr LoadProgram( + int32_t core_id, const CompiledProgramHandle* handle, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert( + static_cast(handle)->operation_id()); + auto cph_op_id = + static_cast(handle)->operation_id(); + + ScheduleRequest( + operation_id, + [this, operation_id, cph_op_id, + core_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) -> std::shared_ptr { + CHECK_EXISTS_OR_RETURN(underlying_cph_, cph_op_id, operation_id); + auto cph_iter = underlying_cph_.find(cph_op_id); + + underlying_lph_.insert( + {operation_id, + core_to_driver_[core_id]->LoadProgram( + core_to_driver_core_[core_id], + cph_iter->second[core_to_driver_id_[core_id]].get(), {})}); + + return underlying_lph_[operation_id]->OnReady(); + }, + deps); + + return absl::make_unique(this, operation_id, + core_id); + } + + std::shared_ptr UnloadProgram( + std::unique_ptr handle, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + auto deps = GetDependencyOperationIds(wait_for); + deps.insert( + static_cast(handle.get())->operation_id()); + auto op_id = + static_cast(handle.get())->operation_id(); + auto core_id = + static_cast(handle.get())->core_id(); + + ScheduleRequest( + operation_id, + [this, operation_id, op_id, + core_id]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) -> std::shared_ptr { + CHECK_EXISTS_OR_RETURN(underlying_lph_, op_id, operation_id); + auto lph_iter = underlying_lph_.find(op_id); + auto event = core_to_driver_[core_id]->UnloadProgram( + std::move(lph_iter->second), {}); + underlying_lph_.erase(lph_iter); + + return event; + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::shared_ptr ExecuteProgram( + LoadedProgramHandle* program, absl::Span inputs, + absl::Span outputs, + const xla::DeviceAssignmentProto& device_assignment, + absl::Span wait_for) override { + int64_t operation_id = GetOperationId(); + + auto deps = GetDependencyOperationIds(wait_for); + deps.insert(static_cast(program)->operation_id()); + + auto op_id = static_cast(program)->operation_id(); + auto core_id = static_cast(program)->core_id(); + + std::vector input_op_ids; + std::vector output_op_ids; + + for (auto* input : inputs) { + auto input_dep = + static_cast(input)->operation_id(); + 
input_op_ids.push_back(input_dep); + deps.insert(input_dep); + } + for (auto* output : outputs) { + auto output_dep = + static_cast(output)->operation_id(); + output_op_ids.push_back(output_dep); + deps.insert(output_dep); + } + + ScheduleRequest( + operation_id, + [this, operation_id, core_id, op_id, input_op_ids, output_op_ids, + device_assignment]() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) + -> std::shared_ptr { + std::vector underlying_inputs; + std::vector underlying_outputs; + + underlying_inputs.reserve(input_op_ids.size()); + for (auto input_op_id : input_op_ids) { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, input_op_id, + operation_id); + underlying_inputs.push_back(underlying_buffers_[input_op_id].get()); + } + underlying_outputs.reserve(output_op_ids.size()); + for (auto output_op_id : output_op_ids) { + CHECK_EXISTS_OR_RETURN(underlying_buffers_, output_op_id, + operation_id); + underlying_outputs.push_back( + underlying_buffers_[output_op_id].get()); + } + + CHECK_EXISTS_OR_RETURN(underlying_lph_, op_id, operation_id); + LoadedProgramHandle* handle = underlying_lph_[op_id].get(); + return core_to_driver_[core_id]->ExecuteProgram( + handle, underlying_inputs, underlying_outputs, device_assignment, + {}); + }, + deps); + + return std::make_shared(this, operation_id); + } + + std::unique_ptr GetLinearizer() override { + return drivers_[0]->GetLinearizer(); + } + + // Helper methods for Event scheduling + + absl::optional WaitForEvent(int64_t event_id, absl::Duration duration) + TF_LOCKS_EXCLUDED(mu_) { + std::shared_ptr underlying_event; + + { + absl::MutexLock l(&mu_); + auto event = events_.find(event_id); + + if (event == events_.end()) { + auto event_status = abnormal_event_status_.find(event_id); + if (event_status == abnormal_event_status_.end()) { + return Status::OK(); + } else { + return event_status->second; + } + } + + auto done = [this, event_id]() { + mu_.AssertHeld(); + // The event was either completed and erased from the map or we have + // an underlying event available to us. + return events_.count(event_id) == 0 || + (events_[event_id]->underlying_event != nullptr && + events_[event_id]->underlying_event.use_count() != 0); + }; + + auto status = mu_.AwaitWithTimeout(absl::Condition(&done), duration); + if (!status) { + return absl::nullopt; + } + + if (events_.count(event_id) > 0) { + underlying_event = events_[event_id]->underlying_event; + } else { + underlying_event = nullptr; + } + } + + // Wait for the underlying event without holding on to the event_lock_, or + // else incoming events will not be processed. 
+ if (underlying_event != nullptr) { + return underlying_event->AwaitWithTimeout(duration); + } else { + absl::MutexLock l(&mu_); + auto event_status = abnormal_event_status_.find(event_id); + if (event_status == abnormal_event_status_.end()) { + return Status::OK(); + } else { + return event_status->second; + } + } + } + + void AddCallbackForEvent(int64_t event_id, std::function fn) + TF_LOCKS_EXCLUDED(mu_) { + absl::MutexLock l(&mu_); + auto event = events_.find(event_id); + + if (event == events_.end()) { + auto event_status = abnormal_event_status_.find(event_id); + if (event_status == abnormal_event_status_.end()) { + fn(Status::OK()); + } else { + fn(event_status->second); + } + } else { + if (event->second->underlying_event != nullptr && + event->second->underlying_event.use_count() != 0) { + event->second->underlying_event->AddCallback(fn); + } else { + event->second->callbacks.push_back(std::move(fn)); + } + } + } + + xla::Status GetCompiledProgramShape(int64_t op_id, + xla::ProgramShapeProto* program_shape) + TF_LOCKS_EXCLUDED(mu_) { + absl::MutexLock l(&mu_); + + auto done = [this, op_id]() { + mu_.AssertHeld(); + return underlying_cph_.contains(op_id); + }; + mu_.Await(absl::Condition(&done)); + + return underlying_cph_[op_id][0]->program_shape(program_shape); + } + + private: + const TpuDriverConfig& config_; + std::shared_ptr<::grpc::ChannelCredentials> creds_; + + absl::flat_hash_map> drivers_; + absl::flat_hash_map core_to_driver_id_; + absl::flat_hash_map core_to_driver_; + absl::flat_hash_map core_to_driver_core_; + SystemInfo pod_info_; + + absl::Mutex mu_; + + absl::flat_hash_map> + underlying_buffers_ ABSL_GUARDED_BY(mu_); + absl::flat_hash_map>> + underlying_cph_ ABSL_GUARDED_BY(mu_); + absl::flat_hash_map> + underlying_lph_ ABSL_GUARDED_BY(mu_); + + absl::btree_map> events_ + ABSL_GUARDED_BY(mu_); + absl::flat_hash_map abnormal_event_status_ + ABSL_GUARDED_BY(mu_); + + std::atomic operation_id_counter_{0}; + + WorkerThread event_thread_; + + int64_t GetOperationId() { return operation_id_counter_++; } + + absl::flat_hash_set GetDependencyOperationIds( + absl::Span wait_for) { + absl::flat_hash_set deps; + for (auto* event : wait_for) { + deps.insert(static_cast(event)->operation_id()); + } + return deps; + } + + // EventCompleted is executed on the event_thread_ worker thread. We want + // to propagate the fact that the event is completed to any subsequent events + // that might depend on this event. + void EventCompleted(int64_t event_id, Status status) TF_LOCKS_EXCLUDED(mu_) { + absl::MutexLock l(&mu_); + + absl::btree_map>::iterator + curr_event; + if (!status.ok()) abnormal_event_status_.insert({event_id, status}); + curr_event = events_.find(event_id); + + DCHECK(curr_event->second->callbacks.empty()); + DCHECK(curr_event->second->incomplete_deps.empty()); + + for (auto& event : events_) { + event.second->incomplete_deps.erase(event_id); + // The if statement conditions on both + // - all previous events have completed (incomplete_deps.empty()) + // - the op creating this event has not been called yet + // (event.second.create_fn != nullptr) + // We call the create_fn that creates the event and adds any relevant + // callbacks to the actual event, before setting create_fn to nullptr + // to indicate that it has already been called + if (event.second->incomplete_deps.empty() && + event.second->create_fn != nullptr) { + // We were the last unfilled dependency, all other dependencies are + // filled. We can now fire the create function. 
+ event.second->underlying_event = event.second->create_fn(); + for (auto& fn : event.second->callbacks) { + event.second->underlying_event->AddCallback(std::move(fn)); + } + event.second->callbacks.clear(); + event.second->create_fn = nullptr; + } + } + + // We erase the current event to signal that it has finished. + events_.erase(curr_event); + } + + void ScheduleRequest(int64_t operation_id, + std::function(void)> fn, + const absl::flat_hash_set& deps) + TF_LOCKS_EXCLUDED(mu_) { + absl::MutexLock l(&mu_); + absl::btree_map>::iterator event; + absl::flat_hash_set incomplete_deps; + + event = events_.insert({operation_id, absl::make_unique()}) + .first; + for (const auto& dep : deps) { + if (events_.count(dep) > 0) incomplete_deps.insert(dep); + } + + if (incomplete_deps.empty()) { + // All dependencies have been fulfilled, we execute the request + // immediately and add a callback to inform our event fulfilled thread + // when it is done. + event->second->create_fn = nullptr; + event->second->underlying_event = fn(); + event->second->underlying_event->AddCallback( + [this, operation_id](Status status) { + event_thread_.Schedule([this, operation_id, status]() { + EventCompleted(operation_id, status); + }); + }); + } else { + // There are some dependencies that are not yet fulfilled. We attach + // the request to the event, and will execute it in the EventFulfilled + // worker thread when all its dependencies are fulfilled. + event->second->create_fn = std::move(fn); + event->second->incomplete_deps = std::move(incomplete_deps); + event->second->callbacks.push_back([this, operation_id](Status status) { + event_thread_.Schedule([this, operation_id, status]() { + EventCompleted(operation_id, status); + }); + }); + } + } + + template + std::shared_ptr CheckHandleExists( + absl::flat_hash_map& container, int64_t target_op_id, + int64_t operation_id) { + if (container.count(target_op_id) == 0) { + return std::make_shared( + this, operation_id, + tensorflow::errors::InvalidArgument("Handle ", target_op_id, + " does not exist.")); + } + return nullptr; + } +}; + +xla::Status PodEvent::Await() { + return driver_->WaitForEvent(operation_id_, absl::InfiniteDuration()).value(); +} + +absl::optional PodEvent::AwaitWithTimeout( + absl::Duration duration) { + return driver_->WaitForEvent(operation_id_, duration); +} + +void PodEvent::AddCallback(std::function callback) { + driver_->AddCallbackForEvent(operation_id_, std::move(callback)); +} + +xla::StatusOr> CreatePodTpuDriver( + const TpuDriverConfig& config, + std::shared_ptr<::grpc::ChannelCredentials> creds) { + return std::unique_ptr(new PodTpuDriver(config, creds)); +} + +xla::Status PodCompiledProgramHandle::program_shape( + xla::ProgramShapeProto* program_shape) { + return driver_->GetCompiledProgramShape(operation_id(), program_shape); +} + +} // namespace + +REGISTER_TPU_DRIVER(kPodTpuDriverPrefix, + [](const TpuDriverConfig& config) + -> xla::StatusOr> { + return CreatePodTpuDriver( + config, + ::grpc::InsecureChannelCredentials()); // NOLINT + }); + +} // namespace tpu_driver diff --git a/tensorflow/compiler/xla/python/tpu_driver/recording_tpu_driver.cc b/tensorflow/compiler/xla/python/tpu_driver/recording_tpu_driver.cc index da51380c104..49a19cf9e7a 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/recording_tpu_driver.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/recording_tpu_driver.cc @@ -127,8 +127,11 @@ class RecordingLoadedProgramHandle : public LoadedProgramHandle { class RecordingTpuDriver : public TpuDriver { public: 
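
Aside on the REGISTER_TPU_DRIVER(kPodTpuDriverPrefix, ...) registration above: drivers are keyed by a URL-style worker prefix and constructed through a factory lookup. A rough sketch of such a prefix-keyed registry, with invented names (MiniDriver, DriverRegistry) standing in for the real tpu_driver machinery:

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    struct MiniDriver {  // Stand-in for TpuDriver.
      virtual ~MiniDriver() = default;
    };

    using DriverFactory =
        std::function<std::unique_ptr<MiniDriver>(const std::string& worker)>;

    // Maps URL-style prefixes ("grpc://", "record://", ...) to factories.
    class DriverRegistry {
     public:
      static DriverRegistry& Get() {
        static DriverRegistry registry;
        return registry;
      }

      void Register(const std::string& prefix, DriverFactory factory) {
        factories_[prefix] = std::move(factory);
      }

      // Picks the factory whose prefix matches the start of `worker`.
      std::unique_ptr<MiniDriver> Create(const std::string& worker) const {
        for (const auto& [prefix, factory] : factories_) {
          if (worker.rfind(prefix, 0) == 0) return factory(worker);
        }
        return nullptr;  // No registered driver understands this address.
      }

     private:
      std::map<std::string, DriverFactory> factories_;
    };

    // Usage sketch:
    //   DriverRegistry::Get().Register("pod://", [](const std::string& w) {
    //     return std::unique_ptr<MiniDriver>(new MiniDriver());
    //   });
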
explicit RecordingTpuDriver(std::unique_ptr driver, - const std::string recording_path) - : driver_(std::move(driver)), recording_path_(recording_path) { + const std::string recording_path, + const bool flush) + : driver_(std::move(driver)), + recording_path_(recording_path), + flush_(flush) { auto file_status = tensorflow::Env::Default()->NewAppendableFile( recording_path_, &log_file_); if (!file_status.ok()) { @@ -466,6 +469,7 @@ class RecordingTpuDriver : public TpuDriver { private: std::unique_ptr driver_; const std::string recording_path_; + const bool flush_; std::unique_ptr log_file_; @@ -499,6 +503,22 @@ class RecordingTpuDriver : public TpuDriver { "corrupt. Error: " << data_status.ToString(); } + + if (flush_) { + auto flush_status = log_file_->Flush(); + if (!flush_status.ok()) { + LOG(WARNING) << "Unable to flush data to log file. File possibly " + "corrupt. Error: " + << flush_status.ToString(); + } + + auto sync_status = log_file_->Sync(); + if (!sync_status.ok()) { + LOG(WARNING) << "Unable to sync log file. File possibly " + "corrupt. Error: " + << sync_status.ToString(); + } + } } } @@ -521,6 +541,7 @@ xla::StatusOr> RegisterRecordingTpuDriver( std::string file; std::string worker; + bool flush = false; for (const auto& config : configs) { std::vector kv = @@ -531,6 +552,11 @@ xla::StatusOr> RegisterRecordingTpuDriver( if (kv[0] == "worker") { worker = kv[1]; } + if (kv[0] == "flush") { + if (kv[1] == "true" || kv[1] == "1") { + flush = true; + } + } } TpuDriverConfig worker_config; @@ -541,7 +567,7 @@ xla::StatusOr> RegisterRecordingTpuDriver( auto driver = driver_status.ConsumeValueOrDie(); return std::unique_ptr( - new RecordingTpuDriver(std::move(driver), file)); + new RecordingTpuDriver(std::move(driver), file, flush)); } // To record a sequence of operations, set the worker configuration string to diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index d5977f4f0cf..f5c1c2d5fa8 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -42,6 +42,7 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/interpreter_device.h" #include "tensorflow/compiler/xla/pjrt/nvidia_gpu_device.h" #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/pjrt/tpu_client.h" #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/dlpack.h" #include "tensorflow/compiler/xla/python/jax_jit.h" @@ -75,7 +76,6 @@ namespace { namespace py = pybind11; - struct Uniquer { absl::Mutex mu; NameUniquer name_uniquer TF_GUARDED_BY(mu); @@ -171,13 +171,13 @@ class TraceMeWrapper : public tensorflow::profiler::TraceMeWrapper { void BuildProfilerSubmodule(py::module* m) { py::module profiler = m->def_submodule("profiler", "TensorFlow profiler integration"); - py::class_> + py::class_> profiler_server_class(profiler, "ProfilerServer"); profiler.def( "start_server", - [](int port) -> std::unique_ptr { - auto server = absl::make_unique(); + [](int port) -> std::unique_ptr { + auto server = absl::make_unique(); server->StartProfilerServer(port); return server; }, @@ -206,6 +206,23 @@ bool IsOptimizedBuild() { #endif // NDEBUG } +// Safe version of ShapeUtil::MakeShapeWithLayout that fails gracefully on +// invalid input. 
+StatusOr MakeShapeWithLayout( + PrimitiveType element_type, absl::Span dims, + absl::optional> minor_to_major) { + TF_ASSIGN_OR_RETURN(Shape shape, + ShapeUtil::MakeValidatedShape(element_type, dims)); + if (minor_to_major) { + *shape.mutable_layout() = LayoutUtil::MakeLayout(*minor_to_major); + TF_RETURN_IF_ERROR( + LayoutUtil::ValidateLayoutForShape(shape.layout(), shape)); + } else { + shape.clear_layout(); + } + return shape; +} + } // namespace PYBIND11_MODULE(xla_extension, m) { @@ -262,15 +279,13 @@ PYBIND11_MODULE(xla_extension, m) { .def_static( "array_shape", [](PrimitiveType type, py::object dims_seq, - absl::optional layout_seq) -> Shape { + absl::optional layout_seq) -> StatusOr { std::vector dims = IntSequenceToVector(dims_seq); if (layout_seq) { std::vector layout = IntSequenceToVector(*layout_seq); - return ShapeUtil::MakeShapeWithLayout(type, dims, layout); + return MakeShapeWithLayout(type, dims, layout); } else { - Shape shape = ShapeUtil::MakeShape(type, dims); - shape.clear_layout(); - return shape; + return MakeShapeWithLayout(type, dims, absl::nullopt); } }, "Constructs an array shape.", py::arg("type"), py::arg("dims"), @@ -278,16 +293,14 @@ PYBIND11_MODULE(xla_extension, m) { .def_static( "array_shape", [](py::dtype dtype, py::object dims_seq, - absl::optional layout_seq) -> Shape { + absl::optional layout_seq) -> StatusOr { PrimitiveType type = ValueOrThrow(DtypeToPrimitiveType(dtype)); std::vector dims = IntSequenceToVector(dims_seq); if (layout_seq) { std::vector layout = IntSequenceToVector(*layout_seq); - return ShapeUtil::MakeShapeWithLayout(type, dims, layout); + return MakeShapeWithLayout(type, dims, layout); } else { - Shape shape = ShapeUtil::MakeShape(type, dims); - shape.clear_layout(); - return shape; + return MakeShapeWithLayout(type, dims, absl::nullopt); } }, "Constructs an array shape.", py::arg("type"), py::arg("dims"), @@ -430,8 +443,13 @@ PYBIND11_MODULE(xla_extension, m) { }) .def_property( "device_assignment", - [](const CompileOptions& options) { - return options.executable_build_options.device_assignment(); + [](const CompileOptions& options) + -> absl::optional { + return options.executable_build_options.has_device_assignment() + ? 
absl::optional( + options.executable_build_options + .device_assignment()) + : absl::nullopt; }, [](CompileOptions& options, const DeviceAssignment& device_assignment) { @@ -466,32 +484,31 @@ PYBIND11_MODULE(xla_extension, m) { return local_device->client()->TransferToInfeedLocal( literal, local_device->device_ordinal()); }) - .def( - "transfer_from_outfeed", - [](const PjRtDevice& device, - const Shape& shape) -> StatusOr { - GlobalPyRefManager()->CollectGarbage(); - std::shared_ptr literal_shared; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - Shape shape_with_layout = shape; - ShapeUtil::ForEachMutableSubshape( - &shape_with_layout, [](Shape* subshape, const ShapeIndex&) { - if (!subshape->has_layout()) { - LayoutUtil::SetToDefaultLayout(subshape); - } - }); - TF_ASSIGN_OR_RETURN( - Literal literal, - local_device->client()->TransferFromOutfeedLocal( - shape_with_layout, local_device->device_ordinal())); + .def("transfer_from_outfeed", + [](const PjRtDevice& device, + const Shape& shape) -> StatusOr { + GlobalPyRefManager()->CollectGarbage(); + std::shared_ptr literal_shared; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, + device.GetLocalDeviceState()); + Shape shape_with_layout = shape; + ShapeUtil::ForEachMutableSubshape( + &shape_with_layout, [](Shape* subshape, const ShapeIndex&) { + if (!subshape->has_layout()) { + LayoutUtil::SetToDefaultLayout(subshape); + } + }); + TF_ASSIGN_OR_RETURN( + Literal literal, + local_device->client()->TransferFromOutfeedLocal( + shape_with_layout, local_device->device_ordinal())); - literal_shared = std::make_shared(std::move(literal)); - } - return LiteralToPython(std::move(literal_shared)); - }); + literal_shared = std::make_shared(std::move(literal)); + } + return LiteralToPython(std::move(literal_shared)); + }); py::class_>(m, "CpuDevice") .def("__repr__", [](const CpuDevice& device) { @@ -553,13 +570,13 @@ PYBIND11_MODULE(xla_extension, m) { m.def( "get_cpu_client", [](bool asynchronous) -> StatusOr> { - TF_ASSIGN_OR_RETURN(std::shared_ptr client, + TF_ASSIGN_OR_RETURN(std::unique_ptr client, GetCpuClient(asynchronous)); return std::make_shared(std::move(client)); }, py::arg("asynchronous") = true); m.def("get_interpreter_client", []() -> StatusOr> { - TF_ASSIGN_OR_RETURN(std::shared_ptr client, + TF_ASSIGN_OR_RETURN(std::unique_ptr client, GetInterpreterClient()); return std::make_shared(std::move(client)); }); @@ -569,7 +586,7 @@ PYBIND11_MODULE(xla_extension, m) { std::shared_ptr distributed_client, int node_id) -> StatusOr> { TF_ASSIGN_OR_RETURN( - std::shared_ptr client, + std::unique_ptr client, GetNvidiaGpuClient(asynchronous, allocator_config, std::move(distributed_client), node_id)); return std::make_shared(std::move(client)); @@ -577,6 +594,14 @@ PYBIND11_MODULE(xla_extension, m) { py::arg("asynchronous") = true, py::arg("allocator_config") = GpuAllocatorConfig(), py::arg("distributed_client") = nullptr, py::arg("node_id") = 0); + m.def( + "get_tpu_client", + [](bool asynchronous) -> StatusOr> { + TF_ASSIGN_OR_RETURN(std::shared_ptr client, + GetTpuClient(asynchronous)); + return std::make_shared(std::move(client)); + }, + py::arg("asynchronous") = true); py::class_(m, "Frame") .def_readonly("file_name", &Traceback::Frame::file_name) @@ -820,6 +845,14 @@ PYBIND11_MODULE(xla_extension, m) { hlo_module.config().debug_options(), RenderedGraphFormat::kDot); }); + m.def( + "hlo_module_cost_analysis", + 
[](PyClient* client, + const HloModule& module) -> StatusOr> { + auto analysis = client->pjrt_client()->GetHloCostAnalysis(); + TF_RETURN_IF_ERROR(module.entry_computation()->Accept(analysis.get())); + return analysis->properties(); + }); py::class_ xla_op_class(m, "XlaOp"); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 38c55c6fe5d..3de0ffcc2f8 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -90,11 +90,16 @@ def _gpu_backend_factory(distributed_client=None, node_id=0): node_id=node_id) +def _tpu_backend_factory(): + return _xla.get_tpu_client(asynchronous=True) + + # Backend factories, keyed by user-visible name, in increasing priority order. _local_backend_factories = collections.OrderedDict([ ('interpreter', _interpreter_backend_factory), ('cpu', _cpu_backend_factory), ('gpu', _gpu_backend_factory), + ('tpu', _tpu_backend_factory), ]) @@ -113,16 +118,17 @@ def _get_local_backends(): _local_backends = collections.OrderedDict() for name, factory in _local_backend_factories.items(): - logging.vlog(2, "Initializing backend '%s'" % name) + logging.vlog(1, "Initializing backend '%s'" % name) try: backend = factory() - except RuntimeError: + except RuntimeError as err: if name == 'cpu': # We always expect CPU to initialize successfully. raise else: # If the backend isn't built into the binary, or if it has no devices, # we expect a RuntimeError. + logging.vlog(1, "Error initializing backend '%s': %s" % (name, err)) continue _local_backends[name] = backend return _local_backends @@ -144,7 +150,8 @@ def get_local_backend(name=None): try: return backends[name] except KeyError: - raise RuntimeError('Unknown backend {}'.format(name)) + raise RuntimeError( + 'Unknown backend %s. Available: %s' % (name, list(backends.keys()))) return list(backends.values())[-1] @@ -191,8 +198,8 @@ XLA_ELEMENT_TYPE_TO_DTYPE = { PrimitiveType.F64: np.dtype('float64'), PrimitiveType.C64: np.dtype('complex64'), PrimitiveType.C128: np.dtype('complex128'), - PrimitiveType.TUPLE: np.dtype(np.object), - PrimitiveType.TOKEN: np.dtype(np.object), + PrimitiveType.TUPLE: np.dtype(np.object_), + PrimitiveType.TOKEN: np.dtype(np.object_), } # Note the conversion on the key. Numpy has a known issue wherein dtype hashing diff --git a/tensorflow/compiler/xla/python/xla_client_backend_independent_test.py b/tensorflow/compiler/xla/python/xla_client_backend_independent_test.py new file mode 100644 index 00000000000..180bb040cc4 --- /dev/null +++ b/tensorflow/compiler/xla/python/xla_client_backend_independent_test.py @@ -0,0 +1,147 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Backend-independent tests for the Python XLA client.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest + +from absl.testing import absltest +import numpy as np + +from tensorflow.compiler.xla.python import xla_client + +# pylint: disable=g-import-not-at-top +try: + import portpicker +except ImportError: + portpicker = None +# pylint: enable=g-import-not-at-top + +ops = xla_client.ops + + +class ShapeTest(absltest.TestCase): + + def testInvalidShapes(self): + with self.assertRaisesRegex(RuntimeError, + "shape's dimensions must not be < 0.*"): + xla_client.Shape.array_shape(xla_client.PrimitiveType.F32, [-2, 4]) + + with self.assertRaisesRegex( + RuntimeError, "layout minor_to_major field contains 1 element.*"): + xla_client.Shape.array_shape(xla_client.PrimitiveType.F32, [2, 4], [3]) + + with self.assertRaisesRegex( + RuntimeError, "layout minor_to_major field has out-of-bounds value.*"): + xla_client.Shape.array_shape(xla_client.PrimitiveType.F32, [2, 4], + [1, -1]) + + +class ComputationPrinting(absltest.TestCase): + + def ExampleComputation(self): + builder = xla_client.XlaBuilder("acomputation") + p0 = ops.Parameter(builder, 0, xla_client.shape_from_pyval(np.float32(0))) + p1 = ops.Parameter(builder, 1, + xla_client.shape_from_pyval(np.zeros((4,), np.float32))) + x = ops.Mul(p0, p1) + ops.Add(x, x) + return builder.build() + + def testComputationToHloText(self): + computation = self.ExampleComputation() + hlo_text = computation.as_hlo_text() + self.assertTrue(hlo_text.startswith("HloModule acomputation")) + + def testComputationToHloGraph(self): + computation = self.ExampleComputation() + hlo_dot_graph = computation.as_hlo_dot_graph() + self.assertTrue(hlo_dot_graph.startswith("digraph ")) + + def testHloModuleToHloText(self): + computation = self.ExampleComputation() + hlo_text = computation.as_hlo_module().to_string() + self.assertTrue(hlo_text.startswith("HloModule acomputation")) + + def testHloModuleToHloGraph(self): + computation = self.ExampleComputation() + hlo_dot_graph = xla_client._xla.hlo_module_to_dot_graph( + computation.as_hlo_module()) + self.assertTrue(hlo_dot_graph.startswith("digraph ")) + + +class ComputationHashTest(absltest.TestCase): + + def testHash(self): + builder0 = xla_client.XlaBuilder("computation0") + p0 = ops.Parameter(builder0, 0, xla_client.shape_from_pyval(np.float32(0))) + p1 = ops.Parameter(builder0, 1, + xla_client.shape_from_pyval(np.zeros((4,), np.float32))) + ops.Mul(p0, p1) + computation0 = builder0.build() + + builder1 = xla_client.XlaBuilder("computation1") + p0 = ops.Parameter(builder1, 0, xla_client.shape_from_pyval(np.float32(0))) + p1 = ops.Parameter(builder1, 1, + xla_client.shape_from_pyval(np.zeros((4,), np.float32))) + ops.Mul(p0, p1) + computation1 = builder1.build() + + self.assertEqual(computation0.hash(), computation1.hash()) + + +class AliasTest(absltest.TestCase): + + def testSetUpAlias(self): + c = xla_client.XlaBuilder(self.id()) + p1 = ops.Parameter( + c, 0, + xla_client.shape_from_pyval(np.array( + 1.0, np.float32)).with_major_to_minor_layout_if_absent()) + p2 = ops.Parameter( + c, 1, + xla_client.shape_from_pyval(np.array( + 1.0, np.float32)).with_major_to_minor_layout_if_absent()) + out = ops.Add(p1, p2) + c.setup_alias([], 0, []) + c.build(out) + + +class ProfilerTest(absltest.TestCase): + + def testTraceMe(self): + # TODO(phawkins): These tests just 
check that the TraceMe context manager + # acts like a context manager and doesn't explode. Ideally we'd check that + # the profiler saw the traceme too. + with xla_client.profiler.TraceMe("test1"): + pass + with xla_client.profiler.TraceMe("test2", foo=123): + pass + with self.assertRaises(ValueError): + with xla_client.profiler.TraceMe("test3"): + raise ValueError("test") + + @unittest.skipIf(portpicker is None, "Test requires portpicker") + def testStartServer(self): + port = portpicker.pick_unused_port() + server = xla_client.profiler.start_server(port) + del server + + +if __name__ == "__main__": + absltest.main() diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 49c57a27ac0..1f8befd79d3 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -1,4 +1,3 @@ -# Lint as: python3 # Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for the Python extension-based XLA client.""" +"""Backend-dependent tests for the Python XLA client.""" from __future__ import absolute_import from __future__ import division @@ -37,12 +36,6 @@ try: except ImportError: custom_call_for_test = None -try: - import portpicker -except ImportError: - portpicker = None -# pylint: enable=g-import-not-at-top - bfloat16 = xla_client.bfloat16 ops = xla_client.ops @@ -105,7 +98,7 @@ def TestFactory(xla_backend, cloud_tpu=False): c, arguments=(), expected=None, - rtol=1e-7, + rtol=1e-4, atol=0): self._ExecuteAndAssertWith( functools.partial(np.testing.assert_allclose, rtol=rtol, atol=atol), @@ -142,27 +135,6 @@ def TestFactory(xla_backend, cloud_tpu=False): ops.Add(x, x) return builder.build() - def testComputationToHloText(self): - computation = self.ExampleComputation() - hlo_text = computation.as_hlo_text() - self.assertTrue(hlo_text.startswith("HloModule acomputation")) - - def testComputationToHloGraph(self): - computation = self.ExampleComputation() - hlo_dot_graph = computation.as_hlo_dot_graph() - self.assertTrue(hlo_dot_graph.startswith("digraph ")) - - def testHloModuleToHloText(self): - computation = self.ExampleComputation() - hlo_text = computation.as_hlo_module().to_string() - self.assertTrue(hlo_text.startswith("HloModule acomputation")) - - def testHloModuleToHloGraph(self): - computation = self.ExampleComputation() - hlo_dot_graph = xla_client._xla.hlo_module_to_dot_graph( - computation.as_hlo_module()) - self.assertTrue(hlo_dot_graph.startswith("digraph ")) - @unittest.skipIf(cloud_tpu, "not implemented") def testCompiledHloModuleToHloText(self): computation = self.ExampleComputation() @@ -173,31 +145,15 @@ def TestFactory(xla_backend, cloud_tpu=False): self.assertTrue(hlo_text.startswith("HloModule acomputation")) self.assertIn("fusion", hlo_text) + @unittest.skipIf(cloud_tpu, "not implemented") + def testFlopEstimate(self): + computation = self.ExampleComputation() + properties = xla_client._xla.hlo_module_cost_analysis( + self.backend, computation.as_hlo_module()) + self.assertEqual(properties["flops"], 8.0) + tests.append(ComputationPrinting) - class ComputationHashTest(absltest.TestCase): - - def testHash(self): - builder0 = xla_client.XlaBuilder("computation0") - p0 = 
ops.Parameter(builder0, 0, - xla_client.shape_from_pyval(np.float32(0))) - p1 = ops.Parameter( - builder0, 1, xla_client.shape_from_pyval(np.zeros((4,), np.float32))) - ops.Mul(p0, p1) - computation0 = builder0.build() - - builder1 = xla_client.XlaBuilder("computation1") - p0 = ops.Parameter(builder1, 0, - xla_client.shape_from_pyval(np.float32(0))) - p1 = ops.Parameter( - builder1, 1, xla_client.shape_from_pyval(np.zeros((4,), np.float32))) - ops.Mul(p0, p1) - computation1 = builder1.build() - - self.assertEqual(computation0.hash(), computation1.hash()) - - tests.append(ComputationHashTest) - class ComputationsWithConstantsTest(ComputationTest): """Tests focusing on Constant ops.""" @@ -1894,24 +1850,6 @@ def TestFactory(xla_backend, cloud_tpu=False): tests.append(SetShardingTest) - class AliasTest(ComputationTest): - - def testSetUpAlias(self): - c = self._NewComputation() - p1 = ops.Parameter( - c, 0, - xla_client.shape_from_pyval( - NumpyArrayF32(1.0)).with_major_to_minor_layout_if_absent()) - p2 = ops.Parameter( - c, 1, - xla_client.shape_from_pyval( - NumpyArrayF32(1.0)).with_major_to_minor_layout_if_absent()) - out = ops.Add(p1, p2) - c.setup_alias([], 0, []) - c = c.build(out) - - tests.append(AliasTest) - testcase_shapes = [ (), (1,), @@ -2015,28 +1953,6 @@ def TestFactory(xla_backend, cloud_tpu=False): tests.append(BufferProtocolTest) - class ProfilerTest(absltest.TestCase): - - def testTraceMe(self): - # TODO(phawkins): These tests just check that the TraceMe context manager - # acts like a context manager and doesn't explode. Ideally we'd check that - # the profiler saw the traceme too. - with xla_client.profiler.TraceMe("test1"): - pass - with xla_client.profiler.TraceMe("test2", foo=123): - pass - with self.assertRaises(ValueError): - with xla_client.profiler.TraceMe("test3"): - raise ValueError("test") - - @unittest.skipIf(portpicker is None, "Test requires portpicker") - def testStartServer(self): - port = portpicker.pick_unused_port() - server = xla_client.profiler.start_server(port) - del server - - tests.append(ProfilerTest) - class TracebackTest(absltest.TestCase): def setUp(self): diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 6e345b06e43..15022d1a879 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -1,12 +1,14 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_grpc_cc_dependency") load( "//tensorflow:tensorflow.bzl", + "if_libtpu", "tf_cc_binary", "tf_cc_test", ) load( "//tensorflow/core/platform:build_config.bzl", - "tf_proto_library_cc", + "tf_proto_library", ) load( "//tensorflow/compiler/xla:xla.bzl", @@ -18,7 +20,7 @@ package( licenses = ["notice"], # Apache 2.0 ) -tf_proto_library_cc( +tf_proto_library( name = "xla_service_proto", srcs = ["xla_service.proto"], has_services = 1, @@ -50,13 +52,15 @@ cc_library( srcs = ["grpc_service_main.cc"], deps = [ ":grpc_service", - "//tensorflow/compiler/xla/service:cpu_plugin", + "@com_google_absl//absl/strings:str_format", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", - "@com_google_absl//absl/strings:str_format", tf_grpc_cc_dependency(), - ], + ] + if_libtpu( + if_false = ["//tensorflow/compiler/xla/service:cpu_plugin"], + if_true = [], + ), ) tf_cc_binary( diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index dd16bd32dd1..491d1d67877 100644 --- 
a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1,13 +1,22 @@ # Description: # XLA service implementation. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") load( "//tensorflow/core/platform:build_config.bzl", - "tf_proto_library_cc", - "tf_proto_library_py", + "tf_proto_library", ) load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "internal_hlo_deps") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "internal_cuda_deps") load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", @@ -30,7 +39,7 @@ package_group( packages = ["//learning/brain/experimental/tf_runtime/..."], ) -tf_proto_library_cc( +tf_proto_library( name = "hlo_proto", srcs = ["hlo.proto"], cc_api_version = 2, @@ -38,20 +47,13 @@ tf_proto_library_cc( visibility = ["//visibility:public"], ) -tf_proto_library_py( - name = "hlo_proto", # bzl adds a _py suffix only to the OSS target. - srcs = ["hlo.proto"], - visibility = ["//visibility:public"], - deps = ["//tensorflow/compiler/xla:xla_data_proto_py"], -) - -tf_proto_library_cc( +tf_proto_library( name = "hlo_profile_printer_data", srcs = ["hlo_profile_printer_data.proto"], cc_api_version = 2, ) -tf_proto_library_cc( +tf_proto_library( name = "hlo_execution_profile_data", srcs = ["hlo_execution_profile_data.proto"], cc_api_version = 2, @@ -83,6 +85,7 @@ cc_library( deps = [ ":bfloat16_support", ":hlo", + ":hlo_dataflow_analysis", ":hlo_pass", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -201,7 +204,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/strings", ], ) @@ -446,9 +449,9 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", - "//tensorflow/core:human_readable_json", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:human_readable_json", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -477,6 +480,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/types:optional", ], ) @@ -876,7 +880,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/strings", ], ) @@ -896,7 +900,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_map", @@ -946,7 +950,7 @@ cc_library( "//tensorflow/compiler/xla:xla_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:ptr_util", - 
"//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -982,7 +986,7 @@ cc_library( "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1010,7 +1014,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/strings", ], ) @@ -1021,7 +1025,7 @@ cc_library( ":service", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", "//tensorflow/compiler/xla/service/cpu:cpu_transfer_manager", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -1054,14 +1058,14 @@ cc_library( ":service", "//tensorflow/compiler/xla/service/gpu:gpu_compiler", "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ] + if_cuda_is_configured([ "//tensorflow/compiler/xla/service/gpu:nvptx_compiler", "//tensorflow/core/platform/default/build_config:stream_executor_cuda", ]) + if_rocm_is_configured([ "//tensorflow/compiler/xla/service/gpu:amdgpu_compiler", "//tensorflow/core/platform/default/build_config:stream_executor_rocm", - ]), + ]) + internal_cuda_deps(), ) cc_library( @@ -1069,10 +1073,10 @@ cc_library( deps = [ ":service", "//tensorflow/compiler/xla/service/gpu:gpu_transfer_manager", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ] + if_cuda_is_configured([ "//tensorflow/compiler/xla/service/mlir_gpu:mlir_compiler_impl", - ]), + ]) + internal_cuda_deps(), ) cc_library( @@ -1082,7 +1086,7 @@ cc_library( "//tensorflow/compiler/xla/service/interpreter:compiler", "//tensorflow/compiler/xla/service/interpreter:interpreter_transfer_manager", "//tensorflow/compiler/xla/service/interpreter:platform", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -1100,7 +1104,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", @@ -1122,8 +1126,8 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:ptr_util", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", ], @@ -1146,6 +1150,10 @@ cc_library( ":maybe_owning_device_memory", ":shaped_buffer", ":stream_pool", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:span", + "@com_google_absl//absl/types:variant", 
"//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:shape_tree", @@ -1156,15 +1164,11 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor", "//tensorflow/stream_executor:device_description", "//tensorflow/stream_executor:device_memory_allocator", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:span", - "@com_google_absl//absl/types:variant", - ], + ] + internal_hlo_deps(), ) cc_library( @@ -1184,7 +1188,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/types:span", ], ) @@ -1217,7 +1221,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1258,7 +1262,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", ], ) @@ -1684,6 +1688,7 @@ cc_library( hdrs = ["multi_output_fusion.h"], deps = [ ":hlo", + ":hlo_dataflow_analysis", ":hlo_dce", ":hlo_pass", ":hlo_reachability", @@ -1703,7 +1708,6 @@ cc_library( srcs = ["hlo_creation_utils.cc"], hdrs = [ "hlo_creation_utils.h", - "//tensorflow/compiler/xla:literal_util", ], deps = [ ":hlo", @@ -1938,6 +1942,29 @@ cc_library( ], ) +cc_library( + name = "qr_expander", + srcs = ["qr_expander.cc"], + hdrs = ["qr_expander.h"], + deps = [ + ":op_expander_pass", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/lib:constants", + "//tensorflow/compiler/xla/client/lib:loops", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/client/lib:matrix", + "//tensorflow/compiler/xla/client/lib:slicing", + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", + ], +) + cc_library( name = "convolution_4d_expander", srcs = ["convolution_4d_expander.cc"], @@ -2340,10 +2367,13 @@ cc_library( ":call_inliner", ":hlo", ":hlo_casting_utils", + ":hlo_cse", ":hlo_dce", ":hlo_pass", ":hlo_pass_pipeline", + ":hlo_verifier", ":tuple_simplifier", + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -2412,6 +2442,42 @@ tf_cc_test( ], ) +cc_library( + name = "space_to_batch_converter", + srcs = ["space_to_batch_converter.cc"], + hdrs = ["space_to_batch_converter.h"], + deps = [ + ":hlo", + ":hlo_creation_utils", + ":hlo_pass", + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:literal_util", + 
"//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core:lib", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "space_to_batch_converter_test", + size = "small", + srcs = ["space_to_batch_converter_test.cc"], + deps = [ + ":hlo", + ":hlo_matchers", + ":space_to_batch_converter", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + ], +) + cc_library( name = "while_loop_analysis", srcs = ["while_loop_analysis.cc"], @@ -2789,7 +2855,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -2824,7 +2890,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], alwayslink = True, # Contains per-platform transfer manager registration ) @@ -2887,7 +2953,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/memory", ], @@ -3425,6 +3491,8 @@ cc_library( hdrs = ["memory_space_assignment_utils.h"], deps = [ ":heap_simulator", + ":hlo", + ":hlo_casting_utils", ], ) @@ -4156,7 +4224,7 @@ cc_library( "//tensorflow/compiler/xla:xla_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -4259,7 +4327,7 @@ cc_library( deps = [ "//tensorflow/compiler/xla:types", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", ], ) @@ -4271,7 +4339,7 @@ tf_cc_test( ":stream_pool", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -4328,7 +4396,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", @@ -4677,7 +4745,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/base", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 
214cbfa93a7..76b0236fcdd 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -913,7 +913,7 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) && Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) && (ShapeUtil::ElementIsIntegral(add->shape()) || - IsAllFpConstantPowerOf2(c))) { + options_.enable_floats_are_real() || IsAllFpConstantPowerOf2(c))) { return ReplaceWithNewInstruction( add, HloInstruction::CreateBinary( add->shape(), HloOpcode::kMultiply, @@ -1300,7 +1300,15 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( auto replacement = computation_->AddInstruction(concatenate->CloneWithNewOperands( concatenate->shape(), new_operands)); - ReplaceInstructionIfSameShape(concatenate, replacement); + + // Recurse to handle multiple disjoint sequence of inputs. The + // logic above merge only 1 sequential series of + // inputs. Otherwise, it can lead to the FixPass optimization + // hitting its threshold. + if (ReplaceInstructionIfSameShape(concatenate, replacement)) { + return HandleConcatenate(replacement); + } + return Status::OK(); } } @@ -2702,6 +2710,17 @@ Status AlgebraicSimplifierVisitor::HandleMultiply(HloInstruction* multiply) { return Status::OK(); } + { + HloInstruction* abs_operand; + if (lhs == rhs && Match(lhs, m::Abs(m::Op(&abs_operand))) && + !ShapeUtil::ElementIsComplex(abs_operand->shape())) { + TF_RETURN_IF_ERROR(multiply->ReplaceOperandWith(0, abs_operand)); + TF_RETURN_IF_ERROR(multiply->ReplaceOperandWith(1, abs_operand)); + changed_ = true; + return Status::OK(); + } + } + { HloInstruction *convert_operand, *operand; // Mul(Convert(Pred), operand) => select(pred, operand, 0) @@ -3303,6 +3322,9 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { // padding with a pad with non-negative padding followed by a slice. bool all_zero = true; bool has_negative = false; + // Used to possibly split off the unchanged padding dimensions. + std::vector padding_dimensions; + int64 dimension_index = 0; for (auto& padding_dimension : pad->padding_config().dimensions()) { if (padding_dimension.edge_padding_low() < 0 || padding_dimension.edge_padding_high() < 0) { @@ -3311,12 +3333,93 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { if (padding_dimension.edge_padding_low() != 0 || padding_dimension.edge_padding_high() != 0) { all_zero = false; + padding_dimensions.push_back(dimension_index); + } else if (padding_dimension.interior_padding()) { + padding_dimensions.push_back(dimension_index); } + dimension_index++; } if (all_zero) { - ReplaceInstructionIfSameShape(pad, pad->mutable_operand(0)); - return Status::OK(); + if (ReplaceInstructionIfSameShape(pad, pad->mutable_operand(0))) { + return Status::OK(); + } + } + + // The context of this optimization can be found at b/163617402 + // It tries to capture the case of pad(broadcast(x)), where + // x->shape().dimensions(), or broadcast(x)->dimensions(), is + // a subset of the padded dimensions in pad->config(), + // and the padded dimensions in pad->config() is in turn a strict + // subset of broadcast->shape().dimensions(). The combined op can be + // rewritten to broadcast2(pad(broadcast1(x))), where broadcast1 extends + // x with dimensions that need to be padded, and broadcast2 extends + // the result of padding to full dimensions. 
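
Concretely, the rewrite described in the comment above matches the BroadcastAndPadReorder test added later in this change: when the pad only touches dimensions that the broadcast itself created, the pad can be pushed onto the small pre-broadcast shape. Illustrative before/after HLO (shapes taken from that test; the intermediate names are made up):

    // Before:
    b2 = pred[32,1,768]   broadcast(pred[] c1), dimensions={}
    p4 = pred[4096,1,768] pad(b2, pred[] c3), padding=0_4064x0_0x0_0

    // After (the pad now runs on 32 elements instead of 32*768):
    b1 = pred[32]         broadcast(pred[] c1), dimensions={}
    p  = pred[4096]       pad(b1, pred[] c3), padding=0_4064
    b3 = pred[4096,1,768] broadcast(p), dimensions={0}
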
+ // TODO(qyi): for future extensions: The condition for broadcast(x) + // ->dimensions() to be a subset of padded dimensions in pad->config() + // does not have to be strictly required, but it makes the calculation + // for optimization easier, so it is required by the current implementation. + // Only the second condition between the padded dimensions and the + // dimensions of the final shape have to be enforced for the optimization + // to make sense. If needed to remove the first constraint, the shape + // calculations across the implementation need to be re-adjusted. + auto pad_dims = padding_dimensions.size(); + if (pad_dims < dimension_index && + pad->operand(0)->opcode() == HloOpcode::kBroadcast && + pad->operand(0)->user_count() == 1 && + pad->operand(0)->operand(0)->shape().rank() <= pad_dims) { + // Check broadcast operand dimensions is a subset of pading_dimensions. + // If not, skip the optimization. + bool opt_is_valid = true; + std::vector broadcast_dimensions; + HloBroadcastInstruction* broadcast = + static_cast(pad->mutable_operand(0)); + for (auto broadcast_index : broadcast->dimensions()) { + bool found = false; + for (int i = 0; i < pad_dims; ++i) { + if (broadcast_index == padding_dimensions[i]) { + broadcast_dimensions.push_back(i); + found = true; + break; + } + } + if (!found) { + opt_is_valid = false; + break; + } + } + if (opt_is_valid) { + auto pad_shape = pad->shape(); + auto broadcast_shape = broadcast->shape(); + auto pad_shape1 = pad_shape; + auto broadcast_shape1 = broadcast_shape; + PaddingConfig pad_config; + for (int i = padding_dimensions.size() - 1; i >= 0; --i) { + int64 j = padding_dimensions[i]; + while (--dimension_index > j) { + broadcast_shape1.DeleteDimension(dimension_index); + pad_shape1.DeleteDimension(dimension_index); + } + } + while (--dimension_index >= 0) { + broadcast_shape1.DeleteDimension(dimension_index); + pad_shape1.DeleteDimension(dimension_index); + } + for (auto dimension_to_pad : padding_dimensions) { + auto dimension = pad_config.add_dimensions(); + *dimension = pad->padding_config().dimensions(dimension_to_pad); + } + *broadcast->mutable_shape() = broadcast_shape1; + *broadcast->mutable_dimensions() = broadcast_dimensions; + simplifier_->UpdateLayout(broadcast->mutable_shape()); + auto pad2 = + computation_->AddInstruction(pad->CloneWithNewShape(pad_shape1)); + *pad2->mutable_padding_config() = pad_config; + simplifier_->UpdateLayout(pad2->mutable_shape()); + auto broadcast2 = computation_->AddInstruction( + HloInstruction::CreateBroadcast(pad_shape, pad2, padding_dimensions)); + return ReplaceInstruction(pad, broadcast2); + } } if (has_negative) { @@ -3351,7 +3454,8 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) { pad->shape(), nonzero_pad->mutable_shape())); simplifier_->UpdateLayout(nonzero_pad->mutable_shape()); - // Second, construct the slice instruction to perform the negative padding. + // Second, construct the slice instruction to perform the negative + // padding. 
std::vector start_indices; std::vector end_indices; std::vector strides; @@ -4012,8 +4116,10 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { new_limits[i] -= low; } if (slice_in_padding) { - return ReplaceInstruction( - slice, MakeBroadcastHlo(pad->mutable_operand(1), {}, slice->shape())); + HloInstruction* broadcast = + MakeBroadcastHlo(pad->mutable_operand(1), {}, slice->shape()); + *(broadcast->mutable_shape()) = slice->shape(); + return ReplaceInstruction(slice, broadcast); } if (slice_undoes_pad && ReplaceInstructionIfSameShape(slice, pad_operand)) { return Status::OK(); @@ -4022,6 +4128,7 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { TF_ASSIGN_OR_RETURN(HloInstruction * new_slice, MakeSliceHlo(pad_operand, new_starts, new_limits, slice->slice_strides())); + *(new_slice->mutable_shape()) = slice->shape(); return ReplaceInstruction(slice, new_slice); } } @@ -4085,9 +4192,18 @@ Status AlgebraicSimplifierVisitor::HandleSlice(HloInstruction* slice) { VLOG(3) << "Sink broadcast through slice"; VLOG(3) << "Original slice: " << slice->ToString(); VLOG(3) << "Original broadcast: " << broadcast->ToString(); - TF_ASSIGN_OR_RETURN(auto new_slice, - MakeSliceHlo(broadcast_operand, new_slice_starts, - new_slice_limits, new_slice_strides)); + auto new_slice_shape = broadcast_operand->shape(); + for (int64 i = 0; i < broadcast_operand->shape().rank(); ++i) { + int64 size_i = (new_slice_limits[i] - new_slice_starts[i] + + new_slice_strides[i] - 1) / + new_slice_strides[i]; + new_slice_shape.set_dimensions(i, size_i); + } + simplifier_->UpdateLayout(&new_slice_shape); + HloComputation* computation = broadcast_operand->parent(); + auto new_slice = computation->AddInstruction(HloInstruction::CreateSlice( + new_slice_shape, broadcast_operand, new_slice_starts, new_slice_limits, + new_slice_strides)); auto new_broadcast = HloInstruction::CreateBroadcast( slice->shape(), new_slice, broadcast->dimensions()); VLOG(3) << "New slice: " << slice->ToString(); @@ -4187,9 +4303,15 @@ Status AlgebraicSimplifierVisitor::HandleDynamicSlice( VLOG(3) << "Original broadcast: " << operand->ToString(); HloInstruction* new_dynamic_slice = broadcast_operand; if (!new_slice_sizes.empty()) { - TF_ASSIGN_OR_RETURN( - new_dynamic_slice, - MakeDynamicSliceHlo(broadcast_operand, new_indices, new_slice_sizes)); + auto new_ds_shape = broadcast_operand->shape(); + for (int64 i = 0; i < broadcast_operand->shape().rank(); ++i) { + new_ds_shape.set_dimensions(i, new_slice_sizes[i]); + } + simplifier_->UpdateLayout(&new_ds_shape); + HloComputation* computation = broadcast_operand->parent(); + new_dynamic_slice = + computation->AddInstruction(HloInstruction::CreateDynamicSlice( + new_ds_shape, broadcast_operand, new_indices, new_slice_sizes)); } auto new_broadcast = HloInstruction::CreateBroadcast( dynamic_slice->shape(), new_dynamic_slice, operand->dimensions()); @@ -5167,10 +5289,10 @@ StatusOr AlgebraicSimplifierVisitor::SwapConvOperands( if (!reverse_dimensions.empty()) { TF_ASSIGN_OR_RETURN(kernel, MakeReverseHlo(kernel, reverse_dimensions)); } - TF_ASSIGN_OR_RETURN( - HloInstruction * new_convolution, - MakeConvolveHlo(kernel, input, /*feature_group_count=*/1, swapped_window, - swapped_dnums, precision_config)); + TF_ASSIGN_OR_RETURN(HloInstruction * new_convolution, + MakeConvolveHlo(kernel, input, /*feature_group_count=*/1, + /*batch_group_count=*/1, swapped_window, + swapped_dnums, precision_config)); convolution->SetupDerivedInstruction(new_convolution); 
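
The new slice shape computed above uses the standard ceiling-division rule for strided slices, size_i = (limit - start + stride - 1) / stride. A tiny self-contained check of that formula (SliceOutputDim is an invented helper, not an XLA API):

    #include <cassert>
    #include <cstdint>

    // Number of elements produced by a strided slice over [start, limit) with
    // the given stride: ceil((limit - start) / stride).
    int64_t SliceOutputDim(int64_t start, int64_t limit, int64_t stride) {
      return (limit - start + stride - 1) / stride;
    }

    int main() {
      assert(SliceOutputDim(0, 10, 1) == 10);    // Keeps every element.
      assert(SliceOutputDim(2, 10, 3) == 3);     // Elements 2, 5, 8.
      assert(SliceOutputDim(50, 100, 1) == 50);  // Matches the {50, ...} shapes
                                                 // used in the tests below.
      return 0;
    }
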
TF_RETURN_IF_ERROR(ReplaceInstruction(convolution, new_convolution)); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h index 9f2a3404116..cabecec4eb8 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.h +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h @@ -97,6 +97,14 @@ class AlgebraicSimplifierOptions { return enable_scalar_multiply_reduction_; } + // Also the algebraic simplifer to treat floating point values like real + // numbers. + void set_enable_floats_are_real(bool enable_floats_are_real) { + enable_floats_are_real_ = enable_floats_are_real; + } + + bool enable_floats_are_real() const { return enable_floats_are_real_; } + // If enable_window_reduce_replacement is true, the kReduceWindow instruction // can be optimized by replacement with simpler operations. void set_enable_window_reduce_to_reduce_replacement( @@ -158,6 +166,7 @@ class AlgebraicSimplifierOptions { bool enable_conv_simplification_{true}; bool enable_conv_operand_swap_{true}; bool enable_scalar_multiply_reduction_{false}; + bool enable_floats_are_real_{false}; bool enable_window_reduce_to_reduce_replacement_{true}; bool enable_reduce_of_reshape_{true}; bool replace_transpose_with_bitcast_{true}; diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 70147f6ecad..c4f3ea4087b 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -117,6 +117,22 @@ TEST_F(AlgebraicSimplifierTest, FactorFpAddition) { m::ConstantScalar(0.125)))); } +// (Abs(A)) * (Abs(A)) => (A*A) +TEST_F(AlgebraicSimplifierTest, SquareOfAbs) { + const char* kModuleStr = R"( + HloModule m + test { + p = f32[] parameter(0) + a = f32[] abs(p) + ROOT z = f32[] multiply(a, a) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0)))); +} + // (A*C1) * (B*C2) => (A*B)*(C1*C2) TEST_F(AlgebraicSimplifierTest, MultiplyChain) { const char* kModuleStr = R"( @@ -2319,7 +2335,7 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) { TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { auto m = CreateNewVerifiedModule(); Shape r2f32 = ShapeUtil::MakeShape(F32, {100, 99}); - Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 80}); + Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 90}); HloComputation::Builder builder(TestName()); HloInstruction* param0 = builder.AddInstruction( HloInstruction::CreateParameter(0, r2f32, "param0")); @@ -2366,10 +2382,15 @@ TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { HloInstruction* slice7 = builder.AddInstruction(HloInstruction::CreateSlice( ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 79}, /*limit_indices=*/{100, 89}, /*strides=*/{1, 1})); + // Can merge 'slice7' and 'slice8'. 
+ HloInstruction* slice8 = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 89}, + /*limit_indices=*/{100, 99}, /*strides=*/{1, 1})); builder.AddInstruction(HloInstruction::CreateConcatenate( concat_shape, - {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7}, 1)); + {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7, slice8}, + 1)); auto computation = m->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(default_options_); @@ -2384,6 +2405,12 @@ TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) { ShapeUtil::Equal(computation->root_instruction()->operand(3)->shape(), ShapeUtil::MakeShape(F32, {50, 30}))); EXPECT_EQ(computation->root_instruction()->operand(3)->slice_starts(1), 40); + + // The operand 6 should be merge of 'slice7' and 'slice8', so its + // shape should have dimensions {50, 20} + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->operand(5)->shape(), + ShapeUtil::MakeShape(F32, {50, 20}))); } // Test that a simplification which changes layouts is not performed if layout @@ -6955,5 +6982,57 @@ TEST_F(AlgebraicSimplifierTest, UnaryVariadicReduce) { GmockMatch(m::Add(m::Parameter(0), m::Parameter(1)))); } +TEST_F(AlgebraicSimplifierTest, BroadcastAndPadReorder) { + const char* kModuleStr = R"( + HloModule m + test { + c1 = pred[] constant(true) + b2 = pred[32,1,768]{2,1,0} broadcast(pred[] c1), dimensions={} + c3 = pred[] constant(false) + ROOT p4 = pred[4096,1,768]{2,1,0} pad(pred[32,1,768]{2,1,0} b2, pred[] c3), padding=0_4064x0_0x0_0 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast( + m::Pad(m::Broadcast(m::Constant()), m::Constant())))); +} + +TEST_F(AlgebraicSimplifierTest, BroadcastAndPadReorderWithUse) { + const char* kModuleStr = R"( + HloModule m + test { + c1 = pred[] constant(true) + b2 = pred[1,768,32]{2,1,0} broadcast(pred[] c1), dimensions={} + c3 = pred[] constant(false) + p4 = pred[1,768,4096]{2,1,0} pad(pred[1,768,32]{2,1,0} b2, pred[] c3), padding=0_0x0_0x0_4064 + ROOT p5 = (pred[1,768,4096]{2,1,0}) tuple(pred[1,768,4096]{2,1,0} p4) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Tuple(m::Broadcast( + m::Pad(m::Broadcast(m::Constant()), m::Constant()))))); +} + +TEST_F(AlgebraicSimplifierTest, BroadcastAndPadReorderWithNonScalar) { + const char* kModuleStr = R"( + HloModule m + test { + c1 = pred[32] parameter(0) + b2 = pred[1,768,32]{2,1,0} broadcast(pred[32] c1), dimensions={2} + c3 = pred[] constant(false) + p4 = pred[1,768,4096]{2,1,0} pad(pred[1,768,32]{2,1,0} b2, pred[] c3), padding=0_0x0_0x0_4064 + ROOT p5 = (pred[1,768,4096]{2,1,0}) tuple(pred[1,768,4096]{2,1,0} p4) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Tuple(m::Broadcast( + m::Pad(m::Broadcast(m::Parameter()), m::Constant()))))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier.cc 
b/tensorflow/compiler/xla/service/all_reduce_simplifier.cc index 541006f04d5..18a0fdc1a70 100644 --- a/tensorflow/compiler/xla/service/all_reduce_simplifier.cc +++ b/tensorflow/compiler/xla/service/all_reduce_simplifier.cc @@ -31,27 +31,7 @@ StatusOr AllReduceSimplifier::Run(HloModule* module) { TF_ASSIGN_OR_RETURN( auto replication, HloReplicationAnalysis::Run(module, /*cross_partition_spmd=*/false)); - std::vector all_reduces_to_replace; - for (auto computation : module->computations()) { - for (HloInstruction* inst : computation->MakeInstructionPostOrder()) { - if (!inst->shape().IsArray()) { - // We currently do not change tuple-shaped all-reduce. - // Until XLA will support Token fed AllReduce(), the PyTorch client code - // uses a fake data token (constant) which relies on this pass to not - // optimize out (being fed within a tuple input). - continue; - } - if (inst->IsCrossReplicaAllReduce() && - replication->HloInstructionIsReplicatedAt(inst->operand(0), {})) { - all_reduces_to_replace.push_back(inst); - } - } - } - - bool changed = false; - if (all_reduces_to_replace.empty()) { - return changed; - } + std::vector> all_reduces_to_replace; // Returns the size of a replica group if all groups have the same size, or -1 // if they have different sizes. @@ -71,7 +51,40 @@ StatusOr AllReduceSimplifier::Run(HloModule* module) { return replica_group_size; }; - for (auto all_reduce : all_reduces_to_replace) { + for (auto computation : module->computations()) { + for (HloInstruction* inst : computation->MakeInstructionPostOrder()) { + if (!inst->shape().IsArray()) { + // We currently do not change tuple-shaped all-reduce. + // Until XLA will support Token fed AllReduce(), the PyTorch client code + // uses a fake data token (constant) which relies on this pass to not + // optimize out (being fed within a tuple input). 
+ continue; + } + if (!inst->IsCrossReplicaAllReduce()) { + continue; + } + int64 group_size = get_replica_group_size(inst); + if (group_size == -1) { + continue; + } + if (replication->HloInstructionIsReplicatedAt(inst->operand(0), {}) || + group_size == 1) { + all_reduces_to_replace.push_back({inst, group_size}); + } + } + } + + bool changed = false; + + for (auto all_reduce_and_group_size : all_reduces_to_replace) { + auto all_reduce = all_reduce_and_group_size.first; + const int64 replica_group_size = all_reduce_and_group_size.second; + if (replica_group_size == 1) { + TF_RETURN_IF_ERROR(all_reduce->parent()->ReplaceInstruction( + all_reduce, all_reduce->mutable_operand(0))); + changed = true; + continue; + } if (all_reduce->to_apply()->instruction_count() != 3 || all_reduce->to_apply()->num_parameters() != 2) { continue; @@ -79,10 +92,6 @@ StatusOr AllReduceSimplifier::Run(HloModule* module) { HloInstruction* replacement; switch (all_reduce->to_apply()->root_instruction()->opcode()) { case HloOpcode::kAdd: { - int64 replica_group_size = get_replica_group_size(all_reduce); - if (replica_group_size == -1) { - continue; - } // Create the multiplier: // broadcast(convert_to_matching_type(s32 group size)) auto multiplier = diff --git a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc index 4914836b34a..1e938594cc3 100644 --- a/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/all_reduce_simplifier_test.cc @@ -167,5 +167,30 @@ test { m::Parameter(0), m::AllReduce(m::Parameter(1))))); } +TEST_F(AllReduceSimplifierTest, TrivialSubgroupAllReduce) { + const char* kModuleStr = R"( +HloModule m + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add.2 = f32[] add(a, b) +} + + +test { + p0 = f32[8,16] parameter(0), parameter_replication={false} + ROOT all-reduce = f32[8,16] all-reduce(p0), + replica_groups={{0},{1},{2},{3},{4},{5},{6},{7}}, + to_apply=sum +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule( + kModuleStr, /*replica_count=*/8)); + AllReduceSimplifier simplifier(/*replica_count=*/8); + EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Parameter(0))); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index cc501161ce9..19927ae1576 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -143,13 +143,10 @@ StatusOr> AllocationTracker::DeconstructTuple( // We only need to care about replica id 0 here, since the GlobalDataHandle is // the same for all buffers across replicas. const ShapedBuffer* shaped_buffer = replicated_buffers[0]; - if (!shaped_buffer->on_host_shape().IsTuple()) { + if (!shaped_buffer->on_device_shape().IsTuple()) { return InvalidArgument("global data handle %d is not a tuple", data.handle()); } - // If the on-host representation is a tuple, then the on-device one should be - // as well. 
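Returning to the all_reduce_simplifier change above: an all-reduce whose replica groups all have size one combines nothing and can simply be replaced by its operand, which is what the TrivialSubgroupAllReduce test checks. Below is a minimal sketch of the group-size check in plain C++ over a hypothetical vector-of-groups representation; CommonGroupSize is an illustrative name, not the pass's API.

#include <cassert>
#include <cstdint>
#include <vector>

// Returns the common size of all replica groups, or -1 if they differ.
// Mirrors the intent of get_replica_group_size in the pass above.
int64_t CommonGroupSize(const std::vector<std::vector<int64_t>>& groups,
                        int64_t replica_count) {
  if (groups.empty()) {
    return replica_count;  // no explicit groups: one group with every replica
  }
  int64_t size = groups[0].size();
  for (const auto& g : groups) {
    if (static_cast<int64_t>(g.size()) != size) return -1;
  }
  return size;
}

int main() {
  // {{0},{1},...,{7}} as in the TrivialSubgroupAllReduce test: every group has
  // size 1, so the all-reduce can be replaced by its operand.
  std::vector<std::vector<int64_t>> groups = {{0}, {1}, {2}, {3},
                                              {4}, {5}, {6}, {7}};
  assert(CommonGroupSize(groups, /*replica_count=*/8) == 1);
  return 0;
}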
- TF_RET_CHECK(shaped_buffer->on_device_shape().IsTuple()); if (ShapeUtil::IsNestedTuple(shaped_buffer->on_device_shape())) { return Unimplemented("Deconstructing nested tuples is not implemented."); @@ -160,7 +157,6 @@ StatusOr> AllocationTracker::DeconstructTuple( i < ShapeUtil::TupleElementCount(shaped_buffer->on_device_shape()); ++i) { auto element_buffer = ShapedBuffer( - ShapeUtil::GetTupleElementShape(shaped_buffer->on_host_shape(), i), ShapeUtil::GetTupleElementShape(shaped_buffer->on_device_shape(), i), shaped_buffer->platform(), shaped_buffer->device_ordinal()); element_buffer.set_buffer(shaped_buffer->buffer(/*index=*/{i}), diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc index 23d2a9225a8..73210e6b3dc 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -159,19 +160,20 @@ Status BFloat16ConversionFoldingVisitor::TryFoldBF16Conversions( Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) { // Do not fold BF16 conversions for instructions related to tuples, entry and - // exit of a computation, fusion, convert, side-effecting instructions and - // control flow. - if (hlo->opcode() == HloOpcode::kTuple || // - hlo->opcode() == HloOpcode::kGetTupleElement || // - hlo->opcode() == HloOpcode::kConstant || // - hlo->opcode() == HloOpcode::kParameter || // - hlo->opcode() == HloOpcode::kFusion || // - hlo->opcode() == HloOpcode::kBitcastConvert || // - hlo->opcode() == HloOpcode::kConvert || // - hlo->opcode() == HloOpcode::kCall || // - hlo->opcode() == HloOpcode::kCustomCall || // - hlo->opcode() == HloOpcode::kWhile || // - hlo->opcode() == HloOpcode::kConditional || // + // exit of a computation, fusion, convert, side-effecting instructions, + // in-place operations and control flow. + if (hlo->opcode() == HloOpcode::kTuple || // + hlo->opcode() == HloOpcode::kGetTupleElement || // + hlo->opcode() == HloOpcode::kConstant || // + hlo->opcode() == HloOpcode::kParameter || // + hlo->opcode() == HloOpcode::kFusion || // + hlo->opcode() == HloOpcode::kBitcastConvert || // + hlo->opcode() == HloOpcode::kConvert || // + hlo->opcode() == HloOpcode::kCall || // + hlo->opcode() == HloOpcode::kCustomCall || // + hlo->opcode() == HloOpcode::kWhile || // + hlo->opcode() == HloOpcode::kConditional || // + HloDataflowAnalysis::IsInPlaceOperation(hlo->opcode()) || // hlo->HasSideEffectNoRecurse()) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index a0fe0eaa1d9..f9e19493a86 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -598,6 +598,31 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( type = F32; break; } + // In order to find aliases due to in-place operations, use + // GetInPlaceInputOutputPairs. 
Ideally, we'd use HloAliasAnalysis here, + // but this code works with HloModules that aren't ready yet to use + // HloAliasAnalysis (e.g., their computation graphs may not have been + // flattened yet). + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(hlo)) { + if (operand_and_output_index.second == index) { + const HloUse& operand = operand_and_output_index.first; + for (const auto* value : + dataflow_ + ->GetValueSet(hlo->operand(operand.operand_number), + operand.operand_index) + .values()) { + auto value_type = ValueTypeAfterChange(value); + if (value_type == BF16) { + continue; + } + CHECK_EQ(value_type, F32); + type = F32; + break; + } + } + } + // It's possible that a user has been changed from BF16 to F32 // during this final adjustment pass, so we need to check // AllUsersConsumeBF16() again. diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 02d79025f1b..9a898833373 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -1156,4 +1156,30 @@ ENTRY entry { EXPECT_FALSE(PropagatePrecision(module.get())); } +TEST_F(BFloat16PropagationTest, DynamicUpdateSlice) { + // This test is crafted so that the DUS has an f32 input (due to parameter) + // and bf16 output (due to dot). But we should enforce DUS operand 0 and + // output to get the same precision since it's an in-place operation. + const string module_str = R"( +HloModule Module + +ENTRY main { + param = f32[128,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + dynamic-update-slice = f32[128,128] dynamic-update-slice(param, broadcast.6, constant.3, constant.3) + ROOT dot = f32[128,128] dot(dynamic-update-slice, dynamic-update-slice), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_str)); + EXPECT_FALSE(PropagatePrecision(module.get())); + + HloInstruction* dus = module->entry_computation()->GetInstructionWithName( + "dynamic-update-slice"); + EXPECT_FALSE(OutputsBF16(dus)); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index a0989d5765e..db34f054f35 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -1007,102 +1007,6 @@ bool BufferAssigner::MaybeAssignBuffer(BufferAllocation* allocation, return true; } // namespace xla -Status BufferAssigner::MergeInplaceOpBuffers(BufferAssignment* assignment) { - // Try allocate same buffer for dynamic update slice's operand and output. - - // If memory_space_assignment is run and there is information about a color in - // preset assignments, don't merge those buffers. We expect - // memory_space_assignment to have merged these buffers. If - // memory_space_assignment didn't merge these buffers and have assigned - // different offsets to the operand and the output buffer, merging the buffers - // can cause memory corruption if memory_space_assignment assigned a different - // buffer at the same offset. 
- absl::flat_hash_set excluded_colors; - if (preset_assignments_) { - for (const auto& color_and_info : - preset_assignments_->assignment_informations()) { - excluded_colors.insert(color_and_info.first); - } - } - - // TODO(yunxing): Moving this logic to alias analysis and add must-alias rule - // to operations that can be done in place. - for (HloComputation* computation : assignment->module().computations()) { - for (HloInstruction* instruction : computation->instructions()) { - if (!(instruction->opcode() == HloOpcode::kDynamicUpdateSlice || - (instruction->opcode() == HloOpcode::kFusion && - (instruction->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice)))) { - continue; - } - if (instruction->parent()->IsFusionComputation()) { - continue; - } - if (instruction->operand_count() == 0) { - continue; - } - - // The operand can't share the same buffer with the user based on dataflow - // analysis. - if (!assignment->dataflow_analysis().CanShareOperandBufferWithUser( - instruction->mutable_operand(0), {}, instruction, {})) { - continue; - } - HloBuffer& instruction_buffer = - assignment->alias_analysis().GetUniqueBufferAt(instruction, {}); - - HloBuffer& operand_buffer = - assignment->alias_analysis().GetUniqueBufferAt( - instruction->operand(0), {}); - - // The instruction or operand color is excluded because it was assigned by - // memory_space_assignment. - if (excluded_colors.contains(instruction_buffer.color()) || - excluded_colors.contains(operand_buffer.color())) { - continue; - } - - // Already have the same buffer. No need to merge those. - if (instruction_buffer.id() == operand_buffer.id()) { - continue; - } - - // Do not perform in-place dynamic update slice if the operand buffer is - // read-only. - if (HloBufferIsReadOnly(operand_buffer)) { - continue; - } - - bool interfere = false; - - for (const HloValue* instruction_value : instruction_buffer.values()) { - for (const HloValue* operand_value : operand_buffer.values()) { - if (assignment->hlo_ordering().MayInterfere( - *instruction_value, *operand_value, - assignment->dataflow_analysis())) { - interfere = true; - break; - } - } - } - if (interfere) { - continue; - } - if (assignment->alias_analysis().BufferLivesOut(instruction_buffer)) { - continue; - } - if (instruction_buffer.color() != operand_buffer.color()) { - continue; - } - VLOG(3) << "Merging inplace " << instruction_buffer << " and " - << operand_buffer; - assignment->alias_analysis().MergeBuffers(instruction_buffer, - operand_buffer); - } - } - return Status::OK(); -} - Status BufferAssigner::AssignSingleHloBuffer( const HloBuffer* hlo_buffer, bool is_thread_local, absl::flat_hash_map> BufferAssigner::CreateAssignment( VLOG(3) << "After coloring:"; XLA_VLOG_LINES(3, assignment->alias_analysis().dataflow_analysis().ToString()); - TF_RETURN_IF_ERROR(MergeInplaceOpBuffers(assignment.get())); std::vector thread_local_computations; std::vector global_computations; diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 60422965832..dfde46ca4b1 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -635,10 +635,6 @@ class BufferAssigner { absl::flat_hash_set* assigned_buffers, BufferAssignment* assignment); - // Promotes operations (DUS, scatter) to be done in place: If an operation can - // be done in place, merge its buffer with its operand buffer. 
- Status MergeInplaceOpBuffers(BufferAssignment* assignment); - // Assigns a single hlo buffer to an HLO allocation. Status AssignSingleHloBuffer( const HloBuffer* hlo_buffer, bool is_thread_local, diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index bc024f7144b..b49ca649f9a 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -1925,8 +1925,10 @@ ENTRY main { TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(hlo_text)); HloInstruction* parameter = m->entry_computation()->GetInstructionWithName("get-tuple-element.4"); - HloInstruction* dus = + HloInstruction* dus1 = m->entry_computation()->GetInstructionWithName("dynamic-update-slice.5"); + HloInstruction* dus2 = + m->entry_computation()->GetInstructionWithName("dynamic-update-slice.9"); auto buffers = RunBufferAssignment(m.get()); @@ -1934,8 +1936,10 @@ ENTRY main { const BufferAllocation& parameter_alloc = GetTopLevelAllocation(*buffers, parameter); - const BufferAllocation& dus_alloc = GetTopLevelAllocation(*buffers, dus); - EXPECT_NE(parameter_alloc, dus_alloc); + const BufferAllocation& dus1_alloc = GetTopLevelAllocation(*buffers, dus1); + EXPECT_EQ(parameter_alloc, dus1_alloc); + const BufferAllocation& dus2_alloc = GetTopLevelAllocation(*buffers, dus2); + EXPECT_EQ(parameter_alloc, dus2_alloc); } } diff --git a/tensorflow/compiler/xla/service/cholesky_expander.cc b/tensorflow/compiler/xla/service/cholesky_expander.cc index 20576cdc52d..4abfe1b018e 100644 --- a/tensorflow/compiler/xla/service/cholesky_expander.cc +++ b/tensorflow/compiler/xla/service/cholesky_expander.cc @@ -35,8 +35,6 @@ limitations under the License. namespace xla { -namespace { - // The Cholesky–Banachiewicz algorithm. See // https://en.wikipedia.org/wiki/Cholesky_decomposition#The_Cholesky–Banachiewicz_and_Cholesky–Crout_algorithms // for a description. @@ -54,78 +52,81 @@ namespace { // l = temp / l[..., j, j) * mask + l // return l // Returns a (result, error) pair. -std::pair CholeskyUnblocked( +StatusOr> CholeskyExpander::CholeskyUnblocked( XlaOp a, PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); - auto result = [&]() -> StatusOr> { - TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); - const int n_dims = a_shape.rank(); - const int64 n = ShapeUtil::GetDimension(a_shape, -1); - auto major_dims = AsInt64Slice(a_shape.dimensions()) - .subspan( - /*pos=*/0, - /*len=*/n_dims - 2); + TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); + const int ndims = a_shape.rank(); + const int64 n = ShapeUtil::GetDimension(a_shape, -1); + std::vector error_dims(a_shape.dimensions().begin(), + a_shape.dimensions().end()); + error_dims.back() = error_dims.at(ndims - 2) = 1; - auto matrix_dims = AsInt64Slice(a_shape.dimensions()) - .subspan( - /*pos=*/0, - /*len=*/n_dims); + auto major_dims = AsInt64Slice(a_shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/ndims - 2); - XlaOp l = ZerosLike(a); + auto matrix_dims = AsInt64Slice(a_shape.dimensions()) + .subspan( + /*pos=*/0, + /*len=*/ndims); - // Construct the for loop body to iterate over rows. 
- auto body_fn = - [&](XlaOp i, absl::Span loop_vars, - XlaBuilder* body_builder) -> StatusOr> { - std::vector row_shape_dims(major_dims.begin(), major_dims.end()); - std::vector col_shape_dims(major_dims.begin(), major_dims.end()); - auto body_a = loop_vars[0]; - auto body_l = loop_vars[1]; - auto seen_error = loop_vars[2]; - auto iota_row = Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), - n_dims - 1); - auto iota_col = Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), - n_dims - 2); + XlaOp l = ZerosLike(a); - auto mask_pred = Ge(iota_col, iota_row); - mask_pred = And(mask_pred, Eq(iota_row, i)); - auto mask_zeros = - Zeros(body_builder, - ShapeUtil::MakeShape(a_shape.element_type(), matrix_dims)); - // L * L.T, This matrix has of a lot of multiplying with zero - // (namely, L[:, j:] = 0) and redundant computation, but it is faster - // than slice. - auto l_square = BatchDot(body_l, false, body_l, true, precision); + // Construct the for loop body to iterate over rows. + auto body_fn = [&](XlaOp i, absl::Span loop_vars, + XlaBuilder* body_builder) -> StatusOr> { + std::vector row_shape_dims(major_dims.begin(), major_dims.end()); + std::vector col_shape_dims(major_dims.begin(), major_dims.end()); + auto body_a = loop_vars[0]; + auto body_l = loop_vars[1]; + auto seen_error = loop_vars[2]; + auto iota_row = + Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), ndims - 1); + auto iota_col = + Iota(body_builder, ShapeUtil::MakeShape(S32, matrix_dims), ndims - 2); - // A - L*L.T - l_square = body_a - l_square; - auto l_ii = DynamicSliceInMinorDims(l_square, {i, i}, {1, 1}); + auto mask_pred = Ge(iota_col, iota_row); + mask_pred = And(mask_pred, Eq(iota_row, i)); + auto mask_zeros = + Zeros(body_builder, + ShapeUtil::MakeShape(a_shape.element_type(), matrix_dims)); + // L * L.T, This matrix has of a lot of multiplying with zero + // (namely, L[:, j:] = 0) and redundant computation, but it is faster + // than slice. 
+ auto l_square = + BatchDot(body_l, false, MaybeConjugate(body_l, true), true, precision); + + // A - L*L.T + l_square = body_a - l_square; + auto l_ii = DynamicSliceInMinorDims(l_square, {i, i}, {1, 1}); + if (ShapeUtil::ElementIsComplex(a_shape)) { + auto sqrt = Sqrt(Real(l_ii)); + l_ii = Complex(sqrt, ZerosLike(sqrt)); + seen_error = Or(seen_error, IsNan(sqrt)); + } else { l_ii = Sqrt(l_ii); - // L = (A - L*L.T) / l_ii * mask + L - body_l = Select(mask_pred, l_square / l_ii, mask_zeros) + body_l; + seen_error = Or(seen_error, IsNan(l_ii)); + } + // L = (A - L*L.T) / l_ii * mask + L + body_l = Select(mask_pred, l_square / l_ii, mask_zeros) + body_l; - seen_error = - Or(seen_error, Any(Or(Le(l_ii, ZerosLike(l_ii)), IsNan(l_ii)))); + return std::vector{body_a, body_l, seen_error}; + }; - return std::vector{body_a, body_l, seen_error}; - }; + TF_ASSIGN_OR_RETURN( + auto cholesky_while, + ForEachIndex( + n, S32, body_fn, + {a, l, Zeros(builder, ShapeUtil::MakeShape(PRED, error_dims))}, + "unblocked", builder)); - TF_ASSIGN_OR_RETURN( - auto cholesky_while, - ForEachIndex(n, S32, body_fn, {a, l, ConstantR0(builder, false)}, - "unblocked", builder)); - - return std::make_pair(cholesky_while[1], cholesky_while[2]); - }(); - if (!result.ok()) { - XlaOp error = builder->ReportError(result.status()); - return {error, error}; - } - return result.ValueOrDie(); + return std::make_pair(cholesky_while[1], cholesky_while[2]); } -XlaOp BuildCholesky(XlaOp a, int64 block_size, - PrecisionConfig::Precision precision) { +XlaOp CholeskyExpander::BuildCholesky(XlaOp a, int64 block_size, + PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); @@ -143,64 +144,77 @@ XlaOp BuildCholesky(XlaOp a, int64 block_size, ShapeUtil::HumanString(a_shape)); } - if (primitive_util::IsComplexType(a_shape.element_type())) { - return Unimplemented( - "Complex types are not implemented in Cholesky; got shape %s", - ShapeUtil::HumanString(a_shape)); - } - if (block_size < 1) { return InvalidArgument( "block_size argument to Cholesky must be >= 1; got %d", block_size); } + std::vector error_dims(a_shape.dimensions().begin(), + a_shape.dimensions().end()); + error_dims.back() = error_dims.at(ndims - 2) = 1; + std::vector error_dim_indices(ndims); + absl::c_iota(error_dim_indices, 0); + // Blocked left-looking Cholesky factorization. // Algorithm 1 from // Haidar, Azzam, et al. "High-performance Cholesky factorization for // GPU-only execution." Proceedings of General Purpose GPUs. ACM, 2017. XlaOp l = ZerosLike(a); - XlaOp seen_error = ConstantR0(builder, false); + XlaOp seen_error = Zeros(builder, ShapeUtil::MakeShape(PRED, error_dims)); for (int64 i = 0; i < n; i += block_size) { int64 k = std::min(block_size, n - i); + auto panel = SliceInMinorDims(a, {i, i}, {n, i + k}); if (i > 0) { // TODO(phawkins): consider implementing SYRK for the diagonal part of // the panel. 
// a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i])) auto lhs = SliceInMinorDims(l, {i, 0}, {n, i}); auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i}); - auto delta = BatchDot(lhs, false, rhs, true, precision); - auto before = SliceInMinorDims(a, {i, i}, {n, i + k}); - a = UpdateSliceInMinorDims(a, before - delta, {i, i}); + auto delta = + BatchDot(lhs, false, MaybeConjugate(rhs, true), true, precision); + panel = panel - delta; } // l[i:i+k, i:i+k] = cholesky_unblocked(a[i:i+k, i:i+k]) - auto x = SliceInMinorDims(a, {i, i}, {i + k, i + k}); + auto x = SliceInMinorDims(panel, {0, 0}, {k, k}); XlaOp factorized; + // TODO(b/167896062): A failure in one element of a batch shouldn't fail + // other elements. XlaOp factorized_error; - std::tie(factorized, factorized_error) = CholeskyUnblocked(x, precision); + if (k == 1) { + if (ShapeUtil::ElementIsComplex(a_shape)) { + auto sqrt = Sqrt(Real(x)); + factorized = Complex(sqrt, ZerosLike(sqrt)); + factorized_error = IsNan(sqrt); + } else { + factorized = Sqrt(x); + factorized_error = IsNan(factorized); + } + } else { + TF_ASSIGN_OR_RETURN(auto tile_output, CholeskyUnblocked(x, precision)); + std::tie(factorized, factorized_error) = tile_output; + } seen_error = Or(seen_error, factorized_error); l = UpdateSliceInMinorDims(l, factorized, {i, i}); if (i + k < n) { // l[i+k:, i:i+k] = // trsm_right_transpose(l[i:i+k, i:i+k], a[i+k:, i:i+k]) - auto panel = SliceInMinorDims(a, {i + k, i}, {n, i + k}); - auto update = - TriangularSolve(factorized, panel, - /*left_side=*/false, - /*lower=*/true, - /*unit_diagonal=*/false, - /*transpose_a=*/TriangularSolveOptions::TRANSPOSE); + auto update = TriangularSolve( + factorized, SliceInMinorDims(panel, {k, 0}, {n - i, k}), + /*left_side=*/false, + /*lower=*/true, + /*unit_diagonal=*/false, + /*transpose_a=*/TriangularSolveOptions::ADJOINT); l = UpdateSliceInMinorDims(l, update, {i + k, i}); } } - return Select(seen_error, - FullLike(l, std::numeric_limits::quiet_NaN()), l); + return Select( + BroadcastInDim(seen_error, a_shape.dimensions(), error_dim_indices), + FullLike(l, std::numeric_limits::quiet_NaN()), l); }); } -} // namespace - bool CholeskyExpander::InstructionMatchesPattern(HloInstruction* instruction) { return instruction->opcode() == HloOpcode::kCholesky; } diff --git a/tensorflow/compiler/xla/service/cholesky_expander.h b/tensorflow/compiler/xla/service/cholesky_expander.h index d2958db1b8c..ee8531d0f48 100644 --- a/tensorflow/compiler/xla/service/cholesky_expander.h +++ b/tensorflow/compiler/xla/service/cholesky_expander.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_CHOLESKY_EXPANDER_H_ #include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/service/op_expander_pass.h" namespace xla { @@ -31,7 +32,13 @@ class CholeskyExpander : public OpExpanderPass { StatusOr ExpandInstruction( HloInstruction* instruction) override; + virtual StatusOr> CholeskyUnblocked( + XlaOp a, PrecisionConfig::Precision precision); + private: + XlaOp BuildCholesky(XlaOp a, int64 block_size, + PrecisionConfig::Precision precision); + // Mapping from op signatures to existing computations. 
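For reference, the expander above lowers kCholesky to a blocked left-looking factorization built from the unblocked Cholesky–Banachiewicz update, signalling failure through NaNs accumulated in a seen_error flag instead of aborting. A minimal scalar sketch of the unblocked algorithm on a plain row-major matrix follows, assuming a real symmetric positive-definite input; it is illustrative only, since the pass itself emits the equivalent XLA ops and also handles complex inputs via the conjugate transpose.

#include <cassert>
#include <cmath>
#include <vector>

// Computes the lower-triangular factor L of a symmetric positive-definite
// n x n matrix A (row-major), so that A = L * L^T. Returns true if a negative
// pivot produced a NaN, mirroring how the expander folds failures into a
// `seen_error` flag rather than stopping.
bool CholeskyBanachiewicz(const std::vector<double>& a, int n,
                          std::vector<double>& l) {
  bool seen_error = false;
  l.assign(n * n, 0.0);
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j <= i; ++j) {
      double sum = 0.0;
      for (int k = 0; k < j; ++k) sum += l[i * n + k] * l[j * n + k];
      if (i == j) {
        l[i * n + i] = std::sqrt(a[i * n + i] - sum);  // NaN if pivot < 0
        seen_error |= std::isnan(l[i * n + i]);
      } else {
        l[i * n + j] = (a[i * n + j] - sum) / l[j * n + j];
      }
    }
  }
  return seen_error;
}

int main() {
  std::vector<double> a = {4.0, 2.0, 2.0, 3.0};  // SPD 2x2 matrix
  std::vector<double> l;
  bool err = CholeskyBanachiewicz(a, 2, l);
  assert(!err);
  // l == {2, 0, 1, sqrt(2)}, so that a == l * l^T.
  return 0;
}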
absl::flat_hash_map computation_cache_; }; diff --git a/tensorflow/compiler/xla/service/collective_ops_utils.h b/tensorflow/compiler/xla/service/collective_ops_utils.h index eb96d537fa8..4eaa9101cc4 100644 --- a/tensorflow/compiler/xla/service/collective_ops_utils.h +++ b/tensorflow/compiler/xla/service/collective_ops_utils.h @@ -82,22 +82,6 @@ struct RendezvousKey { collective_op_kind(collective_op_kind), op_id(op_id) {} - static RendezvousKey FromInstruction( - const RunId& run_id, std::vector global_devices, - int num_local_participants, const HloInstruction* instr) { - CollectiveOpKind collective_op_kind; - int64 op_id; - - std::tie(collective_op_kind, op_id) = - instr->channel_id().has_value() - ? std::make_pair(kCrossModule, instr->channel_id().value()) - : std::make_pair( - kCrossReplica, - static_cast(instr->GetModule()->unique_id())); - return RendezvousKey(run_id, std::move(global_devices), - num_local_participants, collective_op_kind, op_id); - } - template friend H AbslHashValue(H h, const RendezvousKey& k) { return H::combine(std::move(h), k.run_id, k.global_devices, diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index f03b27cdcc7..653f4555a77 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -28,14 +28,6 @@ namespace xla { /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_( tensorflow::LINKER_INITIALIZED); -StatusOr< - std::tuple, std::unique_ptr>> -Compiler::RunHloPassesAndBufferAssignement( - std::unique_ptr module, se::StreamExecutor* executor, - se::DeviceMemoryAllocator* device_allocator) { - return Unimplemented("This compiler does not support this method"); -} - std::vector> Compiler::ComputeBackendConfigs(const HloInstruction& hlo, se::StreamExecutor* executor) const { diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 312a068ba65..253caac195c 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -188,7 +188,10 @@ class Compiler { std::tuple, std::unique_ptr>> RunHloPassesAndBufferAssignement(std::unique_ptr module, se::StreamExecutor* executor, - se::DeviceMemoryAllocator* device_allocator); + se::DeviceMemoryAllocator* device_allocator, + bool optimize) { + return Unimplemented("This compiler does not support this method"); + } // Compiles the HLO module for execution on a device given by the executor, // and returns an executable object or an error status. No HLO passes are diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.cc b/tensorflow/compiler/xla/service/conditional_code_motion.cc index ce80b4cfc15..855e75a76e0 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion.cc @@ -23,17 +23,20 @@ limitations under the License. 
#include "absl/algorithm/container.h" #include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/call_graph.h" #include "tensorflow/compiler/xla/service/call_inliner.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -95,31 +98,63 @@ class BoundaryVisitor { absl::flat_hash_set visited_; }; +template +int64 CountNonLeafOps(const OpCollection& ops) { + absl::flat_hash_set op_set; + for (auto op : ops) { + if (!op_set.contains(op) && op->opcode() != HloOpcode::kConstant) { + op_set.insert(op); + } + } + return op_set.size(); +} + // Returns estimation of potential reuses carried by a given pair of // instructions. Use different integers to classify different levels // of reuses This is used as a placeholder only, assuming all // instructions can be fused to enable data reuses int64 ReusesCarriedBy(HloInstruction* op, HloInstruction* user) { + // Reuses in some way work like forces that pull instructions + // towards each other. We use a number 0-10 to classify how strong the force + // is between a pair of operations. Given a group of instructions that can be + // moved together, if the forces inside a conditional are stronger, the group + // will be moved incide or remain inside the conditional; otherwise, it will + // be moved outside to or remain outside of the conditional. VLOG(2) << "ConditionalCodeMotion: Add reuses carried by instr: " << op->ToString() << "=>" << user->ToString() << "\n"; switch (user->opcode()) { case HloOpcode::kGetTupleElement: - case HloOpcode::kTuple: return 0; + case HloOpcode::kConvert: + // Because convert is treated not moveable when following Dot or + // convolution, here if op is dot or convolution, they must be separated + // by a conditional boundary. Here we do not try to pull convert inside + // conditionals to be together with the dot or convolution. + switch (op->opcode()) { + case HloOpcode::kConvolution: + case HloOpcode::kDot: + return 0; + default: + break; + } + break; default: break; } switch (op->opcode()) { - // These instructions are lightweight and easy to fuse. + // These instructions do not carry weight of reuse themselves. + case HloOpcode::kParameter: case HloOpcode::kConstant: case HloOpcode::kGetTupleElement: return 0; - default: - // Assume fusion will not happen anyway if user count > 1) - if (op->user_count() > 1) { - return 0; - } + case HloOpcode::kConditional: return 10; + default: { + // Assume the reuse decreases with increasing user count. 
+ int count1 = CountNonLeafOps(op->users()); + int count2 = CountNonLeafOps(user->operands()); + return 10 / count1 / count2; + } } } @@ -177,17 +212,35 @@ Status CopyInOrOutOfConditional( absl::InlinedVector new_operands; for (int i = 0; i < op->operands().size(); ++i) { auto op_i = op->operands()[i]; - VLOG(2) << "Looking for operand:" << op_i->ToString() << "\n"; + VLOG(2) << "Looking for " << op_i->ToString() << "\n"; if (ContainsKey(hoisted_instructions, op_i)) { auto new_op_i = FindOrDie(hoisted_instructions, op_i).operands()[dest_index]; - VLOG(2) << "new operand:" << new_op_i->ToString() << "\n"; + VLOG(2) << "new instruction:" << new_op_i->ToString() << "\n"; new_operands.push_back(new_op_i); } else { - CHECK(op_i->opcode() == HloOpcode::kConstant); - auto new_op_i = parent->AddInstruction(op_i->Clone()); - VLOG(2) << "new operand:" << new_op_i->ToString() << "\n"; - new_operands.push_back(new_op_i); + switch (op_i->opcode()) { + case HloOpcode::kConstant: { + auto new_op_i = parent->AddInstruction(op_i->Clone()); + VLOG(2) << "new instruction:" << new_op_i->ToString() << "\n"; + new_operands.push_back(new_op_i); + break; + } + case HloOpcode::kGetTupleElement: { + auto gte = Cast(op_i); + int64 index = gte->tuple_index(); + HloInstruction* root = parent->root_instruction(); + CHECK(root->opcode() == HloOpcode::kTuple && + index < root->operand_count()); + auto new_op_i = root->mutable_operand(index); + VLOG(2) << "new instruction:" << new_op_i->ToString() << "\n"; + new_operands.push_back(new_op_i); + break; + } + default: + LOG(FATAL) << "Unexpected out-of-boundary instruction:" + << op_i->ToString() << "\n"; + } } } HloInstruction* new_instruction = parent->AddInstruction( @@ -298,125 +351,130 @@ StatusOr ConvertSpecialMove(HloInstruction* conditional, return false; } - HloInstruction* old_root = - conditional->branch_computation(0)->root_instruction(); - if (old_root->opcode() != HloOpcode::kTuple) { - return false; - } else { - VLOG(2) << "BEFORE :" << conditional->parent()->parent()->ToString(); - // Identify the gte using `index'. - auto find_gte = [](const HloInstruction* conditional_result, - int64 index) -> HloInstruction* { - for (HloInstruction* instr : conditional_result->users()) { - if (instr->opcode() != HloOpcode::kGetTupleElement) { - return nullptr; - } - if (instr->tuple_index() == index) { - return instr; - } - } - return nullptr; - }; - - // Captures tuple indices refering to converts to be rematerialized/hoisted. - absl::flat_hash_set kspecial_convert = FindSpecialConverts( - old_root, branch_count, conditional, is_layout_sensitive); - - // Exit if we cannot find any converts to be hoisted. 
- if (kspecial_convert.empty()) { + // Determining whether all branch roots are tuples + for (int branch_num = 0; branch_num < branch_count; ++branch_num) { + HloInstruction* branch_root = + conditional->branch_computation(branch_num)->root_instruction(); + if (branch_root->opcode() != HloOpcode::kTuple) { return false; } + } - TF_RETURN_IF_ERROR( - RestructureConditionalInstruction(conditional->parent(), conditional)); - - for (int branch = 0; branch < branch_count; branch++) { - old_root = conditional->branch_computation(branch)->root_instruction(); - absl::flat_hash_map map_inst_to_tuple_index; - std::vector new_operands(old_root->operand_count()); - absl::flat_hash_set to_hoist_set; - - for (int64 operand_num = 0; operand_num < old_root->operand_count(); - ++operand_num) { - map_inst_to_tuple_index[old_root->mutable_operand(operand_num)] = - operand_num; + HloInstruction* old_root = + conditional->branch_computation(0)->root_instruction(); + VLOG(2) << "BEFORE :" << conditional->parent()->parent()->ToString(); + // Identify the gte using `index'. + auto find_gte = [](const HloInstruction* conditional_result, + int64 index) -> HloInstruction* { + for (HloInstruction* instr : conditional_result->users()) { + if (instr->opcode() != HloOpcode::kGetTupleElement) { + return nullptr; } - for (int64 operand_num = 0; operand_num < old_root->operand_count(); - ++operand_num) { - HloInstruction* hoist = old_root->mutable_operand(operand_num); - if (!kspecial_convert.contains(operand_num)) { - new_operands[operand_num] = old_root->mutable_operand(operand_num); - continue; - } - - to_hoist_set.insert(hoist); - int64 new_tuple_count = old_root->operand_count(); - - // Replace the hoisted instr in the tuple with the operand/operands. - // We will replace at least one of the operands of the hoist at the - // tuple place; the rest will be added at the end. - bool inplace = true; - CHECK(!hoist->operands().empty()); - for (HloInstruction* prod : hoist->operands()) { - if (inplace) { - map_inst_to_tuple_index[prod] = map_inst_to_tuple_index[hoist]; - new_operands[map_inst_to_tuple_index[hoist]] = prod; - inplace = false; - } else { - map_inst_to_tuple_index[prod] = new_tuple_count++; - new_operands.push_back(prod); - } - } + if (instr->tuple_index() == index) { + return instr; } + } + return nullptr; + }; - // Create the new root instruction. - HloComputation* cur_branch = conditional->branch_computation(branch); - HloInstruction* new_branch_root = - cur_branch->AddInstruction(HloInstruction::CreateTuple(new_operands)); - // The shape can vary since the operands to convert are now - // being returned through the branches' root. - cur_branch->set_root_instruction(new_branch_root, true /*new shape*/); - TF_CHECK_OK(cur_branch->RemoveInstruction(old_root)); + // Captures tuple indices refering to converts to be rematerialized/hoisted. + absl::flat_hash_set kspecial_convert = FindSpecialConverts( + old_root, branch_count, conditional, is_layout_sensitive); - // Only one of the branches needs to change the conditional->parent(). - if (branch != 0) { + // Exit if we cannot find any converts to be hoisted. 
+ if (kspecial_convert.empty()) { + return false; + } + + TF_RETURN_IF_ERROR( + RestructureConditionalInstruction(conditional->parent(), conditional)); + + for (int branch = 0; branch < branch_count; branch++) { + old_root = conditional->branch_computation(branch)->root_instruction(); + absl::flat_hash_map map_inst_to_tuple_index; + std::vector new_operands(old_root->operand_count()); + absl::flat_hash_set to_hoist_set; + + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + map_inst_to_tuple_index[old_root->mutable_operand(operand_num)] = + operand_num; + } + for (int64 operand_num = 0; operand_num < old_root->operand_count(); + ++operand_num) { + HloInstruction* hoist = old_root->mutable_operand(operand_num); + if (!kspecial_convert.contains(operand_num)) { + new_operands[operand_num] = old_root->mutable_operand(operand_num); continue; } - HloComputation* conditional_parent = conditional->parent(); - HloInstruction* newconditional = - conditional_parent->AddInstruction(HloInstruction::CreateConditional( - cur_branch->root_instruction()->shape(), - conditional->mutable_operand(0), - absl::MakeSpan(conditional->branch_computations()), - absl::MakeSpan(conditional->operands()).subspan(1))); - // Ensure that all the users of conditional refer to the new one. - TF_RETURN_IF_ERROR( - conditional->ReplaceAllUsesWithDifferentShape(newconditional)); - TF_CHECK_OK(conditional_parent->RemoveInstruction(conditional)); - conditional = newconditional; - // Add the hoisted instructions in the parent. - for (HloInstruction* hoist : to_hoist_set) { - VLOG(2) << "Hoisting instruction:" << hoist->ToString(); - int64 hoist_index = map_inst_to_tuple_index[hoist]; - // Find out the gte that captured the hoisted instr result. - HloInstruction* gte_hoist = find_gte(conditional, hoist_index); - CHECK(gte_hoist != nullptr); - std::vector new_operands; - for (HloInstruction* op : hoist->operands()) { - HloInstruction* gte = conditional_parent->AddInstruction( - HloInstruction::CreateGetTupleElement( - op->shape(), conditional, map_inst_to_tuple_index[op])); - new_operands.push_back(gte); + + to_hoist_set.insert(hoist); + int64 new_tuple_count = old_root->operand_count(); + + // Replace the hoisted instr in the tuple with the operand/operands. + // We will replace at least one of the operands of the hoist at the + // tuple place; the rest will be added at the end. + bool inplace = true; + CHECK(!hoist->operands().empty()); + for (HloInstruction* prod : hoist->operands()) { + if (inplace) { + map_inst_to_tuple_index[prod] = map_inst_to_tuple_index[hoist]; + new_operands[map_inst_to_tuple_index[hoist]] = prod; + inplace = false; + } else { + map_inst_to_tuple_index[prod] = new_tuple_count++; + new_operands.push_back(prod); } - HloInstruction* hoisted = conditional_parent->AddInstruction( - hoist->CloneWithNewOperands(hoist->shape(), new_operands)); - VLOG(2) << "Hoisted instruction in parent:" << hoisted->ToString(); - TF_RETURN_IF_ERROR(gte_hoist->ReplaceAllUsesWith(hoisted)); - TF_CHECK_OK(conditional_parent->RemoveInstruction(gte_hoist)); } - // No need to explicitly delete a hoisted instruction since if its dead - // then the subsequent DCE will remove it. } + + // Create the new root instruction. 
+ HloComputation* cur_branch = conditional->branch_computation(branch); + HloInstruction* new_branch_root = + cur_branch->AddInstruction(HloInstruction::CreateTuple(new_operands)); + // The shape can vary since the operands to convert are now + // being returned through the branches' root. + cur_branch->set_root_instruction(new_branch_root, true /*new shape*/); + TF_CHECK_OK(cur_branch->RemoveInstruction(old_root)); + + // Only one of the branches needs to change the conditional->parent(). + if (branch != 0) { + continue; + } + HloComputation* conditional_parent = conditional->parent(); + HloInstruction* newconditional = + conditional_parent->AddInstruction(HloInstruction::CreateConditional( + cur_branch->root_instruction()->shape(), + conditional->mutable_operand(0), + absl::MakeSpan(conditional->branch_computations()), + absl::MakeSpan(conditional->operands()).subspan(1))); + // Ensure that all the users of conditional refer to the new one. + TF_RETURN_IF_ERROR( + conditional->ReplaceAllUsesWithDifferentShape(newconditional)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(conditional)); + conditional = newconditional; + // Add the hoisted instructions in the parent. + for (HloInstruction* hoist : to_hoist_set) { + VLOG(2) << "Hoisting instruction:" << hoist->ToString(); + int64 hoist_index = map_inst_to_tuple_index[hoist]; + // Find out the gte that captured the hoisted instr result. + HloInstruction* gte_hoist = find_gte(conditional, hoist_index); + CHECK(gte_hoist != nullptr); + std::vector new_operands; + for (HloInstruction* op : hoist->operands()) { + HloInstruction* gte = conditional_parent->AddInstruction( + HloInstruction::CreateGetTupleElement(op->shape(), conditional, + map_inst_to_tuple_index[op])); + new_operands.push_back(gte); + } + HloInstruction* hoisted = conditional_parent->AddInstruction( + hoist->CloneWithNewOperands(hoist->shape(), new_operands)); + VLOG(2) << "Hoisted instruction in parent:" << hoisted->ToString(); + TF_RETURN_IF_ERROR(gte_hoist->ReplaceAllUsesWith(hoisted)); + TF_CHECK_OK(conditional_parent->RemoveInstruction(gte_hoist)); + } + // No need to explicitly delete a hoisted instruction since if its dead + // then the subsequent DCE will remove it. 
} VLOG(2) << "AFTER :" << conditional->parent()->parent()->ToString(); return true; @@ -446,7 +504,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( << conditional_parent->ToString(HloPrintOptions::Fingerprint()) << "\n"; int64 op_index = 0; - for (Boundary b : new_boundaries) { + for (const Boundary& b : new_boundaries) { HloInstruction* op = b.operands()[0]; CHECK(op != nullptr); VLOG(2) << "Mapping new boundary instr: " << op->ToString() << "\n"; @@ -477,6 +535,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( int64 index = tuple_opd->tuple_index(); CHECK(old_root->operands().size() > index); HloInstruction* old_opd = old_root->operands()[index]; + VLOG(2) << "old opd = " << old_opd << "\n"; CHECK(ContainsKey(hoisted_instructions, old_opd)); HloInstruction* new_opd = hoisted_instructions[old_opd].operands()[0]; CHECK(old_opd != nullptr); @@ -492,7 +551,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( for (int i = 0; i < branch_count; i++) { auto computation = conditional->branch_computation(i); std::vector elements; - for (auto b1 : new_boundaries) { + for (const auto& b1 : new_boundaries) { HloInstruction* op = b1.operands()[i]; CHECK(op != nullptr); VLOG(2) << "Adding to root " << i << " with " << op->ToString() << "\n"; @@ -503,15 +562,24 @@ StatusOr ConditionalCodeMotion::MoveInstructionOut( computation->set_root_instruction(tuple, true); VLOG(2) << "computation is :" << computation->ToString() << "\n"; // Remove hoisted instructions from the branches. - for (auto b2 : to_move_out) { - VLOG(2) << "Removing boundary:" << b2.ToString() << "\n"; - TF_RETURN_IF_ERROR(computation->RemoveInstruction(b2.operands()[i])); + for (const auto& b2 : to_move_out) { + auto instr_to_remove = b2.operands()[i]; + // Double check to make sure it is safe to delete the instruction. + // Complications may arise due to some operations in the alternative + // branches (branches 1..n) being placed into the boundaries multiple + // times. + if (!computation->IsMarkedAsDead(instr_to_remove) && + instr_to_remove->user_count() == 0) { + VLOG(2) << "Removing boundary:" << b2.ToString() << "\n"; + TF_RETURN_IF_ERROR(computation->RemoveInstruction(instr_to_remove)); + } } } // Change conditional instruction shape to the shape of the new root. HloInstruction* new_root = conditional->branch_computation(0)->root_instruction(); *conditional->mutable_shape() = new_root->shape(); + // VLOG(1) << "done moving instructions out of branches\n" << conditional_parent->ToString(HloPrintOptions::Fingerprint()) @@ -535,16 +603,26 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( absl::flat_hash_map hoisted_instructions; int64 to_move_in_size = to_move_in.size(); int64 branch_count = conditional->branch_count(); + HloGetTupleElementInstruction* tuple_use = + DynCast(to_move_in[0].operands()[0]); + // If use_index is -1, the old conditional root entry used by to_move_in + // instructions still need to be included as an entry of the modified + // conditional root, and the new result of the to_move_in instructions + // need to be added as an extra entry of the modified root; otherwise, the + // old root entry will be replaced with the new result in the modified root. + // The entry replacement should be allowed only if tuple_use has <=1 users. + int64 use_index = (tuple_use != nullptr && tuple_use->user_count() == 1) + ? tuple_use->tuple_index() + : -1; + VLOG(2) << "Tuple use index = " << use_index << "\n"; // Number of old conditional entries still to be used outside. 
// If conditional shape is not tuple, will create a tuple and use subscript // 0 to save the old operand being used. - int64 op_index = conditional->shape().IsTuple() - ? conditional->shape().tuple_shapes_size() - 1 - : 0; - HloGetTupleElementInstruction* tuple_use = - dynamic_cast(to_move_in[0].operands()[0]); - int64 use_index = (tuple_use != nullptr) ? tuple_use->tuple_index() : -1; - VLOG(2) << "Tuple use index = " << use_index << "\n"; + int64 op_index = + conditional->shape().IsTuple() + ? ((use_index >= 0) ? conditional->shape().tuple_shapes_size() - 1 + : conditional->shape().tuple_shapes_size()) + : 0; // Use to map the tuple_use instruction to its operand; Boundary b_opd_use(Boundary::Position::kInsideBranch); Boundary b_old_root(Boundary::Position::kInsideBranch); @@ -582,6 +660,7 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( // to replace the conditional directly in the new computation. b_opd_use.mutable_operands().push_back(conditional); } + HloInstruction* new_root = computation->AddInstruction(HloInstruction::CreateTuple(operands)); VLOG(2) << "setting new root: " << new_root->ToString() << "\n"; @@ -592,29 +671,41 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( } VLOG(2) << "new branch computation: " << computation->ToString() << "\n"; } + // Update get tuple element index of the conditional. + if (use_index != -1) { + for (auto* user : conditional->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement && + user->tuple_index() > use_index) { + user->set_tuple_index(user->tuple_index() - 1); + } + } + } hoisted_instructions[conditional] = b_old_root; int64 cp_start = 0; if (use_index >= 0) { + VLOG(2) << "Mapping GTE: " << tuple_use->ToString() << "\n"; hoisted_instructions[tuple_use] = b_opd_use; - cp_start = 1; } - for (int64 i = cp_start; i < to_move_in_size; i++) { - Boundary b_to_move = to_move_in[i]; + cp_start = (tuple_use != nullptr) ? 1 : 0; + for (int64 to_move_index = cp_start; to_move_index < to_move_in_size; + to_move_index++) { + Boundary b_to_move = to_move_in[to_move_index]; HloInstruction* op = b_to_move.operands()[0]; CHECK(op != nullptr); bool to_be_used_outside = true; VLOG(2) << "Mapping new boundary instr: " << op->ToString() << "\n"; - if (i < to_move_in_size - 1 && op->user_count() == 1 && - op->users()[0] == to_move_in[i + 1].operands()[0]) { + if (to_move_index < to_move_in_size - 1 && op->user_count() == 1 && + op->users()[0] == to_move_in[to_move_index + 1].operands()[0]) { to_be_used_outside = false; VLOG(2) << "Instruction is not to be used outside the branch\n"; } Boundary b(Boundary::Position::kInsideBranch); for (int i = 0; i < branch_count; i++) { auto computation = conditional->branch_computation(i); + VLOG(2) << "Copying to branch: " << i << "\n"; TF_RETURN_IF_ERROR(CopyInOrOutOfConditional(b_to_move, i, computation, hoisted_instructions)); - VLOG(2) << "After Copying to branch: " << computation->ToString() << "\n"; + VLOG(2) << "Done:" << computation->ToString() << "\n"; if (to_be_used_outside) { auto new_op = hoisted_instructions[op].operands()[i]; auto new_root = computation->root_instruction(); @@ -648,12 +739,23 @@ StatusOr ConditionalCodeMotion::MoveInstructionIn( // Remove hoisted instructions from the branches. 
for (int64 i = to_move_in_size - 1; i >= 0; i--) { Boundary boundary_to_move_in = to_move_in[i]; - VLOG(2) << "Removing boundary:" << boundary_to_move_in.ToString() << "\n"; HloInstruction* op = boundary_to_move_in.operands()[0]; - for (auto user : op->users()) { - VLOG(2) << "Has User: " << user->ToString() << "\n"; + if (op->user_count() == 0) { + VLOG(2) << "Removing boundary:" << boundary_to_move_in.ToString() << "\n"; + TF_RETURN_IF_ERROR(conditional->parent()->RemoveInstruction(op)); + VLOG(2) << "Done removing boundary.\n"; + } + } + + // Reset shapes of user gtes to the new shape. + if (use_index != -1) { + for (auto* user : conditional->users()) { + if (user->opcode() == HloOpcode::kGetTupleElement) { + VLOG(2) << "Resetting shape of user: " << user->ToString() << "\n"; + *user->mutable_shape() = + conditional->shape().tuple_shapes(user->tuple_index()); + } } - TF_RETURN_IF_ERROR(conditional->parent()->RemoveInstruction(op)); } VLOG(1) << "Done moving instructions inside branches\n" << conditional->parent()->ToString(HloPrintOptions::Fingerprint()) @@ -669,16 +771,23 @@ class GroupConnectedBoundaries { HloComputation* conditional_parent_; bool is_layout_sensitive_; // Instructions that have been visited but are not going to be moved. - absl::flat_hash_set visited_; + absl::flat_hash_map& visited_count_; public: - explicit GroupConnectedBoundaries(HloInstruction* conditional, - bool is_layout_sensitive) + explicit GroupConnectedBoundaries( + HloInstruction* conditional, bool is_layout_sensitive, + absl::flat_hash_map& visited_count) : conditional_(conditional), conditional_parent_(conditional->parent()), - is_layout_sensitive_(is_layout_sensitive) {} - // Returns true if `instruction` is worth hoisting out. - bool WorthHoisting(HloInstruction* instruction) { + is_layout_sensitive_(is_layout_sensitive), + visited_count_(visited_count) {} + void clear_recently_visited() { + for (const auto& boundary : new_boundaries_) { + visited_count_.erase(boundary.operands()[0]); + } + } + // Returns true if `instruction` is worth hoisting. + bool WorthHoisting(HloInstruction* instruction, bool is_inside_branch) { // This is needed for the "moving-in" transformation, to prevent the root // of the parent computation (which contains the conditional) to be moved // inside the conditional. @@ -686,6 +795,8 @@ class GroupConnectedBoundaries { instruction == conditional_parent_->root_instruction()) { return false; } + // TODO(b/169182921): The following cost model is rather incomplete. It will + // need to be extended to cover most element-wise ops. switch (instruction->opcode()) { case HloOpcode::kConvert: // If Convert is after AllReduce, it is worth moving out AllReduce // out of conditional. For convert following other // ops such as Dot or Convolutional, it is better to keep convert // within conditional so that convert can be fused with Dot or // Convolutional. - // - // TODO(b/154283721): figure out the scenario when convert can be - // fused with AllReduce out of conditional.
switch (instruction->operand(0)->opcode()) { case HloOpcode::kAllReduce: case HloOpcode::kReshape: + case HloOpcode::kGetTupleElement: return true; default: - VLOG(2) << "Instruction is convert and its operand is not know to " + VLOG(2) << "Instruction is convert and its operand is not known to " "be worth hoisting\n"; return false; } + case HloOpcode::kGetTupleElement: + switch (instruction->operand(0)->opcode()) { + // do not move GTE if its operand is a parameter + case HloOpcode::kParameter: + return false; + default: + return true; + } case HloOpcode::kAllReduce: + // It is not safe to move collective ops from outside to inside + // conditional branches, as it may cause synchronization problems, + // when different layouts are assigned to different branches. + return is_inside_branch; + case HloOpcode::kAbs: + case HloOpcode::kReduce: case HloOpcode::kAdd: case HloOpcode::kPower: + case HloOpcode::kCopy: case HloOpcode::kConstant: case HloOpcode::kSubtract: case HloOpcode::kMultiply: case HloOpcode::kDivide: case HloOpcode::kTuple: case HloOpcode::kSqrt: + case HloOpcode::kRsqrt: case HloOpcode::kReshape: - case HloOpcode::kGetTupleElement: + case HloOpcode::kMinimum: + case HloOpcode::kMaximum: return true; default: VLOG(2) << "Instruction is not known to be worth hoisting\n"; @@ -728,14 +854,20 @@ class GroupConnectedBoundaries { // The operand must be an instruction that is not going to be moved (if // user is inside the conditional); otherwise it must be the conditional // itself and its user must be outside of the conditional. - if (!ContainsKey(visited_, op) && op != conditional_) { + if (!ContainsKey(visited_count_, op) && op != conditional_) { continue; } - // Only consider single-user cases as reuseable. - if (user->opcode() == HloOpcode::kGetTupleElement && - user->user_count() == 1) { + if (auto tuple_gte = DynCast(user)) { + if (op->opcode() == HloOpcode::kConditional) { + auto tuple = op->branch_computation(0)->root_instruction(); + if (tuple->opcode() == HloOpcode::kTuple) { + auto index = tuple_gte->tuple_index(); + CHECK(index < tuple->operand_count()); + op = tuple->mutable_operand(index); + } + } reuses += ReusesCarriedBy(op, user->users()[0]); - } else if (op->user_count() == 1) { + } else { reuses += ReusesCarriedBy(op, user); } } @@ -753,6 +885,7 @@ class GroupConnectedBoundaries { // some aspects of the overall algorithm need to be redesigned to // accommandate the change. if (all_users.size() > 1) { + VLOG(2) << "Having multiple users from: " << user->ToString() << "\n"; return 0; } if (!all_users.empty()) { @@ -774,7 +907,7 @@ class GroupConnectedBoundaries { } } } - } else if (ContainsKey(visited_, op)) { + } else if (ContainsKey(visited_count_, op)) { reuses += ReusesCarriedBy(user, op); } VLOG(2) << "reuses after instruction " << user->ToString() << ":" @@ -822,16 +955,49 @@ class GroupConnectedBoundaries { } return b2; } - int64 CountNonLeafOps(const xla::HloInstruction::InstructionVector& ops) { - int64 count = 0; - absl::flat_hash_set op_set; - for (auto op : ops) { - if (!op_set.contains(op) && op->opcode() != HloOpcode::kConstant) { - count++; - op_set.insert(op); + + // Checking whether it is safe to move a boundary when visited through a + // dependent already considered for moving. + bool IsSafeToMoveBoundary(const Boundary& next_boundary) { + int64 next_boundary_count = + (next_boundary.IsInsideBranch()) + ? 
next_boundary.operands()[0]->user_count() + : CountNonLeafOps(next_boundary.operands()[0]->operands()); + if (next_boundary_count <= 1) { + // If boundary has only a single or no dependent, safe to move. + return true; + } else { + if (!ContainsKey(visited_count_, next_boundary.operands()[0])) { + VLOG(2) << "Skip next boundary " << next_boundary.ToString() << "\n" + << " because it has multiple dependents: " + << next_boundary_count << "\n"; + visited_count_[next_boundary.operands()[0]] = 1; + new_boundaries_.push_back(next_boundary); + } else { + auto pos = std::find(new_boundaries_.begin(), new_boundaries_.end(), + next_boundary); + if (pos != new_boundaries_.end() || + next_boundary.operands().size() == 1) { + int count = ++visited_count_[next_boundary.operands()[0]]; + if (count == next_boundary_count) { + VLOG(2) << "Recovering next boundary " << next_boundary.ToString() + << "\n" + << " because all of its dependents have been visited: " + << next_boundary_count << "\n"; + visited_count_.erase(next_boundary.operands()[0]); + if (pos != new_boundaries_.end()) { + new_boundaries_.erase(pos); + } + return true; + } + } else { + VLOG(2) << "Skip incompatible multi-dependent boundary: " + << next_boundary.ToString() << ":" << next_boundary_count + << "\n"; + } } } - return count; + return false; } // This function is reused both for moving the boundary outside or into a // conditional. As the result, the readability is somewhat compromised. @@ -846,7 +1012,8 @@ class GroupConnectedBoundaries { VLOG(2) << "visiting boundary " << b.ToString() << "\n"; if ((b.IsOutsideBranch() || InstructionWithinBranchIdentical( b.operands(), is_layout_sensitive_)) && - WorthHoisting(b.operands()[0])) { + IsSafeToMoveBoundary(b) && + WorthHoisting(b.operands()[0], b.IsInsideBranch())) { connected_boundaries_.push_back(b); VLOG(2) << "boundary can be moved\n"; int64 operand_count = (b.IsInsideBranch()) @@ -854,38 +1021,25 @@ class GroupConnectedBoundaries { : b.operands()[0]->users().size(); for (int i = 0; i < operand_count; i++) { Boundary next_boundary = GetNextBoundary(b, i); - int64 next_boundary_count = - (next_boundary.IsInsideBranch()) - ? next_boundary.operands()[0]->user_count() - : CountNonLeafOps(next_boundary.operands()[0]->operands()); - // only consider adding an exclusive producor into the same group. - if (next_boundary_count == 1) { - VLOG(2) << "Add operand " << i << " to visit later\n"; - visitor.AddToWorkList(next_boundary); - } else { - VLOG(2) << "Next boundary " << i - << " has multiple uses: " << next_boundary_count << "\n"; - if (!ContainsKey(visited_, next_boundary.operands()[0])) { - visited_.insert(next_boundary.operands()[0]); - new_boundaries_.push_back(next_boundary); - } - } + VLOG(2) << "Add operand/user " << i << " to visit later\n"; + visitor.AddToWorkList(next_boundary); } } else { VLOG(2) << "boundary cannot be moved\n"; - visited_.insert(b.operands()[0]); + visited_count_[b.operands()[0]] = 1; new_boundaries_.push_back(b); } } } - std::vector BoundariesToMoveInOrOut(const Boundary& b) { + std::vector BoundariesToMoveInOrOut(HloInstruction* conditional, + const Boundary& b) { // At the beginning of optimization, a conditional itself is added to a // worklist. Here the conditional is expanded into two sets of boundaries: // the first set contains the boundary that is inside branches and // contains the root of all branches; the second set of boundaries // contains all the users of the conditional. 
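    // Illustrative sketch of that expansion (not the code in this patch; it
    // assumes Boundary::Position also provides kOutsideBranch, as
    // IsOutsideBranch() suggests, and it omits the worklist bookkeeping):
    //
    //   std::vector<Boundary> seeds;
    //   Boundary roots(Boundary::Position::kInsideBranch);
    //   for (int i = 0; i < conditional->branch_count(); ++i) {
    //     roots.mutable_operands().push_back(
    //         conditional->branch_computation(i)->root_instruction());
    //   }
    //   seeds.push_back(roots);  // one inside-branch seed holding all roots
    //   for (HloInstruction* user : conditional->users()) {
    //     Boundary b(Boundary::Position::kOutsideBranch);
    //     b.mutable_operands().push_back(user);
    //     seeds.push_back(b);    // one outside-branch seed per user
    //   }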
HloInstruction* inst = b.operands()[0]; - if (inst->opcode() == HloOpcode::kConditional) { + if (inst == conditional) { int branch_count = inst->branch_count(); // Add conditional roots as a new boundary to visit. Boundary boundary_in(Boundary::Position::kInsideBranch); @@ -914,9 +1068,12 @@ class GroupConnectedBoundaries { ConditionalCodeMotion::Decision ConditionalCodeMotion::ConsiderCodeMotion( HloInstruction* conditional, const Boundary& cur_boundary, - std::vector& to_move, std::vector& new_boundaries) { - GroupConnectedBoundaries connect(conditional, is_layout_sensitive_); - auto move_in_or_out = connect.BoundariesToMoveInOrOut(cur_boundary); + std::vector& to_move, std::vector& new_boundaries, + absl::flat_hash_map& visited_count) { + GroupConnectedBoundaries connect(conditional, is_layout_sensitive_, + visited_count); + auto move_in_or_out = + connect.BoundariesToMoveInOrOut(conditional, cur_boundary); if (!move_in_or_out.empty()) { auto benefit = connect.BenefitForMovingBoundaries(move_in_or_out); VLOG(2) << "benefit of moving in or out " @@ -929,16 +1086,37 @@ ConditionalCodeMotion::Decision ConditionalCodeMotion::ConsiderCodeMotion( // at the first entry of the sequence is sufficient to know which // direction the move is intended. to_move = move_in_or_out; - return to_move[0].IsInsideBranch() ? Decision::kMoveOutOfBranch - : Decision::kMoveIntoBranch; + return Decision(to_move[0].IsInsideBranch() + ? Decision::Direction::kMoveOutOfBranch + : Decision::Direction::kMoveIntoBranch, + benefit); + } else { + connect.clear_recently_visited(); } } else { connect.AddNewBoundaries(new_boundaries); } - return ConditionalCodeMotion::Decision::kNoChange; + return Decision(Decision::Direction::kNoChange, 0); } StatusOr ConditionalCodeMotion::Run(HloModule* module) { + VLOG(2) << "Begin a new pass of conditional code motion optimization.\n"; + // Use to support debugging of optimization, by disabling the opt after it has + // been applied a pre-determined times (to isolate impact of transformations). + if (!ConsumeFuel("conditional_code_motion", [&] { + return "Skipping conditional opt after allowed limit reaching 0.\n"; + })) { + return false; + } + bool changed = false; + bool cleanup_changed = false; + { + HloPassPipeline subpipeline("before_conditional_code_motion"); + subpipeline.AddPass(/*is_layout_sensitive=*/is_layout_sensitive_); + subpipeline.AddPass(); + TF_ASSIGN_OR_RETURN(auto cleanup_changed_now, subpipeline.Run(module)); + cleanup_changed |= cleanup_changed_now; + } // Gather all the conditional ops in the module ahead of time, to avoid // potential complications of modifying the code that affecting traversal. std::vector conditional_ops; @@ -956,12 +1134,26 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { conditional_computations[branch_i] = 0; } } - conditional_ops.push_back(instr); + if (instr->shape().IsTuple()) { + bool can_change_tuple_shape = true; + for (auto user : instr->users()) { + VLOG(2) << "user is : " << user->ToString() << "\n"; + if (user->opcode() != HloOpcode::kGetTupleElement) { + can_change_tuple_shape = false; + } + } + if (can_change_tuple_shape) { + conditional_ops.push_back(instr); + } + } else { + conditional_ops.push_back(instr); + } } } } - bool changed = false; + // Use to collect mappings between cloned instructions. 
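    // (Each branch clone below is created with Clone("clone", &clone_context),
    // so clone_context records the original-to-clone instruction mapping that
    // update_boundary later queries through FindInstruction.)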
+ HloCloneContext clone_context(module); for (HloInstruction* conditional : conditional_ops) { int branch_count = conditional->branch_count(); // check for shared conditional computations @@ -975,7 +1167,13 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { } // Boundaries to move out or to move into the branches. - std::vector to_move_out, to_move_in, new_boundaries; + std::vector > to_move_out, to_move_in; + std::vector > new_boundaries_for_moveout; + std::vector > new_boundaries_for_movein; + // Number of times each instruction has been visited for moving. + absl::flat_hash_map visited_count; + int benefit_move_out = 0, benefit_move_in = 0; + Decision::Direction final_d = Decision::Direction::kNoChange; // The conditional is moved into a worklist as the seed (starting point). // The conditional will be expanded into multiple seeds (starting points), // its roots and its users, when it is visited by GroupConnectedBoundaries. @@ -983,76 +1181,130 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { // so that the other seeding boundaries can be visited in turn. BoundaryVisitor visitor(conditional); VLOG(2) << "Analyzing conditional:" << conditional->ToString() << "\n"; - ConditionalCodeMotion::Decision d = Decision::kNoChange; - // The following loop breaks out as soon as a decision to modify the - // conditional is reached --- irrespective of whether visitor is empty. - while (d == Decision::kNoChange && visitor.HasNextBoundary()) { + // Try visit all the boundaries, collect the analysis results, and save + // all the benefitical non-conflicting decisions. If two decisions conflict + // with each other, save the more benefitical one. + while (visitor.HasNextBoundary()) { std::vector to_move, next_boundary; Boundary boundary = visitor.PopNextBoundary(); VLOG(2) << "Analyzing boundary:" << boundary.ToString() << "\n"; - d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); - if (d != Decision::kNoChange && conditional_is_shared) { - for (int i = 0; i < branch_count; ++i) { - HloComputation* branch_i = conditional->branch_computation(i); - if (conditional_computations[branch_i] > 0) { - // Cloning is absolutely needed if the computation is shared by - // different branches, but the cloning can be potentially avoided - // if the sharing is only among branches of the same conditional. - // If cloning these branches causes a problem due to space issues, - // a fix can pass a vector of unique branches to the actual - // transformations, as an alternative representation of the - // conditional branches to be modified. Right now we assume the - // overhead of cloning is minimal since later stages of the compiler - // inline all the computations anyway. 
- HloComputation* clone_i = - conditional->parent()->parent()->AddEmbeddedComputation( - branch_i->Clone()); - conditional->set_branch_computation(i, clone_i); - conditional_computations[branch_i]--; + auto d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary, + visited_count); + switch (d.GetDirection()) { + case Decision::Direction::kMoveOutOfBranch: + VLOG(2) << "Local Decision is move out of branch\n"; + to_move_out.push_back(to_move); + new_boundaries_for_moveout.push_back(next_boundary); + benefit_move_out += d.GetBenefit(); + if (benefit_move_out >= benefit_move_in) { + final_d = Decision::Direction::kMoveOutOfBranch; + VLOG(2) << "Current Decision is move out of branch (" + << to_move_out.size() << ")\n"; + } else { + VLOG(2) << "Current Decision remains move into branch\n"; } - } - to_move.clear(); - next_boundary.clear(); - VLOG(2) << "Cloned branches as needed: " << conditional->ToString() - << "\n"; - // Need to reanalyze the cloned code to generate correct result. - d = ConsiderCodeMotion(conditional, boundary, to_move, next_boundary); - } - switch (d) { - case Decision::kMoveOutOfBranch: - VLOG(2) << "Decision is move out of branch\n"; - to_move_out.insert(to_move_out.end(), to_move.begin(), to_move.end()); - new_boundaries.insert(new_boundaries.end(), next_boundary.begin(), - next_boundary.end()); break; - case Decision::kMoveIntoBranch: + case Decision::Direction::kMoveIntoBranch: VLOG(2) << "Decision is move into branch\n"; - to_move_in.insert(to_move_in.end(), to_move.begin(), to_move.end()); - new_boundaries.insert(new_boundaries.end(), next_boundary.begin(), - next_boundary.end()); + to_move_in.push_back(to_move); + new_boundaries_for_movein.push_back(next_boundary); + benefit_move_in += d.GetBenefit(); + if (benefit_move_out >= benefit_move_in) { + VLOG(2) << "Current Decision remains move out of branch\n"; + } else { + final_d = Decision::Direction::kMoveIntoBranch; + VLOG(2) << "Current Decision is move into branch (" + << to_move_in.size() << ")\n"; + } break; - case Decision::kNoChange: + case Decision::Direction::kNoChange: VLOG(2) << "Decision is no change\n"; for (const Boundary& b : next_boundary) { visitor.AddToWorkList(b); + VLOG(2) << "Adding new boundary to worklist:" << b.ToString() + << "\n"; } break; } } + // If modification is to be made, need to clone the shared branches. + if (final_d != Decision::Direction::kNoChange && conditional_is_shared) { + for (int i = 0; i < branch_count; ++i) { + HloComputation* branch_i = conditional->branch_computation(i); + if (conditional_computations[branch_i] > 0) { + // Cloning is absolutely needed if the computation is shared by + // different branches, but the cloning can be potentially avoided + // if the sharing is only among branches of the same conditional. + // If cloning these branches causes a problem due to space issues, + // a fix can pass a vector of unique branches to the actual + // transformations, as an alternative representation of the + // conditional branches to be modified. Right now we assume the + // overhead of cloning is minimal since later stages of the compiler + // inline all the computations anyway. + HloComputation* clone_i = + conditional->parent()->parent()->AddEmbeddedComputation( + branch_i->Clone("clone", &clone_context)); + conditional->set_branch_computation(i, clone_i); + conditional_computations[branch_i]--; + // Need to translate the analysis result to generate correct result. 
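          // Worked example (hypothetical names): if a saved boundary holds
          // {add.1 in on_true, add.2 in on_false} and branch i=0 is cloned,
          // the entry at index 0 must be rewritten to the clone of add.1 found
          // via clone_context, so that the later MoveInstructionOut call edits
          // the branch computation now attached to the conditional rather than
          // the shared original it no longer uses.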
+ auto update_boundary = [&](Boundary& boundary) { + auto cloned_instr = + clone_context.FindInstruction(boundary.operands()[i]); + CHECK(cloned_instr != nullptr); + VLOG(2) << "boundary before cloning:" << boundary.operands()[i] + << "\n"; + boundary.mutable_operands()[i] = cloned_instr; + VLOG(2) << "boundary after cloning:" << boundary.operands()[i] + << "\n"; + }; + // Only boundaries to move out need to be updated. + if (final_d == Decision::Direction::kMoveOutOfBranch) { + for (int i = 0; i < to_move_out.size(); ++i) { + std::vector& m = to_move_out[i]; + std::for_each(m.begin(), m.end(), update_boundary); + } + for (int i = 0; i < new_boundaries_for_moveout.size(); ++i) { + std::vector& m = new_boundaries_for_moveout[i]; + std::for_each(m.begin(), m.end(), update_boundary); + } + } + } + } + VLOG(2) << "Cloned branches as needed: " << conditional->ToString() + << "\n"; + } // At most one of to_move_out or to_move_in can be non-empty, since there is // only one optimization decision. - if (!to_move_out.empty()) { - TF_ASSIGN_OR_RETURN( - bool result, - MoveInstructionOut(conditional, to_move_out, new_boundaries)); - VLOG(2) << "moving out result:" << result << "\n"; - changed |= result; - } else if (!to_move_in.empty()) { - TF_ASSIGN_OR_RETURN( - bool result, - MoveInstructionIn(conditional, to_move_in, new_boundaries)); - VLOG(2) << "moving in result:" << result << "\n"; - changed |= result; + if (final_d == Decision::Direction::kMoveOutOfBranch) { + CHECK(to_move_out.size() == new_boundaries_for_moveout.size()); + for (int i = 0; i < to_move_out.size(); ++i) { + TF_ASSIGN_OR_RETURN(bool result, + MoveInstructionOut(conditional, to_move_out[i], + new_boundaries_for_moveout[i])); + changed |= result; + } + VLOG(2) << "Done moving out of branches " << to_move_out.size() + << " times. \n"; + if (!ConsumeFuel("conditional_code_motion", [&] { + return "Skipping conditional opt after allowed limit reaching 0.\n"; + })) { + break; + } + } else if (final_d == Decision::Direction::kMoveIntoBranch) { + CHECK(to_move_in.size() == new_boundaries_for_movein.size()); + for (int i = 0; i < to_move_in.size(); ++i) { + TF_ASSIGN_OR_RETURN(bool result, + MoveInstructionIn(conditional, to_move_in[i], + new_boundaries_for_movein[i])); + changed |= result; + } + VLOG(2) << "Done moving into branches " << to_move_in.size() + << " times. 
\n"; + if (!ConsumeFuel("conditional_code_motion", [&] { + return "Skipping conditional opt after allowed limit reaching 0.\n"; + })) { + break; + } } else if (pursue_full_conditional_code_motion_ && !conditional_is_shared) { // Invoke special handling for convert rematerialization/hoisting // We need to make sure no sharing is present in the branches because no @@ -1061,17 +1313,30 @@ StatusOr ConditionalCodeMotion::Run(HloModule* module) { TF_ASSIGN_OR_RETURN( bool convert_result, ConvertSpecialMove(conditional, is_layout_sensitive_)); + if (convert_result) { + VLOG(2) << "Done special moving of convert\n"; + if (!ConsumeFuel("conditional_code_motion", [&] { + return "Skipping conditional opt after allowed limit reaching " + "0.\n"; + })) { + break; + } + } changed |= convert_result; } } if (changed) { HloPassPipeline subpipeline( "after_conditional_code_motion_after_convert_hoisting"); + VLOG(2) << "starting after motion passes: DCE\n"; subpipeline.AddPass(); subpipeline.AddPass(); subpipeline.AddPass(); - TF_ASSIGN_OR_RETURN(bool cleanup_changed, subpipeline.Run(module)); - changed |= cleanup_changed; + TF_ASSIGN_OR_RETURN(auto cleanup_changed_now, subpipeline.Run(module)); + cleanup_changed |= cleanup_changed_now; + } + if (cleanup_changed) { + VLOG(2) << "subpipeline cleanup have modified code\n"; } return changed; } diff --git a/tensorflow/compiler/xla/service/conditional_code_motion.h b/tensorflow/compiler/xla/service/conditional_code_motion.h index 68a2aa58235..eaec91cfb00 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion.h +++ b/tensorflow/compiler/xla/service/conditional_code_motion.h @@ -52,6 +52,9 @@ class Boundary { } return res; } + bool operator==(const Boundary& that) { + return ContainersEqual(operands_, that.operands_); + } private: // Boundary instructions in the conditional branches, one from each branch @@ -78,13 +81,30 @@ class ConditionalCodeMotion : public HloModulePass { StatusOr Run(HloModule* module) override; // Optimization decision for each boundary of the conditional instruction. - enum class Decision { kMoveOutOfBranch, kMoveIntoBranch, kNoChange }; + class Decision { + public: + enum class Direction : uint8 { + kMoveOutOfBranch, + kMoveIntoBranch, + kNoChange + }; + + public: + Decision(Direction direction, int benefit) + : direction_(direction), benefit_(benefit) {} + Direction GetDirection() const { return direction_; } + int GetBenefit() const { return benefit_; } + + private: + Direction direction_; + int benefit_; + }; // If the optimization decision is NO_CHANGE, new_boundary is set to nullptr; // otherwise, it is set to the new boundary after proposed optimization. 
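  // A minimal usage sketch for the Decision object (illustrative only; the
  // real accumulation logic is in ConditionalCodeMotion::Run in the .cc change
  // above):
  //
  //   Decision d = ConsiderCodeMotion(conditional, boundary, to_move,
  //                                   new_boundaries, visited_count);
  //   if (d.GetDirection() == Decision::Direction::kMoveOutOfBranch) {
  //     benefit_move_out += d.GetBenefit();
  //   }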
- virtual Decision ConsiderCodeMotion(HloInstruction* conditional, - const Boundary& cur_boundary, - std::vector& to_move, - std::vector& new_boundaries); + virtual Decision ConsiderCodeMotion( + HloInstruction* conditional, const Boundary& cur_boundary, + std::vector& to_move, std::vector& new_boundaries, + absl::flat_hash_map& visited_count); private: const bool is_layout_sensitive_; diff --git a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc index b91f3813980..3b40acf54e3 100644 --- a/tensorflow/compiler/xla/service/conditional_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/conditional_code_motion_test.cc @@ -78,6 +78,52 @@ ENTRY main { EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(), op::GetTupleElement()))); } +TEST_F(ConditionalCodeMotionTest, VerifyConditionalAnalysisWithWhileTuple) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + + body { + %p_body = (f32[2], bf16[2], s32[]) parameter(0) + %val = f32[2] get-tuple-element(p_body), index=0 + %val2 = bf16[2] get-tuple-element(p_body), index=1 + %const = s32[] constant(-1) + ROOT root = (f32[2], bf16[], s32[]) tuple(%val, %val2, %const) + } + + condition { + %p_cond = (f32[2], bf16[2], s32[]) parameter(0) + %gte = s32[] get-tuple-element(%p_cond), index=2 + %const = s32[] constant(42) + ROOT result = pred[] compare(%gte, %const), direction=EQ + } + + on_true { + %arg_tuple.1 = f32[2] parameter(0) + %const = s32[] constant(42) + %add.8493 = f32[2] add(f32[2] %arg_tuple.1, f32[2] %arg_tuple.1) + %convert.2894 = bf16[2] convert(f32[2] %add.8493) + ROOT %tuple.1 = (f32[2], bf16[2], s32[]) tuple(%add.8493, %convert.2894, %const) + } + on_false { + %arg_tuple.1 = f32[2] parameter(0) + %const = s32[] constant(42) + %add.8493 = f32[2] add(f32[2] %arg_tuple.1, f32[2] %arg_tuple.1) + %convert.2894 = bf16[2] convert(f32[2] %add.8493) + %while_init = (f32[2], bf16[2], s32[]) tuple(%add.8493, %convert.2894, %const) + ROOT while = (f32[2], bf16[2], s32[]) while(%while_init), condition=condition, body=body + } + ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = f32[2] parameter(1) + ROOT conditional = (f32[2], bf16[2], s32[]) conditional(pred.1, arg_tuple.11, arg_tuple.11), true_computation=on_true, false_computation=on_false + } +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); +} + TEST_F(ConditionalCodeMotionTest, MoveConvertOutConditionalRoot) { absl::string_view hlo_string = R"( @@ -158,6 +204,44 @@ ENTRY main { EXPECT_THAT(root, AllOf(op::Tuple(op::Convert()))); } +TEST_F(ConditionalCodeMotionTest, ConditionalShapeNotMutable) { + absl::string_view hlo_string = + R"( +HloModule RemoveDotOpOut + +on_true { + %arg_tuple.1 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.1 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.1), index=0 + %reshape.8493 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} %get-tuple-element.1) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.8493, f32[2,512,364]{2,1,0} %reshape.8493) + %convert.2894 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %add.8493) + ROOT %tuple.1 = ( bf16[2,512,364]{2,1,0}) tuple(%convert.2894) +} + +on_false { + %arg_tuple.2 = (f32[93184,4]{1,0}) parameter(0) + %get-tuple-element.3 = f32[93184,4]{1,0} get-tuple-element(%arg_tuple.2), index=0 + %reshape.9717 = f32[2,512,364]{2,1,0} reshape(f32[93184,4]{1,0} 
%get-tuple-element.3) + %add.8493 = f32[2,512,364]{2,1,0} add(f32[2,512,364]{2,1,0} %reshape.9717, f32[2,512,364]{2,1,0} %reshape.9717) + %sub.8493 = f32[2,512,364]{2,1,0} subtract(f32[2,512,364]{2,1,0} %add.8493, f32[2,512,364]{2,1,0} %reshape.9717) + %convert.3604 = bf16[2,512,364]{2,1,0} convert(f32[2,512,364]{2,1,0} %reshape.9717), metadata={op_type="Cast" op_name="gradients/Cast_125_grad/Cast"} + ROOT %tuple.2 = (bf16[2,512,364]{2,1,0}) tuple(%convert.3604) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.11 = (f32[93184,4]{1,0}) parameter(1) + arg_tuple.22 = (f32[93184,4]{1,0}) parameter(2) + conditional = (bf16[2,512,364]{2,1,0}) conditional(pred.1, arg_tuple.11, arg_tuple.22), true_computation=on_true, false_computation=on_false + get-first-index = bf16[2,512,364]{2,1,0} get-tuple-element(conditional), index=0 + ROOT result = (bf16[2,512,364]{2,1,0}, (bf16[2,512,364]{2,1,0})) tuple(get-first-index, conditional) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); +} + TEST_F(ConditionalCodeMotionTest, MoveConvertOut) { absl::string_view hlo_string = R"( @@ -196,17 +280,16 @@ ENTRY main { const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); const HloComputation* on_true = conditional->branch_computation(0); - ASSERT_EQ(on_true->instruction_count(), 2); + ASSERT_EQ(on_true->instruction_count(), 1); const HloComputation* on_false = conditional->branch_computation(1); - ASSERT_EQ(on_false->instruction_count(), 2); + ASSERT_EQ(on_false->instruction_count(), 1); HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT( root, - AllOf(op::Tuple(op::Add(op::Convert(op::Reshape(op::GetTupleElement( - op::GetTupleElement(op::Conditional())))), - op::Convert(op::Reshape(op::GetTupleElement( - op::GetTupleElement(op::Conditional())))))))); + AllOf(op::Tuple(op::Add( + op::Convert(op::Reshape(op::GetTupleElement(op::Conditional()))), + op::Convert(op::Reshape(op::GetTupleElement(op::Conditional()))))))); } TEST_F(ConditionalCodeMotionTest, UserShareOperandCannotBeMoved) { @@ -297,7 +380,7 @@ on_false { get-tuple-element.2 = f32[] get-tuple-element(arg_tuple.2), index=0 constant.3 = f32[] constant(1) constant.4 = f32[] constant(2) - add.4 = f32[] add(get-tuple-element.2, constant.3) + add.4 = f32[] add(constant.4, constant.3) add.5 = f32[] add(get-tuple-element.2, constant.4) add.6 = f32[] add(add.4, add.5) ROOT tuple.4 = (f32[]) tuple(add.6) @@ -322,7 +405,7 @@ ENTRY main { const HloComputation* on_true = conditional->branch_computation(0); ASSERT_EQ(on_true->instruction_count(), 1); const HloComputation* on_false = conditional->branch_computation(1); - ASSERT_EQ(on_false->instruction_count(), 1); + ASSERT_EQ(on_false->instruction_count(), 3); HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT( @@ -505,6 +588,7 @@ ENTRY main { pred.1 = pred[] parameter(0) arg_tuple.3 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(1) arg_tuple.4 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(2) + arg_tuple.5 = f32[3,3,128,128] parameter(3) conditional = (f32[3,3,128,128]) conditional(pred.1, arg_tuple.3, arg_tuple.4), true_computation=on_true, false_computation=on_false @@ -519,6 +603,7 @@ ENTRY main { ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); + CHECK(conditional != nullptr); const HloComputation* 
on_true = conditional->branch_computation(0); ASSERT_EQ(on_true->instruction_count(), 5); const HloComputation* on_false = conditional->branch_computation(1); @@ -537,6 +622,89 @@ ENTRY main { op::AllReduce(op::GetTupleElement(op::Conditional()))))))); } +TEST_F(ConditionalCodeMotionTest, DoNotMoveAllReduceIn) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +%add.64 (x.139: bf16[], y.139: bf16[]) -> bf16[] { + %x.139 = bf16[]{:T(512)} parameter(0) + %y.139 = bf16[]{:T(512)} parameter(1) + ROOT %add.44073 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.139, bf16[]{:T(512)} %y.139) +} + +%add.181 (x.256: bf16[], y.256: bf16[]) -> bf16[] { + %x.256 = bf16[]{:T(512)} parameter(0) + %y.256 = bf16[]{:T(512)} parameter(1) + ROOT %add.44842 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.256, bf16[]{:T(512)} %y.256) +} + +on_true { + arg_tuple.1 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(0) + get-tuple-element.11 = bf16[2,54,168,128] get-tuple-element(arg_tuple.1), index=0 + get-tuple-element.12 = bf16[2,52,168,128] get-tuple-element(arg_tuple.1), index=1 + convolution.1 = bf16[3,3,128,128] convolution(bf16[2,54,168,128] + get-tuple-element.11, bf16[2,52,168,128] + get-tuple-element.12), window={size=52x168 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + add.1 = bf16[3,3,128,128] add(bf16[3,3,128,128] convolution.1, bf16[3,3,128,128] convolution.1) + ROOT tuple.1 = (bf16[3,3,128,128]) tuple(add.1) +} + +on_false { + arg_tuple.2 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(0) + get-tuple-element.21 = bf16[2,86,104,128] + get-tuple-element(arg_tuple.2), index=0 + get-tuple-element.22 = bf16[2,84,104,128] + get-tuple-element(arg_tuple.2), index=1 + convolution.2 = bf16[3,3,128,128] + convolution(bf16[2,86,104,128] get-tuple-element.21, bf16[2,84,104,128] + get-tuple-element.22), window={size=84x104 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + add.2 = bf16[3,3,128,128] add(bf16[3,3,128,128] convolution.2, bf16[3,3,128,128] convolution.2) + ROOT tuple.2 = (bf16[3,3,128,128]) tuple(add.2) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.3 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(1) + arg_tuple.4 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(2) + arg_tuple.5 = f32[3,3,128,128] parameter(3) + conditional = (bf16[3,3,128,128]) + conditional(pred.1, arg_tuple.3, arg_tuple.4), true_computation=on_true, + false_computation=on_false + get-first-index = bf16[3,3,128,128] get-tuple-element(conditional), index=0 + all-reduce.2 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %get-first-index), + channel_id=485, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.181, metadata={op_type="Conv2DBackpropFilter" + op_name="gradients/resnet50/conv2d_22/Conv2D_grad/Conv2DBackpropFilter"} + convert.2 = f32[3,3,128,128] + convert(bf16[3,3,128,128] %all-reduce.2), + metadata={op_type="Cast" op_name="Cast_15"} + ROOT result = (f32[3,3,128,128]) tuple(convert.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + CHECK(conditional != nullptr); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 6); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 6); + + // Checks if conditional shape has changed. 
+ ASSERT_TRUE(ShapeUtil::Compatible( + conditional->shape(), ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape( + BF16, {3, 3, 128, 128})}))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::Convert(op::AllReduce( + op::GetTupleElement(op::Conditional())))))); +} + TEST_F(ConditionalCodeMotionTest, MovePowOpIn) { absl::string_view hlo_string = R"( @@ -581,7 +749,47 @@ ENTRY main { EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); } -TEST_F(ConditionalCodeMotionTest, MovePowInWithSharedBranch) { +TEST_F(ConditionalCodeMotionTest, MoveInWithMultipleGTE) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +on_true { + arg_tuple.1 = (f32[10]) parameter(0) + get-tuple-element.1 = f32[10] get-tuple-element(arg_tuple.1), index=0 + add.1 = f32[10] add(get-tuple-element.1, get-tuple-element.1) + ROOT tuple.3 = (f32[10]) tuple(add.1) +} + +on_false { + arg_tuple.2 = (f32[10]) parameter(0) + get-tuple-element.2 = f32[10] get-tuple-element(arg_tuple.2), index=0 + mul.1 = f32[10] multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple.4 = (f32[10]) tuple(mul.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = (f32[10]) parameter(1) + tuple.2 = (f32[10]) parameter(2) + conditional = (f32[10]) + conditional(pred.1, tuple.1, tuple.2), true_computation=on_true, + false_computation=on_false + get-first-index = f32[10] get-tuple-element(conditional), index=0 + get-first-index.2 = f32[10] get-tuple-element(conditional), index=0 + pow.1 = f32[10] power(get-first-index, get-first-index.2) + ROOT tuple.3 = (f32[10], f32[10]) tuple(pow.1, get-first-index.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::GetTupleElement(op::Conditional()), + op::GetTupleElement(op::Conditional()))); +} + +TEST_F(ConditionalCodeMotionTest, MoveOutWithSharedBranch) { absl::string_view hlo_string = R"( HloModule RemoveIdenticalInstruction @@ -610,12 +818,16 @@ ENTRY main { const HloInstruction* conditional = FindInstruction(module.get(), "conditional"); const HloComputation* on_true = conditional->branch_computation(0); - ASSERT_EQ(on_true->instruction_count(), 5); + ASSERT_EQ(on_true->instruction_count(), 1); const HloComputation* on_false = conditional->branch_computation(1); - ASSERT_EQ(on_false->instruction_count(), 5); + ASSERT_EQ(on_false->instruction_count(), 1); HloInstruction* root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); + EXPECT_THAT( + root, AllOf(op::Power(op::Add(op::GetTupleElement(op::Conditional()), + op::GetTupleElement(op::Conditional())), + op::Add(op::GetTupleElement(op::Conditional()), + op::GetTupleElement(op::Conditional()))))); } TEST_F(ConditionalCodeMotionTest, MovePowInWithNonTupleRoot) { @@ -728,6 +940,257 @@ ENTRY main { EXPECT_THAT(root, AllOf(op::GetTupleElement(op::Conditional()))); } +TEST_F(ConditionalCodeMotionTest, MoveCopyInBranch) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch1 { + arg_tuple.1 = (s32[], f32[10,3]{0,1}) parameter(0) + constant.1 = s32[] constant(4) + get-tuple-element.1 = s32[] get-tuple-element(arg_tuple.1), index=0 + add.1 = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = f32[10,3]{0,1} 
get-tuple-element(arg_tuple.1), index=1 + slice.1 = f32[4,3]{0,1} slice(get-tuple-element.2), + slice={[0:4:1], [0:3:1]} + constant.2 = f32[] constant(0.0) + ROOT tuple.1 = (f32[4,3]{0,1}, s32[],f32[]) tuple(slice.1, add.1, constant.2) +} + +branch2 { + arg_tuple.2 = (s32[], f32[4,3]{1,0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(arg_tuple.2), index=0 + copy.1 = s32[] copy(get-tuple-element.3) + get-tuple-element.4 = f32[4,3]{1,0} get-tuple-element(arg_tuple.2), index=1 + copy.2 = f32[4,3]{0,1} copy(get-tuple-element.4) + constant.2 = f32[] constant(0.0) + ROOT tuple.2 = (f32[4,3]{0,1}, s32[], f32[]) tuple(copy.2, copy.1, constant.2) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.3 = (s32[], f32[10,3]{0,1}) parameter(1) + tuple.4 = (s32[], f32[4,3]{1,0}) parameter(2) + conditional = (f32[4,3]{0,1}, s32[], f32[]) + conditional(pred.1, tuple.3, tuple.4), true_computation=branch1, + false_computation=branch2 + get-zero-index = f32[4,3]{0,1} get-tuple-element(conditional), index=0 + get-first-index = s32[] get-tuple-element(conditional), index=1 + get-second-index = f32[] get-tuple-element(conditional), index=2 + copy.3 = f32[4,3]{1,0} copy(get-zero-index) + ROOT tuple.5 = (f32[4,3]{0,1}, s32[], f32[]) tuple(copy.3, get-first-index, + get-second-index) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + VLOG(1) << module->ToString(); + + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 9); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 8); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Tuple(op::GetTupleElement(op::Conditional(), 2), + op::GetTupleElement(op::Conditional(), 0), + op::GetTupleElement(op::Conditional(), 1)))); +} + +TEST_F(ConditionalCodeMotionTest, MoveReplicatedTupleEntryOut) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +%add.64 (x.139: bf16[], y.139: bf16[]) -> bf16[] { + %x.139 = bf16[]{:T(512)} parameter(0) + %y.139 = bf16[]{:T(512)} parameter(1) + ROOT %add.44073 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.139, bf16[]{:T(512)} %y.139) +} + +%add.181 (x.256: bf16[], y.256: bf16[]) -> bf16[] { + %x.256 = bf16[]{:T(512)} parameter(0) + %y.256 = bf16[]{:T(512)} parameter(1) + ROOT %add.44842 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.256, bf16[]{:T(512)} %y.256) +} + +on_true { + arg_tuple.1 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(0) + get-tuple-element.11 = bf16[2,54,168,128] get-tuple-element(arg_tuple.1), index=0 + get-tuple-element.12 = bf16[2,52,168,128] get-tuple-element(arg_tuple.1), index=1 + convolution.1 = bf16[3,3,128,128] convolution(bf16[2,54,168,128] + get-tuple-element.11, bf16[2,52,168,128] + get-tuple-element.12), window={size=52x168 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + all-reduce.1 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.64 + convert.1 = f32[3,3,128,128] convert(bf16[3,3,128,128] %all-reduce.1) + all-reduce.3 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.1), + channel_id=188, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.64 + convert.3 = 
f32[3,3,128,128] convert(bf16[3,3,128,128] %all-reduce.3) + ROOT tuple.1 = (f32[3,3,128,128], f32[3,3,128,128]) tuple(convert.1, convert.3) +} + +on_false { + arg_tuple.2 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(0) + get-tuple-element.21 = bf16[2,86,104,128] + get-tuple-element(arg_tuple.2), index=0 + get-tuple-element.22 = bf16[2,84,104,128] + get-tuple-element(arg_tuple.2), index=1 + convolution.2 = bf16[3,3,128,128] + convolution(bf16[2,86,104,128] get-tuple-element.21, bf16[2,84,104,128] + get-tuple-element.22), window={size=84x104 pad=0_0x1_1}, + dim_labels=f01b_i01o->01bf + all-reduce.2 = bf16[3,3,128,128] + all-reduce(bf16[3,3,128,128] %convolution.2), + channel_id=485, replica_groups={{0,1}}, use_global_device_ids=true, + to_apply=%add.181 + convert.2 = f32[3,3,128,128] + convert(bf16[3,3,128,128] %all-reduce.2) + ROOT tuple.2 = (f32[3,3,128,128], f32[3,3,128,128]) tuple(convert.2, convert.2) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.3 = (bf16[2,54,168,128], bf16[2,52,168,128]) parameter(1) + arg_tuple.4 = (bf16[2,86,104,128], bf16[2,84,104,128]) parameter(2) + conditional = (f32[3,3,128,128], f32[3,3,128,128]) + conditional(pred.1, arg_tuple.3, arg_tuple.4), true_computation=on_true, + false_computation=on_false + get-first-index = f32[3,3,128,128] + get-tuple-element(conditional), index=0 + add.1 = f32[3,3,128,128] add(f32[3,3,128,128] get-first-index, f32[3,3,128,128] get-first-index) + ROOT result = (f32[3,3,128,128]) tuple(add.1) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional"); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 5); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 5); + + // Checks if conditional shape has changed. 
+ ASSERT_TRUE(ShapeUtil::Compatible( + conditional->shape(), ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape( + BF16, {3, 3, 128, 128})}))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Tuple(op::Add( + op::Convert(op::AllReduce(op::GetTupleElement(op::Conditional()))), + op::Convert( + op::AllReduce(op::GetTupleElement(op::Conditional()))))))); +} + +TEST_F(ConditionalCodeMotionTest, DoNotMoveWithExtraOperand) { + absl::string_view hlo_string = + R"( +HloModule RemoveIdenticalInstruction + +branch { + arg.1 = f32[10] parameter(0) + ROOT add.1 = f32[10] add(arg.1, arg.1) +} + +ENTRY main { + pred.1 = pred[] parameter(0) + tuple.1 = f32[10] parameter(1) + tuple.2 = f32[10] parameter(2) + conditional = f32[10] + conditional(pred.1, tuple.1, tuple.2), true_computation=branch, + false_computation=branch + ROOT pow.1 = f32[10] power(conditional, tuple.2) +} +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_FALSE(pass.Run(&*module).ValueOrDie()); +} + +TEST_F(ConditionalCodeMotionTest, MultipleIndependentMoveIns) { + absl::string_view hlo_string = + R"( +HloModule FromNMT + +%add.31755 (x.139: f32[], y.139: bf16[]) -> bf16[] { + %x.139 = bf16[]{:T(512)} parameter(0) + %y.139 = bf16[]{:T(512)} parameter(1) + ROOT %add.44073 = bf16[]{:T(512)} add(bf16[]{:T(512)} %x.139, bf16[]{:T(512)} %y.139) +} + +%nmt.1 { + %wide_param.3 = (bf16[1024,4096]{1,0}, bf16[18,64,1024]{2,1,0}, s32[]) parameter(0) + %get-tuple-element.16525 = bf16[1024,4096]{1,0} get-tuple-element((bf16[1024,4096]{1,0}, bf16[18,64,1024]{2,1,0}, s32[]) %wide_param.3), index=0 + %get-tuple-element.16527 = bf16[18,64,1024]{2,1,0} get-tuple-element((bf16[1024,4096]{1,0}, bf16[18,64,1024]{2,1,0}, s32[]) %wide_param.3), index=1 + %get-tuple-element.16588 = s32[] get-tuple-element((bf16[1024,4096]{1,0}, bf16[18,64,1024]{2,1,0}, s32[]) %wide_param.3), index=2 + %add.3764 = s32[] add(s32[] %get-tuple-element.16588, s32[] %get-tuple-element.16588), metadata={op_type="Sub" op_name="sub"} + %reshape.9821 = s32[1]{0} reshape(s32[] %add.3764) + %reshape.9822 = s32[] reshape(s32[1]{0} %reshape.9821) + %constant.13127 = s32[] constant(0) + %dynamic-slice.1245 = bf16[1,64,1024]{2,1,0} dynamic-slice(bf16[18,64,1024]{2,1,0} %get-tuple-element.16527, s32[] %reshape.9822, s32[] %constant.13127, s32[] %constant.13127), dynamic_slice_sizes={1,64,1024} + %reshape.9825 = bf16[64,1024]{1,0} reshape(bf16[1,64,1024]{2,1,0} %dynamic-slice.1245), metadata={op_type="GatherV2" op_name="GatherV2"} + %logistic.814 = bf16[64,1024]{1,0} logistic(bf16[64,1024]{1,0} %reshape.9825), metadata={op_type="Sigmoid" op_name="Sigmoid"} + %multiply.4890 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %reshape.9825, bf16[64,1024]{1,0} %logistic.814), metadata={op_type="Mul" op_name="mul"} + %tanh.573 = bf16[64,1024]{1,0} tanh(bf16[64,1024]{1,0} %reshape.9825), metadata={op_type="Tanh" op_name="Tanh"} + %multiply.4891 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %logistic.814, bf16[64,1024]{1,0} %tanh.573), metadata={op_type="Mul" op_name="mul_1"} + %add.3766 = bf16[64,1024]{1,0} add(bf16[64,1024]{1,0} %multiply.4890, bf16[64,1024]{1,0} %multiply.4891), metadata={op_type="AddV2" op_name="add_1"} + %multiply.4894 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %add.3766, bf16[64,1024]{1,0} %logistic.814), metadata={op_type="Mul" op_name="gradients_1/mul_grad/Mul"} + %constant.10568 = bf16[] constant(1), metadata={op_type="TanhGrad" 
op_name="gradients/Tanh_1_grad/TanhGrad"} + %broadcast.7198 = bf16[64,1024]{1,0} broadcast(bf16[] %constant.10568), dimensions={}, metadata={op_type="TanhGrad" op_name="gradients/Tanh_1_grad/TanhGrad"} + %multiply.4896 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %tanh.573, bf16[64,1024]{1,0} %tanh.573), metadata={op_type="TanhGrad" op_name="gradients/Tanh_1_grad/TanhGrad"} + %constant.10571 = bf16[] constant(1), metadata={op_type="SigmoidGrad" op_name="gradients/Sigmoid_grad/SigmoidGrad"} + %broadcast.7201 = bf16[64,1024]{1,0} broadcast(bf16[] %constant.10571), dimensions={}, metadata={op_type="SigmoidGrad" op_name="gradients/Sigmoid_grad/SigmoidGrad"} + %subtract.1702 = bf16[64,1024]{1,0} subtract(bf16[64,1024]{1,0} %broadcast.7201, bf16[64,1024]{1,0} %logistic.814), metadata={op_type="SigmoidGrad" op_name="gradients/Sigmoid_grad/SigmoidGrad"} + %multiply.4907 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %tanh.573, bf16[64,1024]{1,0} %add.3766), metadata={op_type="Mul" op_name="gradients/mul_2_grad/Mul_1"} + %multiply.4908 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %multiply.4907, bf16[64,1024]{1,0} %logistic.814), metadata={op_type="SigmoidGrad" op_name="gradients/Sigmoid_2_grad/SigmoidGrad"} + %dot.781 = bf16[64,4096]{1,0} dot(bf16[64,1024]{1,0} %multiply.4908, bf16[1024,4096]{1,0} %get-tuple-element.16525), lhs_contracting_dims={1}, rhs_contracting_dims={0}, metadata={op_type="MatMul" op_name="MatMul"} + ROOT %tuple.3200 = (bf16[64,1024]{1,0}, bf16[64,4096]{1,0}, s32[]) tuple(bf16[64,1024]{1,0} %multiply.4894, bf16[64,4096]{1,0} %dot.781, s32[] %reshape.9822) + } +ENTRY main { + pred.1 = pred[] parameter(0) + arg_tuple.3 = (bf16[1024,4096]{1,0}, bf16[18,64,1024]{2,1,0}, s32[]) parameter(1) + arg_tuple.4 = (bf16[1024,4096]{1,0}, bf16[18,64,1024]{2,1,0}, s32[]) parameter(2) + %arg.2 = s32[] parameter(3) + %conditional.3 = (bf16[64,1024]{1,0}, bf16[64,4096]{1,0}, s32[]) conditional(pred.1, arg_tuple.3, arg_tuple.4), true_computation=nmt.1, false_computation=nmt.1 + %get-tuple-element.15889 = bf16[64,1024]{1,0} get-tuple-element((bf16[64,1024]{1,0}, bf16[64,4096]{1,0}, s32[]) %conditional.3), index=0, metadata={op_type="Case" op_name="switch_case/indexed_case"} + %multiply.4596 = bf16[64,1024]{1,0} multiply(bf16[64,1024]{1,0} %get-tuple-element.15889, bf16[64,1024]{1,0} %get-tuple-element.15889), metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + %constant.10279 = bf16[] constant(0), metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + %reduce.844 = bf16[] reduce(bf16[64,1024]{1,0} %multiply.4596, bf16[] %constant.10279), dimensions={0,1}, to_apply=%add.31755, metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + %get-tuple-element.15890 = bf16[64,4096]{1,0} get-tuple-element((bf16[64,1024]{1,0}, bf16[64,4096]{1,0}, s32[]) %conditional.3), index=1, metadata={op_type="Case" op_name="switch_case/indexed_case"} + %multiply.4597 = bf16[64,4096]{1,0} multiply(bf16[64,4096]{1,0} %get-tuple-element.15890, bf16[64,4096]{1,0} %get-tuple-element.15890), metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + %constant.10280 = bf16[] constant(0), metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + %reduce.845 = bf16[] reduce(bf16[64,4096]{1,0} %multiply.4597, bf16[] %constant.10280), dimensions={0,1}, to_apply=%add.31755, metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + %multiply.4667 = bf16[] multiply(bf16[] %reduce.845, bf16[]{:T(128)} %reduce.844), metadata={op_type="L2Loss" op_name="global_norm/L2Loss"} + ROOT %tuple.3200 = (bf16[], s32[]) 
tuple(%multiply.4667, s32[] %arg.2) + } +)"; + auto module = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + ConditionalCodeMotion pass(true, true); + ASSERT_TRUE(pass.Run(&*module).ValueOrDie()); + const HloInstruction* conditional = + FindInstruction(module.get(), "conditional.3"); + CHECK(conditional != nullptr); + const HloComputation* on_true = conditional->branch_computation(0); + ASSERT_EQ(on_true->instruction_count(), 27); + const HloComputation* on_false = conditional->branch_computation(1); + ASSERT_EQ(on_false->instruction_count(), 27); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Tuple(op::GetTupleElement(op::Conditional()), + op::Parameter()))); +} + } // namespace conditional_opt } // namespace xla diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc index 323bf44dcd3..f5506b894fd 100644 --- a/tensorflow/compiler/xla/service/convolution_group_converter.cc +++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc @@ -300,7 +300,8 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) { window_dim->set_window_dilation(1); HloInstruction* new_convolution = MakeConvolveHlo(activation, filter, convolution->feature_group_count(), - window, dim_numbers, convolution->precision_config()) + /*batch_group_count=*/1, window, dim_numbers, + convolution->precision_config()) .ValueOrDie(); convolution->SetupDerivedInstruction(new_convolution); TF_CHECK_OK(computation_->ReplaceInstruction( @@ -649,7 +650,8 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { window_dim->set_window_reversal(false); window_dim->set_window_dilation(1); HloInstruction* new_convolution = - MakeConvolveHlo(activation, filter, 1, window, dim_numbers, + MakeConvolveHlo(activation, filter, /*feature_group_count=*/1, + /*batch_group_count=*/1, window, dim_numbers, convolution->precision_config()) .ValueOrDie(); convolution->SetupDerivedInstruction(new_convolution); diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index b88120d8128..e313dbe2415 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -217,10 +217,9 @@ bool IndicesToCopyForConditional(const HloDataflowAnalysis& dataflow, // Add kCopy instructions around the given kWhile instruction to eliminate any // possible live range interference of HLO values assuming a dependency-based -// ordering (HloDependencyOrdering). Copies are added conservatively. There -// likely are copies which are not strictly necessary, but they are removed -// later in the pass via RemoveUnnecessaryCopies. -// +// ordering. Copies are added conservatively. There likely are copies which are +// not strictly necessary, but they are removed later in the pass via +// RemoveUnnecessaryCopies. // // Elements (each ShapeIndex) in the loop state are considered independently. A // copy is added to each element of the loop state which is modified in the @@ -362,6 +361,19 @@ Status AddCopiesForConditional(const HloAliasAnalysis& alias_analysis, return Status::OK(); } +// Add copies for the operands of in-place operations. RemoveUnnecessaryCopies +// will remove the unnecessary copies. 
+Status AddCopiesForInPlaceOperation(const HloAliasAnalysis& alias_analysis, + HloInstruction* in_place_op, + int64 operand_number) { + VLOG(2) << "Adding copies for in-place operation " << in_place_op->name(); + HloInstruction* operand = in_place_op->mutable_operand(operand_number); + TF_ASSIGN_OR_RETURN(HloInstruction * deep_copy, + in_place_op->parent()->DeepCopyInstruction(operand)); + TF_RETURN_IF_ERROR(operand->ReplaceUseWith(in_place_op, deep_copy)); + return Status::OK(); +} + // Conservatively adds copies before root instruction of entry computation and // each aliased parameter to resolve interference of aliased input and output // buffer. We later rely on RemoveUnnecessaryCopies to drop the unnecessary @@ -509,6 +521,12 @@ class CopyRemover { // value. The map is used to construct the copy info map below. absl::flat_hash_map value_to_node; for (const HloBuffer& buffer : alias_analysis.buffers()) { + // No copies should have been inserted within fused computations, so no + // need to remove them. HloOrdering isn't compatible with HloValues inside + // fusions, so skip copy removal for them. + if (buffer.values().at(0)->defining_instruction()->IsFused()) { + continue; + } // Verify values contained in the buffer are strictly ordered. This // should always be the case after adding copies to eliminate // interference. Specifically, the addition of the control flow edges @@ -591,7 +609,7 @@ class CopyRemover { void CreateCopyMap( const HloModule& module, const absl::flat_hash_map& value_to_node) { - for (HloComputation* computation : module.computations()) { + for (HloComputation* computation : module.MakeNonfusionComputations()) { for (HloInstruction* instruction : computation->instructions()) { // Add copies with unambiguous source values to the map. Copies with // ambiguous sources are not removable. @@ -858,30 +876,13 @@ class CopyRemover { // We cannot use LiveRangeStrictlyBefore because HloValue::uses() is not // updated as copies are removed. bool LiveRangeBefore(const ValueNode& a, const ValueNode& b) { - VLOG(3) << "Checking live range of " << *a.value << " WRT " << *b.value; - bool is_live_range_before = [&] { - if (a.uses.empty()) { - VLOG(2) << "Empty uses for " << *a.value; - return ordering_.IsDefinedBefore(*a.value, *b.value); - } - for (const HloUse* use : a.uses) { - VLOG(3) << "Checking use " << *use << " against " << *b.value; - if (!ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_)) { - VLOG(2) << "Use " << *use << " is NOT before " << *b.value; - return false; - } - VLOG(3) << "Use " << *use << " is before " << *b.value; - } - return true; - }(); - if (is_live_range_before) { - VLOG(2) << " Live range of " << a.value->ToShortString() << " is before " - << b.value->ToShortString(); - } else { - VLOG(2) << " Live range of " << a.value->ToShortString() - << " is not before " << b.value->ToShortString(); + if (a.uses.empty()) { + VLOG(2) << "Empty uses for " << *a.value; + return ordering_.IsDefinedBefore(*a.value, *b.value); } - return is_live_range_before; + return absl::c_all_of(a.uses, [&](const HloUse* use) { + return ordering_.UseIsBeforeValueDefinition(*use, *b.value, dataflow_); + }); } // Returns whether 'node' is the last node in its list. 
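The AddCopiesForInPlaceOperation helper added above is wired into AddCopiesToResolveInterference in the next hunk: for every (operand, output) pair reported by HloDataflowAnalysis::GetInPlaceInputOutputPairs, the operand (currently required to sit at the top-level ShapeIndex) is deep-copied, and RemoveUnnecessaryCopies later strips whichever of these copies turn out to be unneeded. A rough before/after picture, in hypothetical HLO abbreviated from the tests below (update and idx stand in for the broadcast and constant operands):

  // Before: operand 0 of the dynamic-update-slice aliases the parameter.
  param = f32[1280,1,128] parameter(0)
  ROOT dus = f32[1280,1,128] dynamic-update-slice(param, update, idx, idx, idx)

  // After AddCopiesForInPlaceOperation: the aliased operand is deep-copied.
  param = f32[1280,1,128] parameter(0)
  copy.0 = f32[1280,1,128] copy(param)
  ROOT dus = f32[1280,1,128] dynamic-update-slice(copy.0, update, idx, idx, idx)

This is the behavior the DynamicUpdateSliceParameterShareCopy test below expects (exactly one surviving copy); when the sliced-into operand is a locally produced value such as a negate, the copy is removable and DynamicUpdateSliceNoCopy expects zero copies.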
@@ -1005,7 +1006,7 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { TF_ASSIGN_OR_RETURN(std::unique_ptr alias_analysis, HloAliasAnalysis::Run(module, can_share_buffer_)); - for (HloComputation* computation : module->MakeComputationPostOrder()) { + for (HloComputation* computation : module->MakeNonfusionComputations()) { for (HloInstruction* instruction : computation->MakeInstructionPostOrder()) { if (instruction->opcode() == HloOpcode::kWhile) { @@ -1013,6 +1014,15 @@ Status CopyInsertion::AddCopiesToResolveInterference(HloModule* module) { } else if (instruction->opcode() == HloOpcode::kConditional) { TF_RETURN_IF_ERROR( AddCopiesForConditional(*alias_analysis, instruction)); + } else { + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instruction)) { + const HloUse& operand = operand_and_output_index.first; + CHECK_EQ(operand.operand_index, ShapeIndex{}) + << "Support for non-{} shape operand not currently implemented."; + TF_RETURN_IF_ERROR(AddCopiesForInPlaceOperation( + *alias_analysis, instruction, operand.operand_number)); + } } } } diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 3ee6b200da5..78730cbdcb8 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -2530,5 +2530,250 @@ ENTRY Entry { EXPECT_EQ(CountCopies(*module), 1); } +TEST_F(CopyInsertionTest, DynamicUpdateSliceNoCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(negate, broadcast.6, constant.3, constant.3, constant.3) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 0); +} + +TEST_F(CopyInsertionTest, FusedDynamicUpdateSliceNoCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + ROOT fusion = f32[1280,1,128] fusion(negate), kind=kLoop, calls=fused_computation +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 0); +} + +TEST_F(CopyInsertionTest, DynamicUpdateSliceCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add = f32[1280,1,128] add(negate, negate) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(negate, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple = (f32[1280,1,128], f32[1280,1,128]) tuple(add, dynamic-update-slice.5) +} +)"; + 
TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, DynamicUpdateSliceParameterShareCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + param = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param, broadcast.6, constant.3, constant.3, constant.3) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, FusedDynamicUpdateSliceCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + add = f32[1280,1,128] add(negate, negate) + fusion = f32[1280,1,128] fusion(negate), kind=kLoop, calls=fused_computation + ROOT tuple = (f32[1280,1,128], f32[1280,1,128]) tuple(negate, fusion) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, ChainDynamicUpdateSliceCopy) { + absl::string_view hlo_string = R"( +HloModule Module + +ENTRY main { + state = (s32[], f32[1280,1,128]{2,1,0}) parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128]{2,1,0} broadcast(constant.1), dimensions={} + get-tuple-element.4 = f32[1280,1,128]{2,1,0} get-tuple-element(state), index=1 + get-tuple-element.3 = s32[] get-tuple-element(state), index=0 + constant.2 = s32[] constant(128) + add.5 = s32[] add(get-tuple-element.3, constant.2) + constant.3 = s32[] constant(0) + dynamic-update-slice.5 = f32[1280,1,128]{2,1,0} dynamic-update-slice(get-tuple-element.4, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.9 = f32[1280,1,128]{2,1,0} dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.85 = (s32[], s32[], s32[2]{0}, f32[1280,1,128]{2,1,0}) tuple(add.5, dynamic-update-slice.9) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, FusedDynamicUpdateSliceCopy2) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation.1 { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) +} + +fused_computation.2 { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + slice = f32[128,1,128] slice(param1), slice={[0:128], [0:1], [0:128]} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, slice, 
constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate = f32[1280,1,128] negate(param) + add = f32[1280,1,128] add(negate, negate) + fusion1 = f32[1280,1,128] fusion(negate), kind=kLoop, calls=fused_computation.1 + ROOT fusion2 = f32[1280,1,128] fusion(fusion1, negate), kind=kLoop, calls=fused_computation.2 +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + +TEST_F(CopyInsertionTest, MultiOutputFusedDynamicUpdateSliceCopy) { + // Tests multi-output fusion with two DUS outputs, requiring two copies. + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + param2 = f32[1280,1,128] parameter(2) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add.1 = f32[1280,1,128] add(param0, param0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param1, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(param2, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.1 = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add.1, dynamic-update-slice.5, dynamic-update-slice.6) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + negate1 = f32[1280,1,128] negate(param) + negate2 = f32[1280,1,128] negate(param) + fusion = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) fusion(negate0, negate1, negate2), kind=kLoop, calls=fused_computation + gte0 = f32[1280,1,128] get-tuple-element(fusion), index=0 + gte1 = f32[1280,1,128] get-tuple-element(fusion), index=1 + gte2 = f32[1280,1,128] get-tuple-element(fusion), index=2 + add0 = f32[1280,1,128] add(negate0, gte0) + add1 = f32[1280,1,128] add(negate1, gte1) + add2 = f32[1280,1,128] add(negate2, gte2) + ROOT tuple = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add0, add1, add2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 2); +} + +TEST_F(CopyInsertionTest, MultiOutputFusedDynamicUpdateSliceNoCopy) { + // Same as above, but negate1 is not used beyond fusion, so it only needs one + // copy for negate0. 
+ absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + param2 = f32[1280,1,128] parameter(2) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add.1 = f32[1280,1,128] add(param0, param0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param1, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(param2, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.1 = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add.1, dynamic-update-slice.5, dynamic-update-slice.6) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + negate1 = f32[1280,1,128] negate(param) + negate2 = f32[1280,1,128] negate(param) + fusion = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) fusion(negate0, negate1, negate2), kind=kLoop, calls=fused_computation + gte0 = f32[1280,1,128] get-tuple-element(fusion), index=0 + gte1 = f32[1280,1,128] get-tuple-element(fusion), index=1 + gte2 = f32[1280,1,128] get-tuple-element(fusion), index=2 + add0 = f32[1280,1,128] add(negate0, gte0) + add1 = f32[1280,1,128] add(gte1, gte1) + add2 = f32[1280,1,128] add(negate2, gte2) + ROOT tuple = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add0, add1, add2) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + InsertCopies(module.get()); + EXPECT_EQ(CountCopies(*module), 1); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index b622b712f82..0cc27e32749 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -1,11 +1,15 @@ # Description: # LLVM-based CPU backend for XLA. 
+load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow/compiler/xla:xla.bzl", "ORC_JIT_MEMORY_MAPPER_TARGETS") load( "//third_party/mkl:build_defs.bzl", "mkl_deps", ) + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_openmp_copts") load(":build_defs.bzl", "runtime_copts") load("//tensorflow/core/platform:build_config.bzl", "if_llvm_system_z_available") @@ -87,7 +91,7 @@ cc_library( "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor", "@com_google_absl//absl/base", "@com_google_absl//absl/memory", @@ -130,11 +134,14 @@ cc_library( ":target_machine_features", "@com_google_absl//absl/base", "@com_google_absl//absl/types:span", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", - "@llvm-project//mlir:ExecutionEngineUtils", "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LinalgOps", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:VectorOps", "//tensorflow/compiler/xla/service:copy_insertion", - "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:dump", "//tensorflow/compiler/xla/service:topk_rewriter", "//tensorflow/compiler/xla/service:map_inliner", @@ -161,6 +168,7 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:call_inliner", "//tensorflow/compiler/xla/service:cholesky_expander", + "//tensorflow/compiler/xla/service:qr_expander", "//tensorflow/compiler/xla/service:conditional_simplifier", "//tensorflow/compiler/xla/service:convolution_group_converter", "//tensorflow/compiler/xla/service:dot_decomposer", @@ -196,9 +204,8 @@ cc_library( "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@llvm-project//llvm:Core", - "@llvm-project//llvm:MC", "@llvm-project//llvm:Object", "@llvm-project//llvm:Support", "@llvm-project//llvm:Target", @@ -314,11 +321,11 @@ cc_library( "//tensorflow/compiler/xla/service:maybe_owning_device_memory", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/platform:logging", "//tensorflow/core/platform:macros", "//tensorflow/core/platform:mutex", "//tensorflow/core/platform:platform_port", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/platform:types", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:device_memory_allocator", @@ -480,7 +487,6 @@ cc_library( ":cpu_runtime", ":ir_emission_utils", ":mlir_emitter", - ":mlir_matmul_codegen_strategy", ":target_machine_features", ":tiled_dot_emitter", ":vector_support_library", @@ -502,6 +508,7 @@ cc_library( "@llvm-project//mlir:EDSC", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", + "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:StandardOps", ], ) @@ -1136,24 +1143,3 @@ cc_library( "@llvm-project//mlir:VectorToLLVM", ], ) - -cc_library( - name = "mlir_matmul_codegen_strategy", - srcs = ["mlir_matmul_codegen_strategy.cc"], - 
hdrs = ["mlir_matmul_codegen_strategy.h"], - deps = [ - "@llvm-project//llvm:Support", - "@llvm-project//mlir:Affine", - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:SCFDialect", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - "@llvm-project//mlir:VectorOps", - "@llvm-project//mlir:VectorToSCF", - ], -) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index d8bf15ecdeb..1ffafd37a27 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -42,7 +42,12 @@ limitations under the License. #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/Linalg/IR/LinalgTypes.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/Dialect/Vector/VectorOps.h" // from @llvm-project #include "mlir/InitAllDialects.h" // from @llvm-project #include "tensorflow/compiler/xla/cpu_function_runtime.h" #include "tensorflow/compiler/xla/literal.h" @@ -98,6 +103,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/logistic_expander.h" #include "tensorflow/compiler/xla/service/map_inliner.h" +#include "tensorflow/compiler/xla/service/qr_expander.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/rng_bit_generator_expander.h" #include "tensorflow/compiler/xla/service/rng_expander.h" @@ -121,6 +127,21 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/dynamic_annotations.h" +namespace { + +// We need to explicitly load all the dialects we will involved in emitting the +// IR. This is only needed because of how MLIR is bolted into XLA and does not +// make use of the MLIR infrastructure (like using a proper pass pipeline). +// Hopefully this will all go away at some point in favor of a better +// integration. +void LoadMLIRDialects(mlir::MLIRContext& context) { + context.loadDialect(); +} + +} // namespace + namespace xla { namespace cpu { using BufferInfo = cpu_function_runtime::BufferInfo; @@ -164,8 +185,6 @@ CpuCompiler::CpuCompiler() { // Initialize LLVM's MC layer for the native target. llvm::InitializeNativeTarget(); llvm::InitializeNativeTargetAsmPrinter(); - - mlir::registerAllDialects(); } namespace { @@ -263,6 +282,7 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); // Inline computations with a single call site. 
@@ -542,9 +562,11 @@ StatusOr< std::tuple, std::unique_ptr>> CpuCompiler::RunHloPassesAndBufferAssignement( std::unique_ptr module, se::StreamExecutor* executor, - se::DeviceMemoryAllocator* device_allocator) { - TF_ASSIGN_OR_RETURN( - module, RunHloPasses(std::move(module), executor, device_allocator)); + se::DeviceMemoryAllocator* device_allocator, bool optimize) { + if (optimize) { + TF_ASSIGN_OR_RETURN( + module, RunHloPasses(std::move(module), executor, device_allocator)); + } // Select an order for emitting the HLO instructions for each computation. // Using this sequence enables tighter buffer liveness analysis and reduced @@ -622,7 +644,7 @@ StatusOr> CpuCompiler::RunBackend( // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; - mlir_context.loadAllGloballyRegisteredDialects(); + LoadMLIRDialects(mlir_context); llvm::LLVMContext llvm_context; auto llvm_module = absl::make_unique("__compute_module", llvm_context); @@ -834,7 +856,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr module_group, // Compile must be thread-safe so create a new LLVM context for the module. mlir::MLIRContext mlir_context; - mlir_context.loadAllGloballyRegisteredDialects(); + LoadMLIRDialects(mlir_context); llvm::LLVMContext llvm_context; llvm::Module llvm_module("__compute_module", llvm_context); llvm_module.setDataLayout(target_machine->createDataLayout()); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index d28ccd985a3..5c056fcacaa 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -138,9 +138,10 @@ class CpuCompiler : public LLVMCompiler { StatusOr< std::tuple, std::unique_ptr>> - RunHloPassesAndBufferAssignement( - std::unique_ptr module, se::StreamExecutor* executor, - se::DeviceMemoryAllocator* device_allocator) override; + RunHloPassesAndBufferAssignement(std::unique_ptr module, + se::StreamExecutor* executor, + se::DeviceMemoryAllocator* device_allocator, + bool optimize) override; StatusOr> RunBackend( std::unique_ptr module, se::StreamExecutor* stream_exec, diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 7431e829b8e..02bc445ce9a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -210,8 +210,7 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( absl::Span buffers, absl::Span arguments) { se::Stream* stream = run_options->stream(); - ExecutionOutput result(/*on_host_shape=*/result_shape(), - /*on_device_shape=*/result_shape(), + ExecutionOutput result(/*on_device_shape=*/result_shape(), run_options->allocator(), stream->parent()->device_ordinal()); const HloInputOutputAliasConfig& input_output_alias = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 9460cc55e10..42c6c9839bf 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -95,7 +95,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, // Cost condition: not fuse (simple, expensive producers) and (consumers who // reuse operand elements). 
if (producer->opcode() != HloOpcode::kFusion && is_expensive(*producer) && - consumer->ReusesOperandElements(operand_index)) { + ReusesOperandElements(consumer, operand_index)) { VLOG(2) << "Fusion is not profitable."; return false; } @@ -132,7 +132,7 @@ bool CpuInstructionFusion::ShouldFuse(HloInstruction* consumer, fusion_node_evaluations_.emplace(consumer, FusionNodeIndexingEvaluation(consumer)); } - if (fusion_node_evaluations_.at(consumer).AverageCodeDuplicationTooHigh( + if (fusion_node_evaluations_.at(consumer).CodeDuplicationTooHigh( producer)) { return false; } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index e21ed7ad60e..bfd8e9e111a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -250,9 +250,9 @@ StatusOr CpuTransferManager::TransferBuffersFromOutfeedInternal( size); } - if (size <= 0) { - return InvalidArgument("Outfeed shape must have positive size; got %d", - size); + if (size < 0) { + return InvalidArgument( + "Outfeed shape must have non-negative size; got %d", size); } int32 size_32 = static_cast(size); diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 2b3865b4dba..ba8b74a64a5 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -24,6 +24,7 @@ limitations under the License. #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" // from @llvm-project +#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h" // from @llvm-project #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" // from @llvm-project #include "mlir/EDSC/Builders.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -36,7 +37,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/mlir_emitter.h" -#include "tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/cpu/tiled_dot_emitter.h" #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h" @@ -304,14 +304,17 @@ Status DotOpEmitter::EmitLinalgMatmul() { } } - llvm::SmallVector types( + llvm::SmallVector iteratorTypes( parallel_exprs.size(), mlir::IteratorType::Parallel); - types.push_back(mlir::IteratorType::Reduction); + iteratorTypes.push_back(mlir::IteratorType::Reduction); mlir::edsc::StructuredIndexed s_a(a), s_b(b), s_c(c); - mlir::edsc::makeGenericLinalgOp(types, {s_b(b_exprs), s_c(c_exprs)}, - {s_a(parallel_exprs)}, - mlir::edsc::ops::macRegionBuilder); + mlir::edsc::makeGenericLinalgOp( + /*iteratorTypes=*/iteratorTypes, + /*inputs=*/{s_b(b_exprs), s_c(c_exprs)}, + /*outputBuffers=*/{s_a(parallel_exprs)}, + /*initTensors=*/{}, + /*resultTensorTypes=*/{}, mlir::edsc::ops::macRegionBuilder); mlir::edsc::intrinsics::std_ret(); mlir::linalg::LinalgTilingOptions tilingOptions; @@ -319,7 +322,7 @@ Status DotOpEmitter::EmitLinalgMatmul() { int64 alignment = target_machine_features_.minimum_alignment_for_allocation( ShapeUtil::ByteSizeOf(dot_info_.result_shape)); - mlir_strategy::MatmulCodegenStrategy strategy; + mlir::linalg::CodegenStrategy strategy; strategy.tile(tilingOptions) .promote( mlir::linalg::LinalgPromotionOptions() diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 36566d6c25f..54822323137 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -449,7 +449,7 @@ Status IrEmitter::HandleInfeed(HloInstruction* instruction) { Status IrEmitter::EmitXfeedTransfer(XfeedKind kind, const Shape& shape, llvm::Value* program_buffer_address) { int64 length = ByteSizeOf(shape); - if (length <= 0 || length > std::numeric_limits::max()) { + if (length < 0 || length > std::numeric_limits::max()) { return InvalidArgument( "xfeed (infeed or outfeed) buffer length %d is outside the valid " "size range", diff --git a/tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.cc b/tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.cc deleted file mode 100644 index ea89071a967..00000000000 --- a/tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.cc +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.h" - -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Debug.h" -#include "mlir/Analysis/SliceAnalysis.h" // from @llvm-project -#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" // from @llvm-project -#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project -#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/Linalg/Transforms/Hoisting.h" // from @llvm-project -#include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project -#include "mlir/Dialect/Linalg/Utils/Utils.h" // from @llvm-project -#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project -#include "mlir/Dialect/SCF/Utils.h" // from @llvm-project -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/Dialect/Vector/EDSC/Intrinsics.h" // from @llvm-project -#include "mlir/Dialect/Vector/VectorOps.h" // from @llvm-project -#include "mlir/Dialect/Vector/VectorTransforms.h" // from @llvm-project -#include "mlir/IR/AffineExpr.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project -#include "mlir/IR/Dominance.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Value.h" // from @llvm-project -#include "mlir/IR/Visitors.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/LoopUtils.h" // from @llvm-project -#include "mlir/Transforms/Passes.h" // from @llvm-project - -// TODO(kramerb): Remove this once strategy is in mlir core. - -using namespace mlir; // NOLINT -using namespace mlir::linalg; // NOLINT - -#define DEBUG_TYPE "matmul-codegen-strategy" - -namespace xla { -namespace cpu { -namespace mlir_strategy { - -//===----------------------------------------------------------------------===// -// TODO: Cleanup and upstream these to go into core. Please ignore for now ! -//===----------------------------------------------------------------------===// -static void hoistRedundantCopies(FuncOp func) { - bool changed = true; - while (changed) { - changed = false; - func.walk([&](linalg::FillOp op) { - auto loop = op.getParentOfType(); - if (!loop) return; - - for (auto operand : op.getOperands()) - if (!loop.isDefinedOutsideOfLoop(operand)) return; - - // Hoist fill before. - op.getOperation()->moveBefore(loop); - changed = true; - }); - - func.walk([&](linalg::CopyOp op) { - auto loop = op.getParentOfType(); - if (!loop) return; - - for (auto operand : op.getOperands()) - if (!loop.isDefinedOutsideOfLoop(operand)) return; - - Value sourceView = op.getInput(0); - while (auto subViewOp = sourceView.getDefiningOp()) - sourceView = subViewOp.getViewSource(); - - // Source traces back to a block argument. 
- if (sourceView.isa()) { - op.getOperation()->moveBefore(loop); - } else { - assert(sourceView.getDefiningOp() || - sourceView.getDefiningOp() || - sourceView.getDefiningOp()); - op.getOperation()->moveAfter(loop); - } - changed = true; - }); - } -} - -/// Substitute scf.for = %lb to %ub step %step by an AffineExpr expressing: -/// `%lb + %step * new_dim` where -/// 1. the AffineExpr for %lb is either an AffineConstantExpr or an -/// AffineDimExpr depending on whether the value is constant or not. -/// 2. the AffineExpr for %step is either an AffineConstantExpr or an -/// AffineSymbolExpr depending on whether the value is constant or not. -/// -static void substitute(scf::ForOp forOp, SmallVectorImpl &exprs, - SmallVectorImpl &dims, - SmallVectorImpl &symbols) { - MLIRContext *ctx = forOp.getContext(); - auto lbConstant = forOp.lowerBound().getDefiningOp(); - AffineExpr lb = lbConstant ? getAffineConstantExpr(lbConstant.getValue(), ctx) - : getAffineDimExpr(dims.size(), ctx); - - auto stepConstant = forOp.step().getDefiningOp(); - AffineExpr step = stepConstant - ? getAffineConstantExpr(stepConstant.getValue(), ctx) - : getAffineSymbolExpr(symbols.size(), ctx); - - if (!lbConstant) dims.push_back(forOp.lowerBound()); - if (!stepConstant) symbols.push_back(forOp.step()); - exprs.push_back(lb + step * getAffineDimExpr(dims.size(), ctx)); - - auto ubConstant = forOp.upperBound().getDefiningOp(); - AffineExpr ub = ubConstant ? getAffineConstantExpr(ubConstant.getValue(), ctx) - : getAffineDimExpr(dims.size(), ctx); - if (!ubConstant) dims.push_back(forOp.upperBound()); - exprs.push_back(ub); - - dims.push_back(forOp.getInductionVar()); -} - -/// Traverse the . -static void substitute(AffineMinOp minOp, SmallVectorImpl &exprs, - SmallVectorImpl &dims, - SmallVectorImpl &symbols) { - MLIRContext *ctx = minOp.getContext(); - for (Value v : minOp.getDimOperands()) { - if (auto forOp = scf::getForInductionVarOwner(v)) { - substitute(forOp, exprs, dims, symbols); - continue; - } - if (auto parentMinOp = v.getDefiningOp()) { - substitute(parentMinOp, exprs, dims, symbols); - continue; - } - exprs.push_back(getAffineDimExpr(dims.size(), ctx)); - dims.push_back(v); - } -} - -/// Perform folding of chains of AffineMinOp. 
-struct AffineMinCanonicalizationPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(AffineMinOp minOp, - PatternRewriter &rewriter) const override; -}; - -LogicalResult AffineMinCanonicalizationPattern::matchAndRewrite( - AffineMinOp minOp, PatternRewriter &rewriter) const { - LLVM_DEBUG(llvm::dbgs() << "\nCanonicalize AffineMin: " - << *minOp.getOperation() << "\n"); - - int64_t min = std::numeric_limits::max(); - for (auto e : minOp.map().getResults()) - if (auto cstExpr = e.dyn_cast()) - min = std::min(min, cstExpr.getValue()); - if (min == std::numeric_limits::max()) return failure(); - - SmallVector exprs; - SmallVector dims, symbols; - substitute(minOp, exprs, dims, symbols); - - SmallVector operands = dims; - operands.append(symbols.begin(), symbols.end()); - - MLIRContext *ctx = minOp.getContext(); - auto map = AffineMap::get(dims.size(), symbols.size(), exprs, ctx); - LLVM_DEBUG(llvm::dbgs() << "Substitution map: " << map << "\n"); - - SmallVector modExprs; - for (unsigned idx = 0, e = map.getNumResults(); idx < e; ++idx) - modExprs.push_back(getAffineDimExpr(idx, ctx) % min); - map = AffineMap::get(map.getNumResults(), 0, modExprs, ctx).compose(map); - canonicalizeMapAndOperands(&map, &operands); - map = simplifyAffineMap(map); - - LLVM_DEBUG(llvm::dbgs() << "Post mod: " << map << "\n"; - llvm::interleaveComma(operands, llvm::dbgs())); - - if (!llvm::all_of(map.getResults(), [](AffineExpr e) { - if (auto cst = e.dyn_cast()) - return cst.getValue() == 0; - return false; - })) - return failure(); - - rewriter.replaceOpWithNewOp(minOp, min); - return success(); -} -//===----------------------------------------------------------------------===// -// END TODO -//===----------------------------------------------------------------------===// - -void MatmulCodegenStrategy::transform(FuncOp func) const { - MLIRContext *context = func.getContext(); - // Emplace patterns one at a time while also maintaining a simple chained - // state transition. - unsigned stepCount = 0; - SmallVector stage1Patterns; - auto zeroState = Identifier::get(std::to_string(stepCount), context); - auto currentState = zeroState; - for (auto &t : transformation_sequence) { - auto nextState = Identifier::get(std::to_string(++stepCount), context); - auto marker = (currentState == zeroState) - ? linalg::LinalgMarker({}, nextState) - : linalg::LinalgMarker(currentState, nextState); - stage1Patterns.emplace_back(t->buildRewritePatterns(context, marker)); - currentState = nextState; - } - - OwningRewritePatternList stage2Patterns = - linalg::getLinalgTilingCanonicalizationPatterns(context); - stage2Patterns.insert(context); - - auto stage3Transforms = [](Operation *op) { - // Some of these may be too aggressive as a stage 3 that is applied on each - // stage 1 application and may have to be split out to post staged patterns - // application (in which case they could just be passes, TBD). 
- PassManager pm(op->getContext()); - pm.addPass(createLoopInvariantCodeMotionPass()); - if (failed(pm.run(op->getParentOfType()))) - llvm_unreachable("Unexpected failure in cleanup pass pipeline."); - promoteSingleIterationLoops(cast(op)); - hoistViewAllocOps(cast(op)); - hoistRedundantVectorTransfers(cast(op)); - hoistRedundantCopies(cast(op)); - return success(); - }; - linalg::applyStagedPatterns(func, stage1Patterns, stage2Patterns, - stage3Transforms); - - //===--------------------------------------------------------------------===// - // Post staged patterns transforms - //===--------------------------------------------------------------------===// - // Programmatic controlled lowering of vector.contract only. - OwningRewritePatternList vectorContractLoweringPatterns; - vectorContractLoweringPatterns - .insert( - vector_transforms_options, context); - applyPatternsAndFoldGreedily(func, vectorContractLoweringPatterns); - - // Programmatic controlled lowering of vector.transfer only. - OwningRewritePatternList vectorToLoopsPatterns; - populateVectorToSCFConversionPatterns(vectorToLoopsPatterns, context, - vector_to_scf_options); - applyPatternsAndFoldGreedily(func, vectorToLoopsPatterns); -} - -} // namespace mlir_strategy -} // namespace cpu -} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.h b/tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.h deleted file mode 100644 index 3b11b750c47..00000000000 --- a/tensorflow/compiler/xla/service/cpu/mlir_matmul_codegen_strategy.h +++ /dev/null @@ -1,188 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef MLIR_EDGE_BENCHMARKS_STRATEGIES_MATMULCODEGENSTRATEGIES_H_ -#define MLIR_EDGE_BENCHMARKS_STRATEGIES_MATMULCODEGENSTRATEGIES_H_ - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringSwitch.h" -#include "mlir/Conversion/VectorToSCF/VectorToSCF.h" // from @llvm-project -#include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project -#include "mlir/Dialect/Vector/VectorOps.h" // from @llvm-project -#include "mlir/Dialect/Vector/VectorTransforms.h" // from @llvm-project -#include "mlir/Support/LLVM.h" // from @llvm-project - -// TODO(kramerb): Remove this once strategy is in mlir core. - -namespace xla { -namespace cpu { -namespace mlir_strategy { - -/// Abstract Transformation class applied in a sequence that also handles state -/// through markers. -struct Transformation { - virtual ~Transformation() = default; - virtual mlir::OwningRewritePatternList buildRewritePatterns( - mlir::MLIRContext *context, mlir::linalg::LinalgMarker m) = 0; - mlir::linalg::LinalgMarker marker; -}; - -/// Promotion transformation enqueues a particular stage-1 pattern for -/// `Tile`with the appropriate `options`. -// TODO: variadic LinalgOpTypes. 
-template -struct Tile : public Transformation { - explicit Tile(mlir::linalg::LinalgTilingOptions options) : options(options) {} - - mlir::OwningRewritePatternList buildRewritePatterns( - mlir::MLIRContext *context, mlir::linalg::LinalgMarker m) override { - mlir::OwningRewritePatternList tiling_patterns; - tiling_patterns.insert>( - context, options, m); - return tiling_patterns; - } - - private: - mlir::linalg::LinalgTilingOptions options; -}; - -/// Promotion transformation enqueues a particular stage-1 pattern for -/// `Promote`with the appropriate `options`. -// TODO: variadic LinalgOpTypes. -template -struct Promote : public Transformation { - explicit Promote(mlir::linalg::LinalgPromotionOptions options) - : options(options) {} - - mlir::OwningRewritePatternList buildRewritePatterns( - mlir::MLIRContext *context, mlir::linalg::LinalgMarker m) override { - mlir::OwningRewritePatternList promotion_patterns; - promotion_patterns - .insert>(context, - options, m); - return promotion_patterns; - } - - private: - mlir::linalg::LinalgPromotionOptions options; -}; - -/// Vectorization transformation enqueues a particular stage-1 pattern for -/// `LinalgVectorizationPattern` as well as copy to vector -/// transfer rewrite forwarding patterns. -// TODO: variadic LinalgOpTypes. -template -struct Vectorize : public Transformation { - mlir::OwningRewritePatternList buildRewritePatterns( - mlir::MLIRContext *context, mlir::linalg::LinalgMarker m) override { - mlir::OwningRewritePatternList vectorization_patterns; - // FillOp may interfere with forwarding patterns atm, so we bump up the - // priority of LinalgCopyVTRForwardingPattern / - // LinalgCopyVTWForwardingPattern. - vectorization_patterns - .insert>(context, - m); - vectorization_patterns.insert( - context, - /*benefit=*/2); - return vectorization_patterns; - } -}; - -/// Matmul-specific strategy object controls how a linalg.matmul is -/// progressively lowered. -/// The strategy uses a 3-level staged patterns strategy which allows ordering -/// transformations by using the Linalg `applyStagedPatterns` function, where: -/// 1. The first stage consists of the successive `tile`, `promote` and -/// `vectorize` patterns, applied sequentially. -/// 2. The second stage consists of common local canonicalization patterns -/// that are applied eagerly after each stage-1 pattern. -/// 3. the third stage consists of more global transformation, also applied -/// eagerly, after all stage-2 patterns. Such more global transformations -struct MatmulCodegenStrategy { - /// Append a pattern to add a level of tiling for `LinalgOpType` with tiling - /// `options`. - template - MatmulCodegenStrategy &tile(mlir::linalg::LinalgTilingOptions options) { - transformation_sequence.emplace_back(new Tile(options)); - return *this; - } - /// Conditionally append a pattern to add a level of tiling for `LinalgOpType` - /// with tiling `options`. - template - MatmulCodegenStrategy &tileIf(bool b, - mlir::linalg::LinalgTilingOptions options) { - return b ? tile(options) : *this; - } - /// Append a pattern to add a level of promotion for `LinalgOpType` with - /// promotion `options`. - template - MatmulCodegenStrategy &promote(mlir::linalg::LinalgPromotionOptions options) { - transformation_sequence.emplace_back(new Promote(options)); - return *this; - } - /// Conditionally append a pattern to add a level of promotion for - /// `LinalgOpType` with promotion `options`. 
- template - MatmulCodegenStrategy &promoteIf( - bool b, mlir::linalg::LinalgPromotionOptions options) { - return b ? promote(options) : *this; - return *this; - } - /// Append a pattern to rewrite `LinalgOpType` as a vector operation. - template - MatmulCodegenStrategy &vectorize() { - transformation_sequence.emplace_back(new Vectorize()); - return *this; - } - /// Conditionally append a pattern to rewrite `LinalgOpType` as a vector - /// operation. - template - MatmulCodegenStrategy &vectorizeIf(bool b) { - return b ? vectorize() : *this; - return *this; - } - /// Configure the post staged-patterns late vector transformations. - MatmulCodegenStrategy &setVectorTransformsOptions( - mlir::vector::VectorTransformsOptions options) { - vector_transforms_options = options; - return *this; - } - /// Configure the post staged-patterns late vector.transfer to scf conversion. - MatmulCodegenStrategy &setVectorTransferToSCFOptions( - mlir::VectorTransferToSCFOptions options) { - vector_to_scf_options = options; - return *this; - } - - /// Apply the transformation patterns in sequence with cleanup transformations - /// interleaved. - void transform(mlir::FuncOp func) const; - - private: - mlir::LogicalResult postPatternTransforms(mlir::Operation *func) const; - - mlir::vector::VectorTransformsOptions vector_transforms_options; - mlir::VectorTransferToSCFOptions vector_to_scf_options; - llvm::SmallVector, 4> transformation_sequence; -}; - -} // namespace mlir_strategy -} // namespace cpu -} // namespace xla - -#endif // MLIR_EDGE_BENCHMARKS_STRATEGIES_MATMULCODEGENSTRATEGIES_H_ diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc index ffbd0d68ce9..23f5a5c434f 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.cc @@ -31,9 +31,15 @@ ParallelLoopEmitter::ParallelLoopEmitter( std::vector ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, - llvm::Type* index_type) { + llvm::Type* index_type, + llvm::Value* base_index) { CHECK_NE(index_type, nullptr); + CHECK_EQ(base_index, nullptr) + << "XLA CPU implementation of" + << " ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock doesn't support" + << " base_index, but it was requested."; + CHECK(!shape_.IsTuple()); CHECK(!ShapeUtil::IsScalar(shape_)); diff --git a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h index a604e1db222..a11fd44f1ce 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_loop_emitter.h @@ -61,7 +61,8 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter { ~ParallelLoopEmitter() override = default; std::vector EmitIndexAndSetExitBasicBlock( - absl::string_view loop_name, llvm::Type* index_type) override; + absl::string_view loop_name, llvm::Type* index_type, + llvm::Value* base_index) override; private: const DynamicLoopBounds* dynamic_loop_bounds_; diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index 225102e6ae6..48f2248d2d7 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -143,7 +143,8 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( // TODO(b/27458679) Parallelize instructions which are 
skipped here. auto opcode = instruction->opcode(); if (llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || - instruction->shape().IsTuple() || opcode == HloOpcode::kRng) { + instruction->shape().IsTuple() || opcode == HloOpcode::kRng || + opcode == HloOpcode::kConstant) { return 1; } diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index e22210a61f2..5b454379876 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -191,5 +191,19 @@ TEST_F(ParallelTaskAssignmentTest, AllReduceNotParallelized) { EXPECT_FALSE(changed); } +TEST_F(ParallelTaskAssignmentTest, ConstantNotParallelized) { + constexpr char hlo_string[] = R"( + HloModule TestTaskParallel_constant + ENTRY const { + ROOT constant = f32[1234567] constant({...}) + } + )"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); + EXPECT_FALSE(changed); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD index 527071d5f31..aab9556d135 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -1,6 +1,8 @@ # Description: # Tests for LLVM-based CPU backend for XLA. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc index b2ed9bd5f31..f6925ce5c80 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc @@ -56,6 +56,34 @@ CHECK: private unnamed_addr constant [48 x i8] /*match_optimized_ir=*/false); } +TEST_F(CpuOutfeedTest, OutfeedEmpty) { + const string hlo_text = R"( +HloModule Outfeed + +ENTRY main { + const_a = f32[2,0] constant({{}, {}}) + token0 = token[] after-all() + outfeed = token[] outfeed(f32[2,0] const_a, token0) + ROOT root = () tuple() +} +)"; + + string filecheck_pattern = R"( +CHECK: private unnamed_addr constant [0 x i8] +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnVerifiedModule(hlo_text)); + + CpuAotCompilationOptions options{ + /*triple=*/kTargetTripleForHost, /*cpu_name=*/kTargetCpuForHost, + /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/false); +} + TEST_F(CpuOutfeedTest, OutfeedTokenInTuple) { const string hlo_text = R"( HloModule OutfeedTokenInTuple diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc b/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc index e36eff09009..e364c0f1b42 100644 --- a/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc +++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager_test.cc @@ -128,6 +128,22 @@ TEST_F(InfeedManagerTest, MultiThreaded) { ProcessNextBuffer(length); } +TEST_F(InfeedManagerTest, OutfeedBasic) { + TestInfeedBuffer* b = new TestInfeedBuffer(32, /*expect_shape_match=*/true); + cpu::runtime::XfeedManager* xfeed = 
cpu::runtime::GetXfeedManager(0); + xfeed->outfeed()->EnqueueBuffersAtomically({b}); + + ProcessNextOutfeedBuffer(32, ShapeUtil::MakeShape(U8, {32})); +} + +TEST_F(InfeedManagerTest, OutfeedEmpty) { + TestInfeedBuffer* b = new TestInfeedBuffer(0, /*expect_shape_match=*/true); + cpu::runtime::XfeedManager* xfeed = cpu::runtime::GetXfeedManager(0); + xfeed->outfeed()->EnqueueBuffersAtomically({b}); + + ProcessNextOutfeedBuffer(0, ShapeUtil::MakeShape(U8, {0})); +} + TEST_F(InfeedManagerTest, OutfeedWrongShape) { TestInfeedBuffer* b = new TestInfeedBuffer(32, /*expect_shape_match=*/false); cpu::runtime::XfeedManager* xfeed = cpu::runtime::GetXfeedManager(0); diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc index b95636c7039..3adde5f7d48 100644 --- a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc @@ -49,14 +49,11 @@ bool ConvSpatialDimensionIsParallel(const WindowDimension& wd, int64 lhs_size) { return false; } -/* static */ absl::optional -ParseDotGeneralFromConvolution(const HloInstruction* conv) { +/* static */ DotConvolutionDimsInfo ParseConvolutionDimsInfo( + const HloInstruction* conv) { CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); - if (conv->feature_group_count() != 1 || conv->batch_group_count() != 1) { - return absl::nullopt; - } const auto& conv_dims = conv->convolution_dimension_numbers(); - DotGeneralAsConvolutionDimsInfo dims; + DotConvolutionDimsInfo dims; dims.lhs_non_contracting_dims.push_back( {conv_dims.input_batch_dimension(), -1, conv_dims.output_batch_dimension(), -1}); @@ -98,10 +95,10 @@ ParseDotGeneralFromConvolution(const HloInstruction* conv) { // padding N - 1, high padding N - 1 and window reversal. 
dims.rhs_non_contracting_dims.push_back({lhs, rhs, output, i}); } else { - return absl::nullopt; + dims.conv_spatial_dims.push_back({lhs, rhs, output, i}); } } else { - return absl::nullopt; + dims.conv_spatial_dims.push_back({lhs, rhs, output, i}); } } @@ -110,8 +107,7 @@ ParseDotGeneralFromConvolution(const HloInstruction* conv) { StatusOr> CreateShardedConvForDotGeneralConvolution( - const HloInstruction& conv, - const DotGeneralAsConvolutionDimsInfo& dot_dnums, + const HloInstruction& conv, const DotConvolutionDimsInfo& dot_dnums, HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo) { CHECK_EQ(conv.opcode(), HloOpcode::kConvolution); const auto& conv_dnums = conv.convolution_dimension_numbers(); @@ -141,22 +137,23 @@ CreateShardedConvForDotGeneralConvolution( wd->set_padding_high(wd->size() - 1); wd->set_padding_low(wd->size() - 1); } - TF_ASSIGN_OR_RETURN(Shape sharded_conv_shape, - ShapeInference::InferConvolveShape( - sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), - /*feature_group_count=*/1, - /*batch_group_count=*/1, window, conv_dnums)); + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), + /*feature_group_count=*/conv.feature_group_count(), + /*batch_group_count=*/conv.batch_group_count(), window, conv_dnums)); *sharded_conv_shape.mutable_layout() = conv.shape().layout(); return HloInstruction::CreateConvolve( sharded_conv_shape, sharded_lhs_hlo, sharded_rhs_hlo, - /*feature_group_count=*/1, - /*batch_group_count=*/1, window, conv_dnums, conv.precision_config()); + /*feature_group_count=*/conv.feature_group_count(), + /*batch_group_count=*/conv.batch_group_count(), window, conv_dnums, + conv.precision_config()); } -DotGeneralAsConvolutionDimsInfo ParseDotGeneralFromDot( - const HloInstruction* dot) { +DotConvolutionDimsInfo ParseDotGeneralFromDot(const HloInstruction* dot) { const auto& dot_dim_numbs = dot->dot_dimension_numbers(); - dot_as_convolution_util::DotGeneralAsConvolutionDimsInfo dnums; + dot_as_convolution_util::DotConvolutionDimsInfo dnums; for (int64 i = 0; i < dot_dim_numbs.lhs_batch_dimensions().size(); ++i) { dnums.batch_dims.emplace_back(); dnums.batch_dims.back().lhs = dot_dim_numbs.lhs_batch_dimensions(i); diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.h b/tensorflow/compiler/xla/service/dot_as_convolution_util.h index 81914b193a3..16a542208d2 100644 --- a/tensorflow/compiler/xla/service/dot_as_convolution_util.h +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.h @@ -25,8 +25,9 @@ limitations under the License. namespace xla { namespace dot_as_convolution_util { -// Describes the dimensions of a convolution that can be interpreted as a dot. -struct DotGeneralAsConvolutionDimsInfo { +// Describes the dimensions of a convolution that can be interpreted as a dot +// or a normal convolution. +struct DotConvolutionDimsInfo { // The dimension numbers for the operands and output corresponding to a // logical dimension (e.g., batch, contracting, non-contracting). If an // operand or the output doesn't have the logical dimension, it is set to @@ -43,23 +44,22 @@ struct DotGeneralAsConvolutionDimsInfo { std::vector contracting_dims; std::vector lhs_non_contracting_dims; std::vector rhs_non_contracting_dims; + std::vector conv_spatial_dims; }; -// Parses a convolution and returns a DotGeneralAsConvolutionDimsInfo if it can -// be interpreted as a dot, or absl::nullopt otherwise. 
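As a usage sketch of the reworked API: ParseConvolutionDimsInfo now always returns a DotConvolutionDimsInfo, and callers that previously branched on absl::nullopt instead check whether any spatial dimensions were left unclassified. This is illustrative only; `conv` is assumed to be an HloInstruction* with opcode kConvolution.

  // Sketch: classify a convolution as dot-like or genuinely spatial.
  const dot_as_convolution_util::DotConvolutionDimsInfo dims =
      dot_as_convolution_util::ParseConvolutionDimsInfo(conv);
  if (dims.conv_spatial_dims.empty()) {
    // Every dimension mapped to a batch/contracting/non-contracting role,
    // so the convolution can be handled like a dot_general.
  } else {
    // Some dimensions remain spatial; treat it as a real convolution.
  }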
-absl::optional ParseDotGeneralFromConvolution( - const HloInstruction* conv); +// Parses a convolution and returns a DotGeneralAsConvolutionDimsInfo. If it can +// be interpreted as a dot, there is no conv_spatial_dims. +DotConvolutionDimsInfo ParseConvolutionDimsInfo(const HloInstruction* conv); // Creates sharded convolution instruction that can be interpreted as a dot. // This is a utility for per-op partitioners. // - 'conv' is the original convolution instruction. -// - 'dot_dnums' is the result of ParseDotGeneralFromConvolution() for 'conv'. +// - 'dot_dnums' is the result of ParseDotConvolutionDimsInfo() for 'conv'. // - 'sharded_lhs_hlo' and 'sharded_rhs_hlo' are sharded inputs for the result // convolution instruction. StatusOr> CreateShardedConvForDotGeneralConvolution( - const HloInstruction& conv, - const DotGeneralAsConvolutionDimsInfo& dot_dnums, + const HloInstruction& conv, const DotConvolutionDimsInfo& dot_dnums, HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo); // Check if a spatial dim is parallel batch dimension. @@ -68,10 +68,9 @@ CreateShardedConvForDotGeneralConvolution( // dilation B. bool ConvSpatialDimensionIsParallel(const WindowDimension& wd, int64 lhs_size); -// Returns a DotGeneralAsConvolutionDimsInfo from a kDot instruction, where all +// Returns a DotConvolutionDimsInfo from a kDot instruction, where all // the spatial_dim values are set to -1. -DotGeneralAsConvolutionDimsInfo ParseDotGeneralFromDot( - const HloInstruction* dot); +DotConvolutionDimsInfo ParseDotGeneralFromDot(const HloInstruction* dot); } // namespace dot_as_convolution_util } // namespace xla diff --git a/tensorflow/compiler/xla/service/dynamic_padder.cc b/tensorflow/compiler/xla/service/dynamic_padder.cc index 9b4d24bbbe9..e728cd75caf 100644 --- a/tensorflow/compiler/xla/service/dynamic_padder.cc +++ b/tensorflow/compiler/xla/service/dynamic_padder.cc @@ -39,12 +39,17 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/platform/errors.h" namespace xla { namespace { +auto* dynamic_padding_gauge = tensorflow::monitoring::Gauge::New( + "/tensorflow/core/use_dynamic_padding_gauge", + "Tracks if dynamic padder is used."); + // ChooseIdentityValue looks at the instruction's operand, returns a // identity value which, when padded, doesn't change the result of the // instruction. 
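The template arguments of the gauge defined above are likewise stripped in this rendering. A minimal sketch of the tensorflow::monitoring pattern being used, assuming a bool-valued, label-free gauge (which is what the Set(true) calls below imply):

  #include "tensorflow/core/lib/monitoring/gauge.h"

  // Bool gauge with zero labels; GetCell() returns its single cell.
  auto* dynamic_padding_gauge = tensorflow::monitoring::Gauge<bool, 0>::New(
      "/tensorflow/core/use_dynamic_padding_gauge",
      "Tracks if dynamic padder is used.");

  void MarkDynamicPaddingUsed() {
    dynamic_padding_gauge->GetCell()->Set(true);
  }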
@@ -179,6 +184,22 @@ StatusOr ReplaceSetSize(HloInstruction* instr) { return true; } +StatusOr ReplaceSetBound(HloInstruction* instr) { + if (instr->opcode() != HloOpcode::kCustomCall || + instr->custom_call_target() != "SetBound") { + return false; + } + + TF_RET_CHECK(Shape::Equal().IgnoreDynamicDimension()( + instr->shape(), instr->operand(0)->shape())) + << "instr->shape() " << instr->shape().ToString() << " , " + << "instruction operand shape " << instr->operand(0)->shape(); + HloInstruction* operand = instr->mutable_operand(0); + + TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(operand)); + return true; +} + bool ShouldSkipPadOnOperand(const HloInstruction* inst, int64 operand_num, int64 dimension) { if ((inst->opcode() == HloOpcode::kReduceWindow || @@ -1335,6 +1356,7 @@ StatusOr DynamicPadder::Run(HloModule* module) { operand, input_dim, operand_dynamic_size, identity_value); TF_RETURN_IF_ERROR(inst->ReplaceOperandWith(operand_num, padded)); operand = inst->mutable_operand(operand_num); + dynamic_padding_gauge->GetCell()->Set(true); changed = true; } } @@ -1370,7 +1392,10 @@ StatusOr DynamicPadder::Run(HloModule* module) { for (auto* computation : module->computations()) { for (auto instruction : computation->MakeInstructionPostOrder()) { TF_ASSIGN_OR_RETURN(bool replaced_set_size, ReplaceSetSize(instruction)); + TF_ASSIGN_OR_RETURN(bool replaced_set_bound, + ReplaceSetBound(instruction)); changed = changed || replaced_set_size; + changed = changed || replaced_set_bound; } } @@ -1378,6 +1403,7 @@ StatusOr DynamicPadder::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(changed, dce.Run(module)); VLOG(2) << "Post DynamicPadder HLO:"; XLA_VLOG_LINES(2, module->ToString()); + dynamic_padding_gauge->GetCell()->Set(changed); return changed; } diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index d5cf2ee9ac0..e9a3c6b3018 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -59,11 +59,11 @@ void ExecutionInput::SetUnownedBuffer(const ShapeIndex& index, unowned_indices_.insert(index); } -xla::StatusOr ExecutionInput::ToShapedBuffer( +StatusOr ExecutionInput::ToShapedBuffer( se::DeviceMemoryAllocator* allocator, int device_ordinal) const { const Shape& input_shape = shape(); - xla::ShapedBuffer shaped_buffer(input_shape, input_shape, - allocator->platform(), device_ordinal); + ShapedBuffer shaped_buffer(input_shape, allocator->platform(), + device_ordinal); for (const auto& index_buffer : Buffers()) { const tensorflow::se::OwningDeviceMemory* mem = index_buffer.second.AsOwningDeviceMemory(); @@ -93,8 +93,7 @@ StatusOr Executable::ExecuteOnStream( static ExecutionInput MakeMaybeOwningDeviceMemoryTree( const ShapedBuffer& shaped_buffer) { - ExecutionInput result(shaped_buffer.on_device_shape(), - shaped_buffer.on_host_shape()); + ExecutionInput result(shaped_buffer.on_device_shape()); shaped_buffer.buffers().ForEachElement( [&](const ShapeIndex& index, const se::DeviceMemoryBase& mem) { result.SetBuffer(index, MaybeOwningDeviceMemory(mem)); diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 2e3ddedfb8c..1e1b3436a3c 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -60,15 +60,24 @@ namespace xla { // with their indices absent from unowned_indices_. 
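The executable.cc changes above rely on the new single-shape constructors: the host shape is no longer passed in but derived internally via ShapeUtil::DeviceShapeToHostShape. A hedged usage sketch (variable names are illustrative, not from the patch):

  // Sketch: ExecutionInput built from a device shape only; the host shape is
  // computed inside the constructor.
  ExecutionInput input(device_shape);
  input.SetBuffer(/*index=*/{}, MaybeOwningDeviceMemory(device_memory));

  // Sketch: ExecutionOutput likewise takes just the on-device shape, as in the
  // cpu_executable.cc change earlier in this patch.
  ExecutionOutput output(/*on_device_shape=*/result_shape, allocator,
                         device_ordinal);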
class ExecutionInput { public: - explicit ExecutionInput(xla::Shape shape, xla::Shape host_shape) + explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) { + SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); + } + // TODO(b/170310047): remove this overload. + ExecutionInput(xla::Shape shape, xla::Shape host_shape) : buffers_(std::move(shape)) { - SetHostShape(std::move(host_shape)); + SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); } - explicit ExecutionInput(ShapeTree buffers, - xla::Shape host_shape) + explicit ExecutionInput(ShapeTree buffers) : buffers_(std::move(buffers)) { - SetHostShape(std::move(host_shape)); + SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); + } + // TODO(b/170310047): remove this overload. + ExecutionInput(ShapeTree buffers, + xla::Shape host_shape) + : buffers_(std::move(buffers)) { + SetHostShape(ShapeUtil::DeviceShapeToHostShape(buffers_.shape())); } ExecutionInput(ExecutionInput&&) = default; @@ -144,10 +153,13 @@ class ExecutionOutput { std::vector to_be_released) : result_(std::move(result)), to_be_released_(std::move(to_be_released)) {} + // TODO(b/170310047): remove this overload. ExecutionOutput(Shape on_host_shape, Shape on_device_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal) - : result_(std::move(on_host_shape), std::move(on_device_shape), allocator, - device_ordinal) {} + : result_(std::move(on_device_shape), allocator, device_ordinal) {} + ExecutionOutput(Shape on_device_shape, se::DeviceMemoryAllocator* allocator, + int device_ordinal) + : result_(std::move(on_device_shape), allocator, device_ordinal) {} ExecutionOutput(ExecutionOutput&&) = default; ExecutionOutput& operator=(ExecutionOutput&&) = default; diff --git a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc index 75d39298aa3..17d3fb2b3d6 100644 --- a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc +++ b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.cc @@ -25,35 +25,36 @@ limitations under the License. namespace xla { FusionNodeIndexingEvaluation::FusionNodeIndexingEvaluation( - const HloInstruction* fusion) + const HloInstruction* fusion, int64 root_usage_count) : fusion_(fusion) { - total_emitted_instructions_ = 0; HloInstruction* root = fusion->fused_expression_root(); indexing_users_[root].insert(fusion); - index_usage_count_[fusion] = 1; + index_usage_count_[fusion] = root_usage_count; RecomputeCache(); } -bool FusionNodeIndexingEvaluation::AverageCodeDuplicationTooHigh( - const HloInstruction* producer) const { - // This constant is arbitrarily chosen. Essentially we don't want to have too - // much code duplication, because it slows down the compilation time. There is - // a tradeoff between compilation time and runtime here. - const int64 kAllowedCodeDuplication = 15; +// This constant is arbitrarily chosen. Essentially we don't want to have too +// much code duplication, because it slows down the compilation time. There is +// a tradeoff between compilation time and runtime here. +const int64 FusionNodeIndexingEvaluation::kAllowedCodeDuplication = 15; - // index_usage_count_ contains an entry for each instruction in the fusion - // computation (except parameter instructions), plus an entry for the 'fusion' - // instruction. 
So the size of this map is already one bigger than the number - // of instructions in the fusion node that are emitted, thus accounting for - // the number of instructions after 'producer' is fused. - return EvaluateTotalEmittedInstructions(producer) / - index_usage_count_.size() > - kAllowedCodeDuplication; +bool FusionNodeIndexingEvaluation::CodeDuplicationTooHigh( + const HloInstruction* producer) const { + return EvaluateEmittedInstructions(producer) > kAllowedCodeDuplication; } -int64 FusionNodeIndexingEvaluation::EvaluateTotalEmittedInstructions( +bool FusionNodeIndexingEvaluation::MaxCodeDuplicationTooHigh() const { + for (const auto& entry : index_usage_count_) { + if (entry.second > kAllowedCodeDuplication) { + return true; + } + } + return false; +} + +int64 FusionNodeIndexingEvaluation::EvaluateEmittedInstructions( const HloInstruction* producer) const { - int64 total = total_emitted_instructions_; + int64 total = 0; for (const auto* user : indexing_users_.at(producer)) { total += index_usage_count_.at(user); } @@ -96,19 +97,9 @@ void FusionNodeIndexingEvaluation::UpdateIndexUsageCount( const HloInstruction* instruction) { int64 total = 0; for (const auto* user : indexing_users_[instruction]) { - int64 weight = 1; - // Concatenate is special: the index differs for each operand, so - // in the worst case we have to deal with as many index values as - // the number of operands of Concatenate. By considering the worst - // case, we are more conservative than necessary regarding - // counting the index usage. - if (user->opcode() == HloOpcode::kConcatenate) { - weight = user->operand_count(); - } - total += index_usage_count_.at(user) * weight; + total += index_usage_count_.at(user); } CHECK(index_usage_count_.emplace(instruction, total).second); - total_emitted_instructions_ += total; } void FusionNodeIndexingEvaluation::UpdateIndexingUsersOfOperands( diff --git a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h index 9630986d188..abe154a5149 100644 --- a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h +++ b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h @@ -24,19 +24,22 @@ limitations under the License. namespace xla { class FusionNodeIndexingEvaluation { public: - explicit FusionNodeIndexingEvaluation(const HloInstruction* fusion); + explicit FusionNodeIndexingEvaluation(const HloInstruction* fusion, + int64 root_usage_count = 1); - // Evaluate the average number of times an instruction is emitted inside the - // fusion node, if 'producer' is fused into 'fusion_'. If this average - // duplication is "too high" (some arbitrary chosen constant), returns - // true. - bool AverageCodeDuplicationTooHigh(const HloInstruction* producer) const; + // Evaluate the number of times 'producer' would be emitted if it is fused + // into 'fusion_'. If the duplication is "too high" (some arbitrary chosen + // constant), returns true. + bool CodeDuplicationTooHigh(const HloInstruction* producer) const; - // Evaluate the total number of times an instruction is emitted inside the - // fusion node, if 'producer' is fused into 'fusion_'. An instruction may be - // emitted several times, once for each different index value with which it is - // indexed. - int64 EvaluateTotalEmittedInstructions(const HloInstruction* producer) const; + // Evaluate the maximum code duplication inside the fusion node. 
If the + // maximum code duplication is "too high" (some arbitrary chosen constant), + // returns true. + bool MaxCodeDuplicationTooHigh() const; + + // Evaluate the number of times 'producer' would be emitted if it is fused + // into 'fusion_'. + int64 EvaluateEmittedInstructions(const HloInstruction* producer) const; // Update the evaluation cache after having fused 'producer' into 'fusion_'. // 'producer' is the cloned instruction which is now part of the fusion @@ -56,6 +59,8 @@ class FusionNodeIndexingEvaluation { HloInstruction* fusion_operand); private: + static const int64 kAllowedCodeDuplication; + // Computes the 'indexing_users_' and 'index_usage_count_' maps based on the // current instructions inside the fusion node. Also updates // 'total_emitted_instructions_' accordingly. @@ -84,9 +89,6 @@ class FusionNodeIndexingEvaluation { // The fusion instruction. const HloInstruction* fusion_; - - // The total number of emitted instructions. - int64 total_emitted_instructions_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc index b20f52d2d62..b00abdc9abf 100644 --- a/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc +++ b/tensorflow/compiler/xla/service/fusion_node_indexing_evaluation_test.cc @@ -29,7 +29,7 @@ using FusionNodeIndexingEvaluationTest = HloTestBase; // Subclass of InstructionFusion exposing the protected methods Fuse and // FuseInstruction for testing. Also adds the FusionNodeIndexingEvaluation to -// track the average code duplication due to indexing HloInstructions with +// track the code duplication due to indexing HloInstructions with // different index values. class InstructionFusionForTesting : public InstructionFusion { public: @@ -61,8 +61,8 @@ class InstructionFusionForTesting : public InstructionFusion { return InstructionFusion::Fuse(producer, consumer); } - int64 EvaluateTotalEmittedInstructions(const HloInstruction* producer, - const HloInstruction* consumer) { + int64 EvaluateEmittedInstructions(const HloInstruction* producer, + const HloInstruction* consumer) { if (consumer->opcode() != HloOpcode::kFusion) { return 0; } @@ -71,8 +71,8 @@ class InstructionFusionForTesting : public InstructionFusion { fusion_node_evaluations_.emplace(consumer, FusionNodeIndexingEvaluation(consumer)); } - return fusion_node_evaluations_.at(consumer) - .EvaluateTotalEmittedInstructions(producer); + return fusion_node_evaluations_.at(consumer).EvaluateEmittedInstructions( + producer); } private: @@ -109,8 +109,7 @@ TEST_F(FusionNodeIndexingEvaluationTest, FuseThreeInstructions) { HloInstruction* slice1 = sub->mutable_operand(0); HloInstruction* slice2 = sub->mutable_operand(1); auto fusion = instruction_fusion.Fuse(slice1, sub); - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(slice2, fusion), - 3); + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice2, fusion), 1); instruction_fusion.Fuse(slice2, fusion); } @@ -151,37 +150,31 @@ TEST_F(FusionNodeIndexingEvaluationTest, ExponentialDuplicationPattern) { HloInstruction* slice2_1 = add2->mutable_operand(1); auto fusion = instruction_fusion.Fuse(slice2_0, add2); // So far we have fused add2 and slice2.0. So when we also fuse slice2.1, we - // expect to emit 3 instructions. - EXPECT_EQ( - instruction_fusion.EvaluateTotalEmittedInstructions(slice2_1, fusion), 3); + // expect to emit it 1 time. 
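Editor's note on the FusionNodeIndexingEvaluation hunks above: the averaged-duplication heuristic is replaced by a per-producer count. EvaluateEmittedInstructions sums index_usage_count over the fused instructions that index the producer, and CodeDuplicationTooHigh compares that sum against the fixed kAllowedCodeDuplication cap. The following standalone sketch models only that bookkeeping; the string-keyed maps and the hard-coded graph are illustrative stand-ins, not the XLA HloInstruction types.

// Minimal model of the per-producer duplication count used above.
// "Indexing users" of a producer are the already-fused instructions that
// would index into it; each carries an index_usage_count.
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>

constexpr int64_t kAllowedCodeDuplication = 15;  // same arbitrary cap as above

// For each fused instruction, how many distinct index values it is emitted for.
std::map<std::string, int64_t> index_usage_count;
// For each not-yet-fused producer, which fused instructions index into it.
std::map<std::string, std::set<std::string>> indexing_users;

int64_t EvaluateEmittedInstructions(const std::string& producer) {
  int64_t total = 0;
  for (const std::string& user : indexing_users.at(producer)) {
    total += index_usage_count.at(user);  // one emission per index value of the user
  }
  return total;
}

bool CodeDuplicationTooHigh(const std::string& producer) {
  return EvaluateEmittedInstructions(producer) > kAllowedCodeDuplication;
}

int main() {
  // Fusion root is indexed once; two already-fused slices reuse that index.
  index_usage_count = {{"fusion", 1}, {"add2", 1}, {"slice2.0", 1}, {"slice2.1", 1}};
  // 'add1' feeds both slices, so fusing it duplicates it once per slice index.
  indexing_users["add1"] = {"slice2.0", "slice2.1"};
  std::cout << "add1 emitted " << EvaluateEmittedInstructions("add1")
            << " times; too high: " << CodeDuplicationTooHigh("add1") << "\n";
}

This reproduces the expectation in the updated test: add1 is emitted twice, which is below the cap.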
+ EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice2_1, fusion), + 1); instruction_fusion.Fuse(slice2_1, fusion); HloInstruction* add1 = fusion->mutable_operand(0); EXPECT_EQ(add1->opcode(), HloOpcode::kAdd); - // If we fuse add1 into 'fusion', it needs to be emitted twice, adding 2 to - // the sum. - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(add1, fusion), - 5); + // If we fuse add1 into 'fusion', it needs to be emitted twice. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(add1, fusion), 2); instruction_fusion.Fuse(add1, fusion); HloInstruction* slice1_0 = fusion->mutable_operand(0); EXPECT_EQ(slice1_0->opcode(), HloOpcode::kSlice); - // If we fuse slice1.0 into 'fusion', it needs to be emitted twice, adding 2 - // to the sum. - EXPECT_EQ( - instruction_fusion.EvaluateTotalEmittedInstructions(slice1_0, fusion), 7); + // If we fuse slice1.0 into 'fusion', it needs to be emitted twice. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice1_0, fusion), + 2); instruction_fusion.Fuse(slice1_0, fusion); HloInstruction* slice1_1 = fusion->mutable_operand(0); EXPECT_EQ(slice1_1->opcode(), HloOpcode::kSlice); - // If we fuse slice1.1 into 'fusion', it needs to be emitted twice, adding 2 - // to the sum. - EXPECT_EQ( - instruction_fusion.EvaluateTotalEmittedInstructions(slice1_1, fusion), 9); + // If we fuse slice1.1 into 'fusion', it needs to be emitted twice. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(slice1_1, fusion), + 2); instruction_fusion.Fuse(slice1_1, fusion); HloInstruction* add0 = fusion->mutable_operand(0); EXPECT_EQ(add0->opcode(), HloOpcode::kAdd); - // If we fuse add0 into 'fusion', it needs to be emitted twice, adding 4 to - // the sum. - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(add0, fusion), - 13); + // If we fuse add0 into 'fusion', it needs to be emitted four times. + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(add0, fusion), 4); instruction_fusion.Fuse(add0, fusion); } @@ -212,10 +205,9 @@ ENTRY entry_computation { HloInstruction* add0 = fusion->mutable_operand(0); EXPECT_EQ(add0->opcode(), HloOpcode::kAdd); // Here, the cache for the fusion node needs to be recomputed. Make sure we - // still get the same evaluation as before when we incrementally built the + // still get the same evaluation as before when we incrementally build the // cache. - EXPECT_EQ(instruction_fusion.EvaluateTotalEmittedInstructions(add0, fusion), - 13); + EXPECT_EQ(instruction_fusion.EvaluateEmittedInstructions(add0, fusion), 4); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index c09757fe1af..d2febb5fb73 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -69,13 +69,8 @@ void GenericTransferManager::TransferLiteralFromDevice( TF_RET_CHECK(stream->parent()->device_ordinal() == device_buffer.device_ordinal()); - // The on-host and on-device shape should always be the same for the generic - // transfer manager. 
- TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(), - device_buffer.on_host_shape())); - TF_RETURN_IF_ERROR(ShapeUtil::ForEachSubshapeWithStatus( - device_buffer.on_host_shape(), + device_buffer.on_device_shape(), [&](const Shape& subshape, const ShapeIndex& index) -> Status { if (subshape.IsArray()) { stream->ThenMemcpy( @@ -103,20 +98,15 @@ Status GenericTransferManager::TransferLiteralToDeviceAsync( << ShapeUtil::HumanString(shape) << "; device buffer: " << device_buffer; - // The on-host and on-device shape should always be the same for the generic - // transfer manager. - TF_RET_CHECK(ShapeUtil::Equal(device_buffer.on_device_shape(), - device_buffer.on_host_shape())); - TF_RET_CHECK( - ShapeUtil::Compatible(literal.shape(), device_buffer.on_host_shape())); + ShapeUtil::Compatible(literal.shape(), device_buffer.on_device_shape())); TF_RET_CHECK(stream->parent()->device_ordinal() == device_buffer.device_ordinal()); TF_RETURN_IF_ERROR(WriteTupleIndexTablesAsync(stream, device_buffer)); return ShapeUtil::ForEachSubshapeWithStatus( - device_buffer.on_host_shape(), + device_buffer.on_device_shape(), [&](const Shape& device_subshape, const ShapeIndex& index) -> Status { se::DeviceMemoryBase device_memory = device_buffer.buffer(index); if (device_subshape.IsArray()) { diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index d1d0827981e..9463454ae0b 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1,9 +1,10 @@ # Description: # GPU-specific components in XLA service implementation. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow/core/platform:build_config.bzl", - "tf_proto_library_cc", + "tf_proto_library", ) load( "//tensorflow/core/platform:build_config_root.bzl", @@ -26,6 +27,14 @@ load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "if_nccl") load("//third_party/mlir:tblgen.bzl", "gentbl") @@ -50,7 +59,7 @@ filegroup( ]), ) -tf_proto_library_cc( +tf_proto_library( name = "backend_configs", srcs = ["backend_configs.proto"], cc_api_version = 2, @@ -207,7 +216,9 @@ cc_library( deps = [ ":backend_configs_cc", ":buffer_allocations", + ":cudnn_batchnorm_runner", ":gpu_constants", + ":gpu_conv_runner", ":gpu_executable", ":ir_emission_utils", ":nccl_all_reduce_thunk", @@ -254,6 +265,8 @@ cc_library( ":target_util", ":thunk", ":thunk_emitter", + "//tensorflow/compiler/mlir:name_utils", + "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:lhlo", "//tensorflow/compiler/mlir/xla:hlo_utils", "//tensorflow/compiler/mlir/xla:mhlo_to_lhlo_with_xla", @@ -264,6 +277,7 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:union_find", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", @@ -311,6 +325,7 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", + "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library", 
"//tensorflow/compiler/xla/service/llvm_ir:llvm_loop", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter", @@ -362,7 +377,7 @@ cc_library( "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", @@ -381,7 +396,7 @@ cc_library( "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:lib", "//tensorflow/core:ptr_util", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", ], ) @@ -397,7 +412,7 @@ cc_library( "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -447,7 +462,8 @@ tf_cuda_library( "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", + "//tensorflow/compiler/xla:xla_data_proto_cc", ] + if_cuda([ "//tensorflow/stream_executor/cuda:cuda_activation", "//tensorflow/stream_executor/cuda:cuda_gpu_executor", @@ -586,7 +602,7 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:buffer_assignment_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/lib:scoped_annotation", "//tensorflow/stream_executor", @@ -632,7 +648,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_description", "@com_google_absl//absl/algorithm:container", "@llvm-project//llvm:Core", @@ -676,7 +692,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor:device_memory", @@ -713,7 +729,7 @@ cc_library( "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", "//tensorflow/stream_executor:device_memory_allocator", ] + if_cuda_is_configured([ @@ -738,7 +754,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", + "//tensorflow/stream_executor:dnn", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], @@ -760,7 +777,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", 
"//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], @@ -815,7 +832,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", ] + if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", @@ -838,7 +855,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor:device_memory_allocator", ]), @@ -896,6 +913,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_reachability", + "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", @@ -1033,6 +1051,24 @@ cc_library( ], ) +tf_cc_test( + name = "gpu_conv_padding_legalization_test", + srcs = ["gpu_conv_padding_legalization_test.cc"], + tags = tf_cuda_tests_tags(), + deps = [ + ":gpu_conv_padding_legalization", + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:test", + ], +) + cc_library( name = "cudnn_pad_for_convolutions", srcs = ["cudnn_pad_for_convolutions.cc"], @@ -1122,7 +1158,7 @@ cc_library( "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@llvm-project//llvm:Core", ], @@ -1151,7 +1187,8 @@ cc_library( ":gpu_layout_assignment", ":gpu_sanitize_constant_names", ":gpu_scatter_expander", - ":horizontal_fusion", + ":horizontal_input_fusion", + ":horizontal_loop_fusion", ":instruction_fusion", ":ir_emission_utils", ":ir_emitter", @@ -1201,6 +1238,8 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:llvm_compiler", "//tensorflow/compiler/xla/service:logistic_expander", + "//tensorflow/compiler/xla/service:qr_expander", + "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:rng_bit_generator_expander", "//tensorflow/compiler/xla/service:rng_expander", "//tensorflow/compiler/xla/service:slice_sinker", @@ -1217,8 +1256,8 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:regexp_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:regexp", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:stream_executor_headers", "@com_google_absl//absl/memory", @@ -1281,7 +1320,7 @@ cc_library( 
"//tensorflow/compiler/xla/service:tuple_simplifier", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", - "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core/platform:cuda_libdevice_path", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/lib:traceme", @@ -1367,7 +1406,7 @@ cc_library( ":xfeed_queue", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:types", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", ], @@ -1405,7 +1444,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:layout_assignment", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -1492,6 +1531,8 @@ cc_library( hdrs = ["stream_executor_util.h"], copts = tf_copts(), deps = [ + ":ir_emission_utils", + ":launch_dimensions", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", @@ -1499,11 +1540,11 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_module_config", - "//tensorflow/core:cuda_libdevice_path", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:regexp_internal", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:cuda_libdevice_path", + "//tensorflow/core/platform:regexp", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:kernel_spec", "//tensorflow/stream_executor/gpu:gpu_asm_opts", @@ -1526,7 +1567,7 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo_module_config", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:stream_executor_headers", "//tensorflow/stream_executor/gpu:asm_compiler", ]), @@ -1585,7 +1626,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:pattern_matcher", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -1661,7 +1702,7 @@ tf_cc_test( ], ) -tf_proto_library_cc( +tf_proto_library( name = "gpu_autotuning_proto", srcs = ["gpu_autotuning.proto"], cc_api_version = 2, @@ -1679,7 +1720,7 @@ cc_library( deps = [ ":gpu_autotuning_proto_cc", "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/protobuf:autotuning_proto_cc", "@com_google_absl//absl/container:flat_hash_map", ], @@ -1726,10 +1767,11 @@ tf_cc_test( ) cc_library( - name = "horizontal_fusion", - srcs = ["horizontal_fusion.cc"], - hdrs = ["horizontal_fusion.h"], + name = "horizontal_loop_fusion", + srcs = ["horizontal_loop_fusion.cc"], + hdrs = ["horizontal_loop_fusion.h"], deps = [ + ":gpu_fusible", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_creation_utils", @@ -1742,11 +1784,11 @@ cc_library( ) tf_cc_test( - name = "horizontal_fusion_test", - srcs = 
["horizontal_fusion_test.cc"], + name = "horizontal_loop_fusion_test", + srcs = ["horizontal_loop_fusion_test.cc"], deps = [ ":fusion_merger", - ":horizontal_fusion", + ":horizontal_loop_fusion", ":instruction_fusion", ":multi_output_fusion", "//tensorflow/compiler/jit:xla_gpu_jit", @@ -1766,6 +1808,45 @@ tf_cc_test( ], ) +cc_library( + name = "horizontal_input_fusion", + srcs = ["horizontal_input_fusion.cc"], + hdrs = ["horizontal_input_fusion.h"], + deps = [ + ":gpu_fusible", + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_creation_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "horizontal_input_fusion_test", + srcs = ["horizontal_input_fusion_test.cc"], + tags = tf_cuda_tests_tags(), + deps = [ + ":horizontal_input_fusion", + ":multi_output_fusion", + "//tensorflow/compiler/jit:xla_gpu_jit", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service/gpu/tests:gpu_codegen_test", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "reduction_degenerate_dim_remover", srcs = ["reduction_degenerate_dim_remover.cc"], @@ -1891,6 +1972,7 @@ cc_library( gentbl( name = "xla_thunks_ops_inc_gen", + compatible_with = get_compatible_with_cloud(), tbl_outs = [ ("-gen-op-decls", "ir/xla_thunks_ops.h.inc"), ("-gen-op-defs", "ir/xla_thunks_ops.cc.inc"), @@ -1921,16 +2003,3 @@ cc_library( "@llvm-project//mlir:LLVMDialect", ], ) - -# Library with XLA thunks dialect static initialization. 
-cc_library( - name = "xla_thunks_dialect_registration", - srcs = [ - "ir/dialect_registration.cc", - ], - deps = [ - ":xla_thunks_ops", - "@llvm-project//mlir:IR", - ], - alwayslink = 1, -) diff --git a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc index 9b192aaa8e1..21b4ef40d97 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_comparator.cc @@ -610,13 +610,21 @@ static StatusOr DeviceCompare(se::Stream* stream, executor->GetDeviceDescription().threads_per_block_limit(); gpu_device_info.threads_per_warp = executor->GetDeviceDescription().threads_per_warp(); + gpu_device_info.shared_memory_per_block = + executor->GetDeviceDescription().shared_memory_per_block(); + gpu_device_info.threads_per_core_limit = + executor->GetDeviceDescription().threads_per_core_limit(); + gpu_device_info.core_count = executor->GetDeviceDescription().core_count(); LaunchDimensions dim = CalculateLaunchDimensions(buffer_shape, gpu_device_info); - stream->ThenLaunch(se::ThreadDim(dim.threads_per_block()), - se::BlockDim(dim.block_count()), *comparison_kernel, - lhs_typed, rhs_typed, static_cast(kTolerance), - buffer_size, out_param.cref()); + LaunchDimensions::Dim3D thread_counts = dim.thread_counts_per_block(); + LaunchDimensions::Dim3D block_counts = dim.block_counts(); + stream->ThenLaunch( + se::ThreadDim(thread_counts.x, thread_counts.y, thread_counts.z), + se::BlockDim(block_counts.x, block_counts.y, block_counts.z), + *comparison_kernel, lhs_typed, rhs_typed, static_cast(kTolerance), + buffer_size, out_param.cref()); uint64 result = -1; CHECK_EQ(out_param->size(), sizeof(result)); diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc index c34c299fea8..4ac5784e51a 100644 --- a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc @@ -45,10 +45,7 @@ CholeskyThunk::CholeskyThunk(ThunkInfo thunk_info, info_buffer_(info_buffer), type_(type), batch_size_(batch_size), - a_batch_stride_( - n * n * - ShapeUtil::ByteSizeOfPrimitiveType( - thunk_info.hlo_instruction->operand(0)->shape().element_type())), + a_batch_stride_(n * n * ShapeUtil::ByteSizeOfPrimitiveType(type)), n_(n) {} Status CholeskyThunk::ExecuteOnStream(const ExecuteParams& params) { diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc index b3b5cf7e048..88982d3c034 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.cc @@ -24,6 +24,7 @@ limitations under the License. 
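Editor's note on the buffer_comparator hunk above: the launch now reads the three-dimensional thread_counts_per_block()/block_counts() accessors and forwards all three components to the kernel launch. Below is a standalone sketch of that API shape with stand-in Dim3D/LaunchDimensions types and a trivial element-count split; the real CalculateLaunchDimensions consults the GPU device description, which is not modeled here.

// Stand-in for a 3D launch configuration: the kernel launch takes the x/y/z
// components of both the per-block thread counts and the block counts.
#include <cstdint>
#include <iostream>

struct Dim3D { int64_t x, y, z; };

class LaunchDimensions {
 public:
  LaunchDimensions(Dim3D blocks, Dim3D threads)
      : block_counts_(blocks), thread_counts_per_block_(threads) {}
  Dim3D block_counts() const { return block_counts_; }
  Dim3D thread_counts_per_block() const { return thread_counts_per_block_; }

 private:
  Dim3D block_counts_;
  Dim3D thread_counts_per_block_;
};

// Toy version: put kThreadsPerBlock threads in x and enough x-blocks to cover
// all elements; y and z stay 1.
LaunchDimensions CalculateLaunchDimensions(int64_t num_elements) {
  constexpr int64_t kThreadsPerBlock = 256;
  int64_t blocks = (num_elements + kThreadsPerBlock - 1) / kThreadsPerBlock;
  return LaunchDimensions({blocks, 1, 1}, {kThreadsPerBlock, 1, 1});
}

int main() {
  LaunchDimensions dim = CalculateLaunchDimensions(1000);
  Dim3D t = dim.thread_counts_per_block();
  Dim3D b = dim.block_counts();
  // A launch call would receive (t.x, t.y, t.z) and (b.x, b.y, b.z).
  std::cout << "threads: " << t.x << "x" << t.y << "x" << t.z
            << ", blocks: " << b.x << "x" << b.y << "x" << b.z << "\n";
}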
#include "absl/memory/memory.h" #include "tensorflow/compiler/xla/refcounting_hash_map.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/blocking_counter.h" @@ -217,16 +218,23 @@ RefcountingHashMap& GlobalRendezvousMap() { } // anonymous namespace +CollectivePermuteConfig GetCollectivePermuteConfig( + const HloInstruction* instr) { + CollectivePermuteConfig config; + auto* collective_permute = Cast(instr); + config.source_target_pairs = collective_permute->source_target_pairs(); + return config; +} + CollectivePermuteThunk::CollectivePermuteThunk( - ThunkInfo thunk_info, const BufferAllocation::Slice& src, - const BufferAllocation::Slice& dest) + ThunkInfo thunk_info, CollectivePermuteConfig&& config, + const BufferAllocation::Slice& src, const BufferAllocation::Slice& dest) : Thunk(kCollectivePermute, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), + config_(std::move(config)), src_(src), dest_(dest) {} Status CollectivePermuteThunk::ExecuteOnStream(const ExecuteParams& params) { - auto* instr = Cast(hlo_instruction_); auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); @@ -245,7 +253,7 @@ Status CollectivePermuteThunk::ExecuteOnStream(const ExecuteParams& params) { // Figure out which replicas our data is copied to. std::vector dest_replicas; - for (const auto& src_dest : instr->source_target_pairs()) { + for (const auto& src_dest : config_.source_target_pairs) { if (src_dest.first == replica_id) { dest_replicas.push_back(src_dest.second); } @@ -260,7 +268,7 @@ Status CollectivePermuteThunk::ExecuteOnStream(const ExecuteParams& params) { // If no replica writes into us (i.e. we aren't the target of any copies), our // contract is that we zero our output. - if (absl::c_none_of(instr->source_target_pairs(), + if (absl::c_none_of(config_.source_target_pairs, [&](std::pair src_dest) { return src_dest.second == replica_id; })) { diff --git a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h index 44cc6a1c64e..bef86eec9af 100644 --- a/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/collective_permute_thunk.h @@ -19,23 +19,30 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { namespace gpu { +struct CollectivePermuteConfig { + std::vector> source_target_pairs; +}; + +CollectivePermuteConfig GetCollectivePermuteConfig(const HloInstruction* instr); + // Thunk that implements the collective-permute HLO. 
class CollectivePermuteThunk : public Thunk { public: - CollectivePermuteThunk(ThunkInfo thunk_info, + CollectivePermuteThunk(ThunkInfo thunk_info, CollectivePermuteConfig&& config, const BufferAllocation::Slice& src, const BufferAllocation::Slice& dest); Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; - BufferAllocation::Slice src_; - BufferAllocation::Slice dest_; + const CollectivePermuteConfig config_; + const BufferAllocation::Slice src_; + const BufferAllocation::Slice dest_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 4cff48a89da..6560c1a819c 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -17,43 +17,51 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { namespace gpu { -ConditionalThunk::ConditionalThunk( - ThunkInfo thunk_info, - const BufferAllocation::Slice& branch_index_buffer_index, - absl::Span branch_operand_buffer_indexes, - std::vector branch_thunk_sequences) - : Thunk(Kind::kConditional, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), - branch_index_is_bool_( - thunk_info.hlo_instruction->operand(0)->shape().element_type() == - PRED), - branch_index_buffer_index_(branch_index_buffer_index), - branch_operand_buffer_indexes_(branch_operand_buffer_indexes.begin(), - branch_operand_buffer_indexes.end()) { - // Pass nullptr as the HloInstruction* to the branch_thunks_ +ConditionalThunkConfig GetConditionalThunkConfig( + const HloInstruction* instr, + std::vector&& branch_thunk_sequences, + std::vector>&& branch_profile_indices) { + ConditionalThunkConfig config; + config.branch_index_is_bool = + instr->operand(0)->shape().element_type() == PRED; + config.branch_count = instr->branch_count(); + // Pass nullptr as the HloInstruction* to the branch_thunks // constructors because these SequentialThunks are logically "part of" // this ConditionalThunk, and shouldn't be profiled separately from it. 
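Editor's note on GetConditionalThunkConfig above: everything ExecuteOnStream needs (whether the branch index is a PRED, the branch count, one owned SequentialThunk per branch, and the profile indices) is folded into a plain struct before the thunk is constructed. The sketch below mirrors only that factory and ownership layout with stand-in types; SequentialThunk here is a toy, not the real thunk interface, and the profile indices are plain integers.

// Stand-in types: a config built once from "compile-time" information so the
// runtime object never has to look back at the instruction that produced it.
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>

struct SequentialThunk {          // stands in for gpu::SequentialThunk
  std::vector<int> thunk_ids;     // stands in for the wrapped ThunkSequence
};

struct ConditionalThunkConfig {
  bool branch_index_is_bool = false;
  int64_t branch_count = 0;
  std::vector<std::unique_ptr<SequentialThunk>> branch_thunks;
  std::vector<int64_t> branch_profile_indices;
};

ConditionalThunkConfig GetConditionalThunkConfig(
    bool predicate_is_bool, std::vector<std::vector<int>>&& branch_sequences,
    std::vector<int64_t>&& branch_profile_indices) {
  ConditionalThunkConfig config;
  config.branch_index_is_bool = predicate_is_bool;
  config.branch_count = static_cast<int64_t>(branch_sequences.size());
  config.branch_thunks.reserve(branch_sequences.size());
  for (auto& seq : branch_sequences) {
    // Each branch sequence is wrapped in one owned SequentialThunk.
    config.branch_thunks.push_back(
        std::make_unique<SequentialThunk>(SequentialThunk{std::move(seq)}));
  }
  config.branch_profile_indices = std::move(branch_profile_indices);
  return config;
}

int main() {
  auto config = GetConditionalThunkConfig(
      /*predicate_is_bool=*/true, {{1, 2}, {3}}, {7, 8});
  return config.branch_count == 2 ? 0 : 1;
}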
- branch_thunks_.reserve(branch_thunk_sequences.size()); + config.branch_thunks.reserve(branch_thunk_sequences.size()); for (auto& branch_thunk_sequence : branch_thunk_sequences) { - branch_thunks_.emplace_back( - new SequentialThunk(ThunkInfo(), std::move(branch_thunk_sequence))); + config.branch_thunks.emplace_back(new SequentialThunk( + Thunk::ThunkInfo(), std::move(branch_thunk_sequence))); } + config.branch_profile_indices = std::move(branch_profile_indices); + return config; } +ConditionalThunk::ConditionalThunk( + ThunkInfo thunk_info, ConditionalThunkConfig&& config, + const BufferAllocation::Slice& branch_index_buffer_index, + absl::Span branch_operand_buffer_indexes) + : Thunk(Kind::kConditional, thunk_info), + config_(std::move(config)), + branch_index_buffer_index_(branch_index_buffer_index), + branch_operand_buffer_indexes_(branch_operand_buffer_indexes.begin(), + branch_operand_buffer_indexes.end()) {} + Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { - if (branch_index_is_bool_) { - TF_RET_CHECK(branch_thunks_.size() == 2); + if (config_.branch_index_is_bool) { + TF_RET_CHECK(config_.branch_thunks.size() == 2); } else { - TF_RET_CHECK(!branch_thunks_.empty()); + TF_RET_CHECK(!config_.branch_thunks.empty()); } - for (auto& branch_thunk : branch_thunks_) { + for (auto& branch_thunk : config_.branch_thunks) { TF_RETURN_IF_ERROR(branch_thunk->Initialize(executable, executor)); } return Status::OK(); @@ -69,7 +77,7 @@ Status ConditionalThunk::ExecuteOnStream(const ExecuteParams& params) { bool pred = false; se::DeviceMemoryBase branch_index_address = params.buffer_allocations->GetDeviceAddress(branch_index_buffer_index_); - if (branch_index_is_bool_) { + if (config_.branch_index_is_bool) { stream.ThenMemcpy(&pred, branch_index_address, sizeof(bool)); } else { stream.ThenMemcpy(&branch_index, branch_index_address, sizeof(int32)); @@ -81,20 +89,20 @@ Status ConditionalThunk::ExecuteOnStream(const ExecuteParams& params) { "Failed to retrieve branch_index value on stream %p: %s.", &stream, block_status.error_message()); } - if (branch_index_is_bool_) { + if (config_.branch_index_is_bool) { branch_index = pred ? 0 : 1; } else { // Handle default scenario for branch_index not in [0, num_branches). - if (branch_index < 0 || branch_index >= hlo_instruction_->branch_count()) { - branch_index = hlo_instruction_->branch_count() - 1; + if (branch_index < 0 || branch_index >= config_.branch_count) { + branch_index = config_.branch_count - 1; } } // Execute the branch computation corresponding to the value of branch_index. profiler.StartHloComputation(); - TF_RETURN_IF_ERROR(branch_thunks_[branch_index]->ExecuteOnStream(params)); - profiler.FinishHloComputation( - hlo_instruction_->branch_computation(branch_index)); + TF_RETURN_IF_ERROR( + config_.branch_thunks[branch_index]->ExecuteOnStream(params)); + profiler.FinishHloComputation(config_.branch_profile_indices[branch_index]); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index f91f1c52146..bf4280cdb12 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -30,6 +30,18 @@ limitations under the License. 
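Editor's note on the ConditionalThunk::ExecuteOnStream hunk above: the branch-selection semantics are unchanged but now read from the config. A boolean predicate maps to branch 0 (true) or 1 (false), and an out-of-range integer index falls back to the last branch as the default. A minimal sketch of just that normalization step, with the buffer and stream plumbing omitted:

// Normalize a conditional's branch index the way the thunk above does:
// PRED operands pick branch 0/1, out-of-range integer indices pick the
// default (last) branch.
#include <cstdint>
#include <iostream>

int32_t NormalizeBranchIndex(bool branch_index_is_bool, bool pred,
                             int32_t branch_index, int64_t branch_count) {
  if (branch_index_is_bool) {
    return pred ? 0 : 1;  // true executes the first branch
  }
  if (branch_index < 0 || branch_index >= branch_count) {
    return static_cast<int32_t>(branch_count - 1);  // default branch
  }
  return branch_index;
}

int main() {
  std::cout << NormalizeBranchIndex(true, true, 0, 2) << "\n";    // 0
  std::cout << NormalizeBranchIndex(false, false, 7, 3) << "\n";  // 2 (clamped)
  std::cout << NormalizeBranchIndex(false, false, 1, 3) << "\n";  // 1
}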
namespace xla { namespace gpu { +struct ConditionalThunkConfig { + bool branch_index_is_bool; + int64 branch_count; + std::vector> branch_thunks; + std::vector> branch_profile_indices; +}; + +ConditionalThunkConfig GetConditionalThunkConfig( + const HloInstruction* instr, + std::vector&& branch_thunk_sequences, + std::vector>&& branch_profile_indices); + // ConditionalThunk implements the conditional instruction on GPU by reading the // predicate of the conditional and executing the true or the false computation // depending on the value of the predicate. @@ -43,10 +55,9 @@ namespace gpu { class ConditionalThunk : public Thunk { public: ConditionalThunk( - ThunkInfo thunk_info, + ThunkInfo thunk_info, ConditionalThunkConfig&& config, const BufferAllocation::Slice& branch_index_buffer_index, - absl::Span branch_operand_buffer_indexes, - std::vector branch_thunk_sequences); + absl::Span branch_operand_buffer_indexes); ConditionalThunk(const ConditionalThunk&) = delete; ConditionalThunk& operator=(const ConditionalThunk&) = delete; @@ -56,11 +67,9 @@ class ConditionalThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; - const bool branch_index_is_bool_; + const ConditionalThunkConfig config_; BufferAllocation::Slice branch_index_buffer_index_; std::vector branch_operand_buffer_indexes_; - std::vector> branch_thunks_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 3048db95c39..efa3a5802d6 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -31,15 +31,16 @@ namespace xla { namespace gpu { ConvolutionThunk::ConvolutionThunk( - ThunkInfo thunk_info, std::vector operand_slices, + ThunkInfo thunk_info, GpuConvConfig&& config, + std::vector operand_slices, BufferAllocation::Slice result_slice, BufferAllocation::Slice scratch_slice, BufferAllocation::Slice tuple_result_slice) : Thunk(Kind::kConvolution, thunk_info), - cudnn_call_(Cast(thunk_info.hlo_instruction)), operand_buffers_(std::move(operand_slices)), result_buffer_(result_slice), scratch_buffer_(scratch_slice), - tuple_result_buffer_(tuple_result_slice) {} + tuple_result_buffer_(tuple_result_slice), + config_(std::move(config)) {} Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) { const auto& buffer_allocations = *params.buffer_allocations; @@ -57,7 +58,7 @@ Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); - TF_RETURN_IF_ERROR(RunGpuConv(cudnn_call_, absl::MakeSpan(operand_se_buffers), + TF_RETURN_IF_ERROR(RunGpuConv(config_, absl::MakeSpan(operand_se_buffers), result_buffer, scratch, params.stream)); // Write the output tuple. diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index 03fae88c6dc..7f8377ebe4c 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -43,7 +43,7 @@ class ConvolutionThunk : public Thunk { // write a tuple (result, scratch_memory) into `tuple_result_buffer`. // // operand_slices should be in the same order as cudnn_call->operands(). 
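Editor's note on the ConvolutionThunk hunk above: the thunk now takes a GpuConvConfig by rvalue reference and stores it by value, so RunGpuConv no longer needs the HloCustomCallInstruction at execution time. The snippet below illustrates only that ownership transfer; the Config type and its fields are hypothetical stand-ins, not the real GpuConvConfig.

// Stand-in for the "build a config at emission time, move it into the thunk"
// ownership pattern used by ConvolutionThunk above.
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

struct Config {                       // stands in for GpuConvConfig
  std::string conv_kind;
  std::vector<int64_t> window_strides;
};

class ConvThunk {
 public:
  // The caller relinquishes the config; the thunk owns an immutable copy.
  explicit ConvThunk(Config&& config) : config_(std::move(config)) {}
  const Config& config() const { return config_; }

 private:
  const Config config_;
};

int main() {
  Config config{"forward", {1, 1}};
  ConvThunk thunk(std::move(config));  // config is moved, not referenced later
  return thunk.config().conv_kind == "forward" ? 0 : 1;
}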
- ConvolutionThunk(ThunkInfo thunk_info, + ConvolutionThunk(ThunkInfo thunk_info, GpuConvConfig&& config, std::vector operand_slices, BufferAllocation::Slice result_slice, BufferAllocation::Slice scratch_slice, @@ -55,11 +55,13 @@ class ConvolutionThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloCustomCallInstruction* cudnn_call_; std::vector operand_buffers_; BufferAllocation::Slice result_buffer_; BufferAllocation::Slice scratch_buffer_; BufferAllocation::Slice tuple_result_buffer_; + + // Convolution config + const GpuConvConfig config_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.cc index adf6b68096d..6b01151b48a 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { namespace gpu { @@ -109,26 +110,23 @@ DnnBatchDescriptors MakeBatchNormDescriptors(const Shape& shape, return batch_descs; } -void AssignCommonParams(const HloInstruction* batchnorm, +void AssignCommonParams(const CudnnBatchNormConfig& config, CudnnBatchNormParamsCommon* params, const se::DeviceMemoryBase& operand, - const se::DeviceMemory& scale, float epsilon, - int64 feature_index) { + const se::DeviceMemory& scale) { // The BatchNormTraining HLO outputs a tuple of three elements: output data, // batch mean, and batch variance. We want to make our descriptors based on // the shape of the output data. Batchnorm backward call outputs a tuple of // three elements: grad data, grad offset, and grad scale. We want to make // our descriptors based on the shape of the grad data. - const Shape& shape = batchnorm->shape().IsTuple() - ? batchnorm->shape().tuple_shapes(0) - : batchnorm->shape(); + const Shape& shape = config.output_shape; DnnBatchDescriptors batch_descs = - MakeBatchNormDescriptors(shape, feature_index); + MakeBatchNormDescriptors(shape, config.feature_index); params->operand_desc = batch_descs.input_desc; params->scale_offset_desc = batch_descs.scale_offset_desc; params->operand = operand; params->scale = scale; - params->epsilon = epsilon; + params->epsilon = config.epsilon; } template @@ -211,22 +209,33 @@ void RunCudnnBatchNormBackwardImpl(CudnnBatchNormBackwardParams* params, } // namespace +CudnnBatchNormConfig GetCudnnBatchNormConfig(const HloInstruction* instr, + float epsilon, + int64 feature_index) { + CudnnBatchNormConfig config; + + config.output_shape = instr->shape().IsTuple() + ? 
instr->shape().tuple_shapes(0) + : instr->shape(); + config.output_type = config.output_shape.element_type(); + config.epsilon = epsilon; + config.feature_index = feature_index; + return config; +} + Status RunCudnnBatchNormForwardInference( - const HloInstruction* batchnorm, se::DeviceMemoryBase operand, + const CudnnBatchNormConfig& config, se::DeviceMemoryBase operand, se::DeviceMemoryBase output, se::DeviceMemory scale, se::DeviceMemory offset, se::DeviceMemory mean, - se::DeviceMemory variance, float epsilon, int64 feature_index, - se::Stream* stream) { + se::DeviceMemory variance, se::Stream* stream) { CudnnBatchNormForwardInferenceParams inference_params; - AssignCommonParams(batchnorm, &inference_params.common, operand, scale, - epsilon, feature_index); + AssignCommonParams(config, &inference_params.common, operand, scale); inference_params.offset = offset; inference_params.mean = mean; inference_params.variance = variance; inference_params.output = output; - PrimitiveType output_primitive_type = batchnorm->shape().element_type(); - switch (output_primitive_type) { + switch (config.output_type) { case F16: RunCudnnBatchNormForwardInferenceImpl(&inference_params, stream); @@ -235,29 +244,27 @@ Status RunCudnnBatchNormForwardInference( RunCudnnBatchNormForwardInferenceImpl(&inference_params, stream); break; default: - return Unimplemented("Primitive type not implemented for \"%s\" ", - batchnorm->ToString()); + return Unimplemented( + "Primitive type %s not implemented for batchnorm forward inference", + primitive_util::LowercasePrimitiveTypeName(config.output_type) + .c_str()); } return Status::OK(); } Status RunCudnnBatchNormForwardTraining( - const HloInstruction* batchnorm, se::DeviceMemoryBase operand, + const CudnnBatchNormConfig& config, se::DeviceMemoryBase operand, se::DeviceMemoryBase output_data, se::DeviceMemory output_mean, se::DeviceMemory output_inv_stddev, se::DeviceMemory scale, - se::DeviceMemory offset, float epsilon, int64 feature_index, - se::Stream* stream) { + se::DeviceMemory offset, se::Stream* stream) { CudnnBatchNormForwardTrainingParams forward_params; - AssignCommonParams(batchnorm, &forward_params.common, operand, scale, epsilon, - feature_index); + AssignCommonParams(config, &forward_params.common, operand, scale); forward_params.offset = offset; forward_params.output_data = output_data; forward_params.output_mean = output_mean; forward_params.output_inv_stddev = output_inv_stddev; - PrimitiveType output_primitive_type = - batchnorm->shape().tuple_shapes(0).element_type(); - switch (output_primitive_type) { + switch (config.output_type) { case F16: RunCudnnBatchNormForwardTrainingImpl(&forward_params, stream); @@ -266,22 +273,23 @@ Status RunCudnnBatchNormForwardTraining( RunCudnnBatchNormForwardTrainingImpl(&forward_params, stream); break; default: - return Unimplemented("Primitive type not implemented for \"%s\" ", - batchnorm->ToString()); + return Unimplemented( + "Primitive type %s not implemented for batchnorm forward training", + primitive_util::LowercasePrimitiveTypeName(config.output_type) + .c_str()); } return Status::OK(); } Status RunCudnnBatchNormBackward( - const HloInstruction* batchnorm, se::DeviceMemoryBase operand, + const CudnnBatchNormConfig& config, se::DeviceMemoryBase operand, se::DeviceMemoryBase output_grad_data, se::DeviceMemoryBase grad_output, se::DeviceMemory output_grad_scale, se::DeviceMemory output_grad_offset, se::DeviceMemory scale, se::DeviceMemory mean, se::DeviceMemory inv_stddev, - float epsilon, int64 
feature_index, se::Stream* stream) { + se::Stream* stream) { CudnnBatchNormBackwardParams backward_params; - AssignCommonParams(batchnorm, &backward_params.common, operand, scale, - epsilon, feature_index); + AssignCommonParams(config, &backward_params.common, operand, scale); backward_params.output_grad_data = output_grad_data; backward_params.grad_output = grad_output; backward_params.output_grad_scale = output_grad_scale; @@ -289,9 +297,7 @@ Status RunCudnnBatchNormBackward( backward_params.mean = mean; backward_params.inv_stddev = inv_stddev; - PrimitiveType output_primitive_type = - batchnorm->shape().tuple_shapes(0).element_type(); - switch (output_primitive_type) { + switch (config.output_type) { case F16: RunCudnnBatchNormBackwardImpl(&backward_params, stream); break; @@ -299,8 +305,10 @@ Status RunCudnnBatchNormBackward( RunCudnnBatchNormBackwardImpl(&backward_params, stream); break; default: - return Unimplemented("Primitive type not implemented for \"%s\" ", - batchnorm->ToString()); + return Unimplemented( + "Primitive type %s not implemented for batchnorm backward", + primitive_util::LowercasePrimitiveTypeName(config.output_type) + .c_str()); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h index 9a630d013f7..b0791b01868 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h @@ -28,27 +28,36 @@ limitations under the License. namespace xla { namespace gpu { +struct CudnnBatchNormConfig { + Shape output_shape; + PrimitiveType output_type; + float epsilon; + int64 feature_index; +}; + +CudnnBatchNormConfig GetCudnnBatchNormConfig(const HloInstruction *instr, + float epsilon, + int64 feature_index); + Status RunCudnnBatchNormForwardInference( - const HloInstruction* batchnorm, se::DeviceMemoryBase operand, + const CudnnBatchNormConfig &config, se::DeviceMemoryBase operand, se::DeviceMemoryBase output, se::DeviceMemory scale, se::DeviceMemory offset, se::DeviceMemory mean, - se::DeviceMemory variance, float epsilon, int64 feature_index, - se::Stream* stream); + se::DeviceMemory variance, se::Stream *stream); Status RunCudnnBatchNormForwardTraining( - const HloInstruction* batchnorm, se::DeviceMemoryBase operand, + const CudnnBatchNormConfig &config, se::DeviceMemoryBase operand, se::DeviceMemoryBase output_data, se::DeviceMemory output_mean, se::DeviceMemory output_inv_stddev, se::DeviceMemory scale, - se::DeviceMemory offset, float epsilon, int64 feature_index, - se::Stream* stream); + se::DeviceMemory offset, se::Stream *stream); Status RunCudnnBatchNormBackward( - const HloInstruction* batchnorm, se::DeviceMemoryBase operand, + const CudnnBatchNormConfig &config, se::DeviceMemoryBase operand, se::DeviceMemoryBase output_grad_data, se::DeviceMemoryBase grad_output, se::DeviceMemory output_grad_scale, se::DeviceMemory output_grad_offset, se::DeviceMemory scale, se::DeviceMemory mean, se::DeviceMemory inv_stddev, - float epsilon, int64 feature_index, se::Stream* stream); + se::Stream *stream); } // namespace gpu } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUDNN_BATCHNORM_RUNNER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc index e91b2c4d0d2..dae490e0d18 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc +++ 
b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.cc @@ -31,90 +31,21 @@ namespace gpu { namespace dnn = se::dnn; -namespace { -void CheckInputOutputPrimitivetypeAreValid(const HloInstruction* hlo) { - // All input and output statistics variables must be F32. Also, the last - // operand for CudnnBatchNormForwardInference, CudnnBatchNormForwardTraining, - // and CudnnBatchNormBackward is the feature_index which must be S64. - // The allowed types for non-statistics variables are as follows: - // CudnnBatchNormForwardInference: - // operand[0]: {half, float} - // out[0]: {half, float} - // CudnnBatchNormForwardTraining: - // operand[0]: {half, float} - // out[0]: {half, float} - // CudnnBatchNormBackward: - // operand[0]: {half, float} - // operand[4]: {half, float} - // out[0]: {half, float} - // Note non-statistics inputs and outputs mentioned above should be of the - // same type. - - // Check Inputs. - int64 num_operands = hlo->operand_count(); - PrimitiveType operand_primitive_type = - hlo->operand(0)->shape().element_type(); - CHECK(operand_primitive_type == F16 || operand_primitive_type == F32) - << "Not yet implemented"; - - for (int i = 1; i < num_operands - 2; i++) { - if (hlo->custom_call_target() == kCudnnBatchNormBackwardCallTarget && - i == 4) { - // The first operand to batchnorm grad is the input and the 4th operand is - // the grad_output, both of which can be Eigen::half. - CHECK_EQ(hlo->operand(i)->shape().element_type(), operand_primitive_type) - << "Invalid datatype"; - continue; - } - CHECK_EQ(hlo->operand(i)->shape().element_type(), F32) - << "Not yet implemented"; - } - - // The last operand is the feature index which must be int64. - CHECK_EQ(hlo->operand(num_operands - 1)->shape().element_type(), S64) - << "Not yet implemented"; - - // Check Outputs. 
- if (hlo->shape().IsTuple()) { - CHECK_EQ(hlo->shape().tuple_shapes(0).element_type(), - operand_primitive_type) - << "Invalid datatype"; - - for (int j = 1; j < hlo->shape().tuple_shapes_size(); j++) { - CHECK_EQ(hlo->shape().tuple_shapes(j).element_type(), F32) - << "Not yet implemented"; - } - } else { - CHECK_EQ(hlo->shape().element_type(), operand_primitive_type) - << "Invalid datatype"; - } -} -} // namespace - CudnnBatchNormForwardInferenceThunk::CudnnBatchNormForwardInferenceThunk( - ThunkInfo thunk_info, const BufferAllocation::Slice& operand, + ThunkInfo thunk_info, CudnnBatchNormConfig&& config, + const BufferAllocation::Slice& operand, const BufferAllocation::Slice& scale, const BufferAllocation::Slice& offset, const BufferAllocation::Slice& mean, - const BufferAllocation::Slice& variance, float epsilon, int64 feature_index, + const BufferAllocation::Slice& variance, const BufferAllocation::Slice& output) : Thunk(Thunk::Kind::kCudnnBatchNormForwardInference, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), + config_(std::move(config)), operand_(operand), scale_(scale), offset_(offset), mean_(mean), variance_(variance), - epsilon_(epsilon), - feature_index_(feature_index), - output_(output) { - const auto* hlo = hlo_instruction_; - CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall); - CHECK_EQ(hlo->custom_call_target(), - kCudnnBatchNormForwardInferenceCallTarget); - CHECK( - LayoutUtil::LayoutsInShapesEqual(hlo->shape(), hlo->operand(0)->shape())); - CheckInputOutputPrimitivetypeAreValid(hlo); -} + output_(output) {} Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( const ExecuteParams& params) { @@ -131,8 +62,7 @@ Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( buffer_allocations.GetDeviceAddress(variance_)); auto& stream = *params.stream; TF_RETURN_IF_ERROR(RunCudnnBatchNormForwardInference( - hlo_instruction_, operand, output_base, scale, offset, mean, variance, - epsilon_, feature_index_, &stream)); + config_, operand, output_base, scale, offset, mean, variance, &stream)); if (!stream.ok()) { return InternalError("BatchNormalizationForward call failed."); @@ -141,32 +71,22 @@ Status CudnnBatchNormForwardInferenceThunk::ExecuteOnStream( } CudnnBatchNormForwardTrainingThunk::CudnnBatchNormForwardTrainingThunk( - ThunkInfo thunk_info, const BufferAllocation::Slice& operand, + ThunkInfo thunk_info, CudnnBatchNormConfig&& config, + const BufferAllocation::Slice& operand, const BufferAllocation::Slice& scale, const BufferAllocation::Slice& offset, - float epsilon, int64 feature_index, const BufferAllocation::Slice& output_data, const BufferAllocation::Slice& output_mean, const BufferAllocation::Slice& output_inv_stddev, const BufferAllocation::Slice& output_tuple) : Thunk(Thunk::Kind::kCudnnBatchNormForwardTraining, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), + config_(std::move(config)), operand_(operand), scale_(scale), offset_(offset), - epsilon_(epsilon), - feature_index_(feature_index), output_data_(output_data), output_mean_(output_mean), output_inv_stddev_(output_inv_stddev), - output_tuple_(output_tuple) { - const auto* hlo = hlo_instruction_; - CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall); - CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormForwardTrainingCallTarget); - CHECK_EQ(hlo->shape().tuple_shapes_size(), 3); - CHECK(LayoutUtil::LayoutsInShapesEqual(hlo->shape().tuple_shapes(0), - hlo->operand(0)->shape())); - CheckInputOutputPrimitivetypeAreValid(hlo); -} + output_tuple_(output_tuple) {} Status 
CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( const ExecuteParams& params) { @@ -185,10 +105,10 @@ Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( params.profiler->MakeScopedInstructionProfiler(profile_index()); auto& stream = *params.stream; TF_RETURN_IF_ERROR(RunCudnnBatchNormForwardTraining( - hlo_instruction_, operand, output_data, output_mean, output_inv_stddev, + config_, operand, output_data, output_mean, output_inv_stddev, se::DeviceMemory(buffer_allocations.GetDeviceAddress(scale_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(offset_)), - epsilon_, feature_index_, &stream)); + &stream)); // Write the output tuple. const int kNumOutputs = 3; @@ -207,37 +127,26 @@ Status CudnnBatchNormForwardTrainingThunk::ExecuteOnStream( } CudnnBatchNormBackwardThunk::CudnnBatchNormBackwardThunk( - ThunkInfo thunk_info, const BufferAllocation::Slice& operand, + ThunkInfo thunk_info, CudnnBatchNormConfig&& config, + const BufferAllocation::Slice& operand, const BufferAllocation::Slice& scale, const BufferAllocation::Slice& mean, const BufferAllocation::Slice& inv_stddev, - const BufferAllocation::Slice& grad_output, float epsilon, - int64 feature_index, const BufferAllocation::Slice& output_grad_data, + const BufferAllocation::Slice& grad_output, + const BufferAllocation::Slice& output_grad_data, const BufferAllocation::Slice& output_grad_scale, const BufferAllocation::Slice& output_grad_offset, const BufferAllocation::Slice& output_tuple) : Thunk(Thunk::Kind::kCudnnBatchNormBackward, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), + config_(std::move(config)), operand_(operand), scale_(scale), mean_(mean), inv_stddev_(inv_stddev), grad_output_(grad_output), - epsilon_(epsilon), - feature_index_(feature_index), output_grad_data_(output_grad_data), output_grad_scale_(output_grad_scale), output_grad_offset_(output_grad_offset), - output_tuple_(output_tuple) { - const auto* hlo = hlo_instruction_; - CHECK_EQ(hlo->opcode(), HloOpcode::kCustomCall); - CHECK_EQ(hlo->custom_call_target(), kCudnnBatchNormBackwardCallTarget); - CHECK_EQ(hlo->shape().tuple_shapes_size(), 3); - CHECK(LayoutUtil::LayoutsInShapesEqual(hlo->shape().tuple_shapes(0), - hlo->operand(0)->shape())); - CHECK(LayoutUtil::LayoutsInShapesEqual(hlo->shape().tuple_shapes(0), - hlo->operand(4)->shape())); - CheckInputOutputPrimitivetypeAreValid(hlo); -} + output_tuple_(output_tuple) {} Status CudnnBatchNormBackwardThunk::ExecuteOnStream( const ExecuteParams& params) { @@ -256,12 +165,12 @@ Status CudnnBatchNormBackwardThunk::ExecuteOnStream( params.profiler->MakeScopedInstructionProfiler(profile_index()); se::Stream* stream = params.stream; TF_RETURN_IF_ERROR(RunCudnnBatchNormBackward( - hlo_instruction_, operand, output_grad_data, grad_output, - output_grad_scale, output_grad_offset, + config_, operand, output_grad_data, grad_output, output_grad_scale, + output_grad_offset, se::DeviceMemory(buffer_allocations.GetDeviceAddress(scale_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(mean_)), se::DeviceMemory(buffer_allocations.GetDeviceAddress(inv_stddev_)), - epsilon_, feature_index_, stream)); + stream)); // Write the output tuple. 
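Editor's note on the batchnorm runner hunks above: a CudnnBatchNormConfig (output shape, element type, epsilon, feature index) is derived once, and the runner switches on config.output_type to pick the F16 or F32 implementation. The sketch below mirrors that dispatch with a stand-in element-type enum and templated impls; the 16-bit case is instantiated with uint16_t purely as a size stand-in for Eigen::half, and an error string stands in for xla::Status.

// Stand-in for dispatching a batchnorm-style runner on a stored element type
// instead of re-inspecting the instruction's shape at execution time.
#include <cstdint>
#include <iostream>
#include <string>

enum class PrimitiveType { F16, F32 };  // stand-in for xla::PrimitiveType

struct CudnnBatchNormConfig {
  PrimitiveType output_type;
  float epsilon;
  int64_t feature_index;
};

template <typename ElemType>
void RunForwardInferenceImpl(const CudnnBatchNormConfig& config) {
  std::cout << "running with " << sizeof(ElemType) * 8 << "-bit elements, eps="
            << config.epsilon << ", feature_index=" << config.feature_index
            << "\n";
}

// Returns an error message instead of a Status to stay self-contained.
std::string RunForwardInference(const CudnnBatchNormConfig& config) {
  switch (config.output_type) {
    case PrimitiveType::F16:
      RunForwardInferenceImpl<uint16_t>(config);  // half is 16 bits wide
      return "";
    case PrimitiveType::F32:
      RunForwardInferenceImpl<float>(config);
      return "";
  }
  return "unimplemented primitive type";
}

int main() {
  CudnnBatchNormConfig config{PrimitiveType::F32, 1e-3f, /*feature_index=*/3};
  std::string error = RunForwardInference(config);
  return error.empty() ? 0 : 1;
}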
const int kNumOutputs = 3; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h index bb46017b8fb..d45e284ea2c 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -47,12 +48,12 @@ namespace gpu { class CudnnBatchNormForwardInferenceThunk : public Thunk { public: CudnnBatchNormForwardInferenceThunk(ThunkInfo thunk_info, + CudnnBatchNormConfig&& config, const BufferAllocation::Slice& operand, const BufferAllocation::Slice& scale, const BufferAllocation::Slice& offset, const BufferAllocation::Slice& mean, const BufferAllocation::Slice& variance, - float epsilon, int64 feature_index, const BufferAllocation::Slice& output); CudnnBatchNormForwardInferenceThunk( @@ -63,23 +64,22 @@ class CudnnBatchNormForwardInferenceThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; + CudnnBatchNormConfig config_; BufferAllocation::Slice operand_; BufferAllocation::Slice scale_; BufferAllocation::Slice offset_; BufferAllocation::Slice mean_; BufferAllocation::Slice variance_; - float epsilon_; - int64 feature_index_; BufferAllocation::Slice output_; }; class CudnnBatchNormForwardTrainingThunk : public Thunk { public: CudnnBatchNormForwardTrainingThunk( - ThunkInfo thunk_info, const BufferAllocation::Slice& operand, + ThunkInfo thunk_info, CudnnBatchNormConfig&& config, + const BufferAllocation::Slice& operand, const BufferAllocation::Slice& scale, - const BufferAllocation::Slice& offset, float epsilon, int64 feature_index, + const BufferAllocation::Slice& offset, const BufferAllocation::Slice& output_data, const BufferAllocation::Slice& output_mean, const BufferAllocation::Slice& output_inv_stddev, @@ -93,12 +93,10 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; + CudnnBatchNormConfig config_; BufferAllocation::Slice operand_; BufferAllocation::Slice scale_; BufferAllocation::Slice offset_; - float epsilon_; - int64 feature_index_; BufferAllocation::Slice output_data_; BufferAllocation::Slice output_mean_; BufferAllocation::Slice output_inv_stddev_; @@ -108,12 +106,12 @@ class CudnnBatchNormForwardTrainingThunk : public Thunk { class CudnnBatchNormBackwardThunk : public Thunk { public: CudnnBatchNormBackwardThunk(ThunkInfo thunk_info, + CudnnBatchNormConfig&& config, const BufferAllocation::Slice& operand, const BufferAllocation::Slice& scale, const BufferAllocation::Slice& mean, const BufferAllocation::Slice& inv_stddev, const BufferAllocation::Slice& grad_output, - float epsilon, int64 feature_index, const BufferAllocation::Slice& output_grad_data, const BufferAllocation::Slice& output_grad_scale, const BufferAllocation::Slice& output_grad_offset, @@ -126,14 +124,12 @@ class CudnnBatchNormBackwardThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const 
HloInstruction* hlo_instruction_; + const CudnnBatchNormConfig config_; BufferAllocation::Slice operand_; BufferAllocation::Slice scale_; BufferAllocation::Slice mean_; BufferAllocation::Slice inv_stddev_; BufferAllocation::Slice grad_output_; - float epsilon_; - int64 feature_index_; BufferAllocation::Slice output_grad_data_; BufferAllocation::Slice output_grad_scale_; BufferAllocation::Slice output_grad_offset_; diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index dae15659402..c9b2318af79 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -24,29 +24,12 @@ namespace gpu { CustomCallThunk::CustomCallThunk( ThunkInfo thunk_info, void* call_target, std::vector> operand_slices, - ShapeTree result_slices, std::string opaque) + ShapeTree result_slices, const std::string& opaque) : Thunk(Thunk::kCustomCall, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), call_target_(call_target), operand_slices_(std::move(operand_slices)), result_slices_(std::move(result_slices)), - opaque_(std::move(opaque)) { - const HloInstruction* instr = hlo_instruction_; - CHECK_EQ(instr->operand_count(), operand_slices_.size()); - for (int64 i = 0; i < instr->operand_count(); ++i) { - const auto& s1 = operand_slices_[i].shape(); - const auto& s2 = instr->operand(i)->shape(); - CHECK(ShapeUtil::Equal(s1, s2)) << absl::StreamFormat( - "Shape mismatch between instr->operand(%d) and " - "operand_slices[%d].shape(): %s vs %s", - i, i, s1.ToString(), s2.ToString()); - } - CHECK(ShapeUtil::Equal(instr->shape(), result_slices.shape())) - << absl::StreamFormat( - "Shape mismatch between instr->shape() and result_slices.shape(): " - "%s vs %s.", - instr->shape().ToString(), result_slices.shape().ToString()); -} + opaque_(opaque) {} // For each leaf in a preorder traversal of `slices`, appends its device address // to `buffers`. diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h index 31c03f5252f..f36eaa9cef2 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.h @@ -41,16 +41,16 @@ class CustomCallThunk : public Thunk { CustomCallThunk( ThunkInfo thunk_info, void* call_target, std::vector> operand_slices, - ShapeTree result_slices, std::string opaque); + ShapeTree result_slices, + const std::string& opaque); Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; void* call_target_; std::vector> operand_slices_; ShapeTree result_slices_; - std::string opaque_; + const std::string opaque_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc index 318b8aff176..4cc19a23201 100644 --- a/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/dummy_all_reduce_thunk.cc @@ -14,10 +14,22 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { namespace gpu { +struct NcclAllReduceConfig::AuxData {}; + +NcclAllReduceConfig::NcclAllReduceConfig(NcclAllReduceConfig &&) = default; +NcclAllReduceConfig::~NcclAllReduceConfig() = default; + +NcclAllReduceConfig GetNcclAllReduceConfig(const HloInstruction *instr, + int64 replica_count) { + NcclAllReduceConfig config = {}; + return config; +} + /* static */ bool NcclAllReduceThunk::NcclIsEnabled() { return false; // Skylark selects this source file if NCCL is disabled. } @@ -32,20 +44,16 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { "compiler, which is necessary to build the NCCL source library."); } -NcclAllReduceThunk::~NcclAllReduceThunk() = default; - /*static*/ absl::flat_hash_set NcclAllReduceThunk::DevicesWithOpenNcclChannels() { return {}; } -struct NcclAllReduceThunk::AuxData {}; - NcclAllReduceThunk::NcclAllReduceThunk( - ThunkInfo thunk_info, int64 replica_count, + ThunkInfo thunk_info, NcclAllReduceConfig &&config, std::vector buffers) : Thunk(Thunk::kNcclAllReduce, thunk_info), - replica_count_(replica_count), + config_(std::move(config)), buffers_(std::move(buffers)) {} } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index ccd661d8ade..a9e6cd05c31 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -24,15 +24,16 @@ namespace xla { namespace gpu { ForThunk::ForThunk(ThunkInfo thunk_info, const int64 loop_limit, - std::unique_ptr body_thunk_sequence) + std::unique_ptr body_thunk_sequence, + absl::optional body_profile_index) : Thunk(Kind::kWhile, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), loop_limit_(loop_limit), body_thunk_sequence_(absl::make_unique( // Pass nullptr as the HloInstruction* to the body_thunk_sequence_ // constructor because this SequentialThunk is logically "part of" // this ForThunk, and shouldn't be profiled separately from it. - ThunkInfo(), std::move(*body_thunk_sequence))) {} + ThunkInfo(), std::move(*body_thunk_sequence))), + body_profile_index_(body_profile_index) {} Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { @@ -41,15 +42,14 @@ Status ForThunk::Initialize(const GpuExecutable& executable, } Status ForThunk::ExecuteOnStream(const ExecuteParams& params) { - VLOG(2) << "Executing ForThunk with " << loop_limit_ << " iters for " - << (hlo_instruction_ ? hlo_instruction_->ToString() : ""); + VLOG(2) << "Executing ForThunk with " << loop_limit_ << " iters"; auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); for (int64 i = 0; i < loop_limit_; ++i) { params.profiler->StartHloComputation(); // Invoke loop body thunk sequence. 
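In the ForThunk change here, the profile index of the loop body is resolved when the thunk is built and passed in as an optional index, so the loop can finish a body profile without consulting hlo_instruction_->while_body(). A reduced, hypothetical illustration of that flow (std::optional stands in for absl::optional to keep the sketch self-contained):

#include <cstdint>
#include <optional>

// Hypothetical profiler stand-in: a disengaged index means "skip profiling".
struct LoopProfilerSketch {
  void StartHloComputation() {}
  void FinishHloComputation(std::optional<int64_t> profile_index) {
    (void)profile_index;  // a real profiler would record against this index
  }
};

// The loop body is profiled by index; no HloComputation pointer is needed.
void RunCountedLoopSketch(int64_t loop_limit,
                          std::optional<int64_t> body_profile_index,
                          LoopProfilerSketch& profiler) {
  for (int64_t i = 0; i < loop_limit; ++i) {
    profiler.StartHloComputation();
    // ... the body thunk sequence would execute here ...
    profiler.FinishHloComputation(body_profile_index);
  }
}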
TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(params)); - params.profiler->FinishHloComputation(hlo_instruction_->while_body()); + params.profiler->FinishHloComputation(body_profile_index_); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index b6ee950737e..9a8bd069290 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -32,7 +32,8 @@ namespace gpu { class ForThunk : public Thunk { public: ForThunk(ThunkInfo thunk_info, const int64 loop_limit, - std::unique_ptr body_thunk_sequence); + std::unique_ptr body_thunk_sequence, + absl::optional body_profile_index_); ForThunk(const ForThunk&) = delete; ForThunk& operator=(const ForThunk&) = delete; @@ -41,9 +42,9 @@ class ForThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; const int64 loop_limit_; std::unique_ptr body_thunk_sequence_; + const absl::optional body_profile_index_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc index a499dc70e23..23706cb9973 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc @@ -201,8 +201,10 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // Merging into all users enables the removal of 'fusion' from the // computation. if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) { - return user->opcode() == HloOpcode::kFusion && - IsProducerConsumerFusible(*fusion, *user); + return IsProducerConsumerFusible(*fusion, *user) && + // Do not fuse into bitcast ops, which are no-ops and do not + // generate any GPU code. + user->opcode() != HloOpcode::kBitcast; })) { VLOG(3) << "Not merging " << fusion->name() << ": Some of its users are not loop/input fusion kernels."; @@ -283,7 +285,15 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { // Merge fused instructions from 'fusion' into each user. std::vector users = fusion->users(); for (HloInstruction* user : users) { - user->MergeFusionInstruction(fusion); + if (user->opcode() == HloOpcode::kFusion) { + user->MergeFusionInstruction(fusion); + } else { + HloInstruction* fused_user = + computation_->AddInstruction(HloInstruction::CreateFusion( + user->shape(), ChooseFusionKind(*fusion, *user), user)); + TF_CHECK_OK(computation_->ReplaceInstruction(user, fused_user)); + fused_user->MergeFusionInstruction(fusion); + } changed_ = true; } ++total_merged_; @@ -296,7 +306,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) { }) << " }"; // Remove 'fusion' instruction. 
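The FusionMerger hunk above relaxes HandleFusion in two ways: a producer fusion may now be merged into consumers that are not themselves fusion instructions (such a consumer is first wrapped in a freshly created fusion, then merged into), and bitcast consumers are excluded because they are no-ops that emit no GPU code. A hypothetical reduction of the new gating rule:

#include <vector>

// Stand-in for the per-consumer facts the merge decision looks at.
struct ConsumerSketch {
  bool fusible_with_producer = true;
  bool is_bitcast = false;
  bool is_fusion = false;  // non-fusion consumers get wrapped before merging
};

// Merge only when every consumer can fuse with the producer and none is a
// bitcast; after the merge loop the producer must have no remaining users,
// which the patch checks with the instruction dump attached for debugging.
bool ShouldMergeProducerSketch(const std::vector<ConsumerSketch>& consumers) {
  for (const ConsumerSketch& c : consumers) {
    if (!c.fusible_with_producer || c.is_bitcast) return false;
  }
  return true;
}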
- CHECK_EQ(0, fusion->user_count()); + CHECK_EQ(0, fusion->user_count()) << fusion->ToString(); return computation_->RemoveInstruction(fusion); } diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc index cc4894f4c00..7468114516d 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc @@ -234,6 +234,54 @@ TEST_F(FusionMergerTest, WillMergeIntoInputFusion) { op::Fusion(op::Parameter())); } +TEST_F(FusionMergerTest, WillMergeIntoUnfusedConsumer) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule jit_matmul.36 + + max (parameter.13: f32[], parameter.14: f32[]) -> f32[] { + parameter.13 = f32[] parameter(0) + parameter.14 = f32[] parameter(1) + ROOT maximum.15 = f32[] maximum(f32[] parameter.13, f32[] parameter.14) + } + + add (parameter.29: f32[], parameter.30: f32[]) -> f32[] { + parameter.29 = f32[] parameter(0) + parameter.30 = f32[] parameter(1) + ROOT add.31 = f32[] add(f32[] parameter.29, f32[] parameter.30) + } + + fused_computation.1 (param_1.4: f32[200,200,200], param_2.1: f32[200,200]) -> f32[200,200] { + param_1.4 = f32[200,200,200]{2,1,0} parameter(0) + param_2.1 = f32[200,200]{1,0} parameter(1) + broadcast.3 = f32[200,200,200]{2,1,0} broadcast(f32[200,200]{1,0} param_2.1), dimensions={0,2} + subtract.0 = f32[200,200,200]{2,1,0} subtract(f32[200,200,200]{2,1,0} param_1.4, f32[200,200,200]{2,1,0} broadcast.3) + exponential.0 = f32[200,200,200]{2,1,0} exponential(f32[200,200,200]{2,1,0} subtract.0) + constant.27 = f32[] constant(0) + ROOT reduce.0 = f32[200,200]{1,0} reduce(f32[200,200,200]{2,1,0} exponential.0, f32[] constant.27), dimensions={1}, to_apply=add + } + + fused_computation.3 (param_0.7: f32[200,200], param_1.9: f32[200,200]) -> f32[200,200,200] { + param_1.9 = f32[200,200]{1,0} parameter(1) + broadcast.10 = f32[200,200,200]{2,1,0} broadcast(f32[200,200]{1,0} param_1.9), dimensions={0,1} + param_0.7 = f32[200,200]{1,0} parameter(0) + broadcast.8 = f32[200,200,200]{2,1,0} broadcast(f32[200,200]{1,0} param_0.7), dimensions={1,2} + ROOT add.1 = f32[200,200,200]{2,1,0} add(f32[200,200,200]{2,1,0} broadcast.10, f32[200,200,200]{2,1,0} broadcast.8) + } + + ENTRY entry (parameter.1: f32[200,200], parameter.2: f32[200,200]) -> f32[200,200] { + parameter.2 = f32[200,200]{1,0} parameter(1) + parameter.1 = f32[200,200]{1,0} parameter(0) + fusion.3 = f32[200,200,200]{2,1,0} fusion(f32[200,200]{1,0} parameter.2, f32[200,200]{1,0} parameter.1), kind=kLoop, calls=fused_computation.3 + constant.11 = f32[] constant(-inf) + reduce.16 = f32[200,200]{1,0} reduce(f32[200,200,200]{2,1,0} fusion.3, f32[] constant.11), dimensions={1}, to_apply=max + ROOT fusion.1 = f32[200,200]{1,0} fusion(f32[200,200,200]{2,1,0} fusion.3, f32[200,200]{1,0} reduce.16), kind=kInput, calls=fused_computation.1 + })") + .ValueOrDie(); + EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Fusion(op::Fusion(), op::Parameter(), op::Parameter())); +} + TEST_F(FusionMergerTest, WillNotMergeReduceUnfriendlyLayouts) { auto module = ParseAndReturnVerifiedModule(R"( HloModule m @@ -421,6 +469,165 @@ TEST_F(FusionMergerTest, WillMergeExpensiveFusionsWithSingleConsumer) { EXPECT_TRUE(FusionMerger().Run(module.get()).ValueOrDie()); } +TEST_F(FusionMergerTest, NoMergeBecauseCodeDuplication) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule module + 
+and.reduce_sub_computation { + x = pred[] parameter(0) + y = pred[] parameter(1) + ROOT and = pred[] and(x, y) +} + +fused_computation.1 { + param_4.658 = f32[2,20,256]{2,0,1} parameter(4) + slice.1385 = f32[2,1,256]{2,0,1} slice(param_4.658), slice={[0:2], [11:12], [0:256]} + constant.6847 = s32[] constant(0) + broadcast.4823 = s32[3]{0} broadcast(constant.6847), dimensions={} + param_9.415 = s32[3]{0} parameter(9) + compare.700 = pred[3]{0} compare(broadcast.4823, param_9.415), direction=LE + constant.6846 = pred[] constant(true) + reduce.221 = pred[] reduce(compare.700, constant.6846), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2933 = pred[2,1,256]{2,0,1} broadcast(reduce.221), dimensions={} + param_5.528 = f32[2,512]{1,0} parameter(5) + slice.1384 = f32[2,256]{1,0} slice(param_5.528), slice={[0:2], [0:256]} + bitcast.341 = f32[2,1,256]{2,0,1} bitcast(slice.1384) + constant.5418 = f32[] constant(0) + broadcast.3227 = f32[2,1,256]{2,0,1} broadcast(constant.5418), dimensions={} + select.173 = f32[2,1,256]{2,0,1} select(broadcast.2933, bitcast.341, broadcast.3227) + add.573 = f32[2,1,256]{2,0,1} add(slice.1385, select.173) + param_0.299 = s32[] parameter(0) + constant.5157 = s32[] constant(11) + dynamic-update-slice.189 = f32[2,20,256]{2,0,1} dynamic-update-slice(param_4.658, add.573, param_0.299, constant.5157, param_0.299) + slice.1383 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.189), slice={[0:2], [10:11], [0:256]} + constant.6800 = s32[] constant(0) + broadcast.4803 = s32[3]{0} broadcast(constant.6800), dimensions={} + param_8.484 = s32[3]{0} parameter(8) + compare.681 = pred[3]{0} compare(broadcast.4803, param_8.484), direction=LE + constant.6798 = pred[] constant(true) + reduce.203 = pred[] reduce(compare.681, constant.6798), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2932 = pred[2,1,256]{2,0,1} broadcast(reduce.203), dimensions={} + param_3.1169 = f32[2,512]{1,0} parameter(3) + slice.1382 = f32[2,256]{1,0} slice(param_3.1169), slice={[0:2], [0:256]} + bitcast.340 = f32[2,1,256]{2,0,1} bitcast(slice.1382) + select.172 = f32[2,1,256]{2,0,1} select(broadcast.2932, bitcast.340, broadcast.3227) + add.572 = f32[2,1,256]{2,0,1} add(slice.1383, select.172) + constant.5154 = s32[] constant(10) + dynamic-update-slice.188 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.189, add.572, param_0.299, constant.5154, param_0.299) + slice.1381 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.188), slice={[0:2], [9:10], [0:256]} + constant.6794 = s32[] constant(0) + broadcast.4801 = s32[3]{0} broadcast(constant.6794), dimensions={} + param_7.478 = s32[3]{0} parameter(7) + compare.679 = pred[3]{0} compare(broadcast.4801, param_7.478), direction=LE + constant.6793 = pred[] constant(true) + reduce.201 = pred[] reduce(compare.679, constant.6793), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2930 = pred[2,1,256]{2,0,1} broadcast(reduce.201), dimensions={} + param_2.1685 = f32[2,512]{1,0} parameter(2) + slice.1380 = f32[2,256]{1,0} slice(param_2.1685), slice={[0:2], [0:256]} + bitcast.339 = f32[2,1,256]{2,0,1} bitcast(slice.1380) + select.171 = f32[2,1,256]{2,0,1} select(broadcast.2930, bitcast.339, broadcast.3227) + add.571 = f32[2,1,256]{2,0,1} add(slice.1381, select.171) + constant.5153 = s32[] constant(9) + dynamic-update-slice.187 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.188, add.571, param_0.299, constant.5153, param_0.299) + slice.1379 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.187), 
slice={[0:2], [8:9], [0:256]} + constant.6788 = s32[] constant(0) + broadcast.4799 = s32[3]{0} broadcast(constant.6788), dimensions={} + param_6.495 = s32[3]{0} parameter(6) + compare.677 = pred[3]{0} compare(broadcast.4799, param_6.495), direction=LE + constant.6786 = pred[] constant(true) + reduce.199 = pred[] reduce(compare.677, constant.6786), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2929 = pred[2,1,256]{2,0,1} broadcast(reduce.199), dimensions={} + param_1.1408 = f32[2,512]{1,0} parameter(1) + slice.1378 = f32[2,256]{1,0} slice(param_1.1408), slice={[0:2], [0:256]} + bitcast.338 = f32[2,1,256]{2,0,1} bitcast(slice.1378) + select.170 = f32[2,1,256]{2,0,1} select(broadcast.2929, bitcast.338, broadcast.3227) + add.570 = f32[2,1,256]{2,0,1} add(slice.1379, select.170) + constant.5152 = s32[] constant(8) + ROOT dynamic-update-slice.186 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.187, add.570, param_0.299, constant.5152, param_0.299) +} + +fused_computation.2 { + param_4.655 = f32[2,20,256]{2,0,1} parameter(4) + slice.1369 = f32[2,1,256]{2,0,1} slice(param_4.655), slice={[0:2], [7:8], [0:256]} + param_6.483 = pred[] parameter(6) + broadcast.2927 = pred[2,1,256]{2,0,1} broadcast(param_6.483), dimensions={} + param_5.525 = f32[2,512]{1,0} parameter(5) + slice.1368 = f32[2,256]{1,0} slice(param_5.525), slice={[0:2], [0:256]} + bitcast.333 = f32[2,1,256]{2,0,1} bitcast(slice.1368) + constant.5415 = f32[] constant(0) + broadcast.3225 = f32[2,1,256]{2,0,1} broadcast(constant.5415), dimensions={} + select.161 = f32[2,1,256]{2,0,1} select(broadcast.2927, bitcast.333, broadcast.3225) + add.549 = f32[2,1,256]{2,0,1} add(slice.1369, select.161) + param_0.265 = s32[] parameter(0) + constant.5151 = s32[] constant(7) + dynamic-update-slice.185 = f32[2,20,256]{2,0,1} dynamic-update-slice(param_4.655, add.549, param_0.265, constant.5151, param_0.265) + slice.1367 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.185), slice={[0:2], [6:7], [0:256]} + constant.6782 = s32[] constant(0) + broadcast.4797 = s32[3]{0} broadcast(constant.6782), dimensions={} + param_9.391 = s32[3]{0} parameter(9) + compare.675 = pred[3]{0} compare(broadcast.4797, param_9.391), direction=LE + constant.6781 = pred[] constant(true) + reduce.197 = pred[] reduce(compare.675, constant.6781), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2926 = pred[2,1,256]{2,0,1} broadcast(reduce.197), dimensions={} + param_3.1167 = f32[2,512]{1,0} parameter(3) + slice.1366 = f32[2,256]{1,0} slice(param_3.1167), slice={[0:2], [0:256]} + bitcast.332 = f32[2,1,256]{2,0,1} bitcast(slice.1366) + select.160 = f32[2,1,256]{2,0,1} select(broadcast.2926, bitcast.332, broadcast.3225) + add.548 = f32[2,1,256]{2,0,1} add(slice.1367, select.160) + constant.5150 = s32[] constant(6) + dynamic-update-slice.184 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.185, add.548, param_0.265, constant.5150, param_0.265) + slice.1365 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.184), slice={[0:2], [5:6], [0:256]} + constant.6776 = s32[] constant(0) + broadcast.4794 = s32[3]{0} broadcast(constant.6776), dimensions={} + param_8.464 = s32[3]{0} parameter(8) + compare.673 = pred[3]{0} compare(broadcast.4794, param_8.464), direction=LE + constant.6775 = pred[] constant(true) + reduce.195 = pred[] reduce(compare.673, constant.6775), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2925 = pred[2,1,256]{2,0,1} broadcast(reduce.195), dimensions={} + param_2.1684 = f32[2,512]{1,0} 
parameter(2) + slice.1364 = f32[2,256]{1,0} slice(param_2.1684), slice={[0:2], [0:256]} + bitcast.331 = f32[2,1,256]{2,0,1} bitcast(slice.1364) + select.159 = f32[2,1,256]{2,0,1} select(broadcast.2925, bitcast.331, broadcast.3225) + add.547 = f32[2,1,256]{2,0,1} add(slice.1365, select.159) + constant.5149 = s32[] constant(5) + dynamic-update-slice.183 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.184, add.547, param_0.265, constant.5149, param_0.265) + slice.1363 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.183), slice={[0:2], [4:5], [0:256]} + constant.6770 = s32[] constant(0) + broadcast.4792 = s32[3]{0} broadcast(constant.6770), dimensions={} + param_7.458 = s32[3]{0} parameter(7) + compare.671 = pred[3]{0} compare(broadcast.4792, param_7.458), direction=LE + constant.6769 = pred[] constant(true) + reduce.193 = pred[] reduce(compare.671, constant.6769), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2924 = pred[2,1,256]{2,0,1} broadcast(reduce.193), dimensions={} + param_1.1405 = f32[2,512]{1,0} parameter(1) + slice.1362 = f32[2,256]{1,0} slice(param_1.1405), slice={[0:2], [0:256]} + bitcast.330 = f32[2,1,256]{2,0,1} bitcast(slice.1362) + select.158 = f32[2,1,256]{2,0,1} select(broadcast.2924, bitcast.330, broadcast.3225) + add.546 = f32[2,1,256]{2,0,1} add(slice.1363, select.158) + constant.5148 = s32[] constant(4) + ROOT dynamic-update-slice.182 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.183, add.546, param_0.265, constant.5148, param_0.265) +} + +ENTRY main { + param_0.0 = s32[] parameter(0) + param_1.0 = f32[2,512]{1,0} parameter(1) + param_2.0 = f32[2,512]{1,0} parameter(2) + param_3.0 = f32[2,512]{1,0} parameter(3) + param_4.0 = f32[2,20,256]{2,1,0} parameter(4) + param_5.0 = f32[2,512]{1,0} parameter(5) + param_6.0 = s32[3]{0} parameter(6) + param_7.0 = s32[3]{0} parameter(7) + param_8.0 = s32[3]{0} parameter(8) + param_9.0 = s32[3]{0} parameter(9) + fusion.1 = f32[2,20,256]{2,0,1} fusion(param_0.0, param_1.0, param_2.0, param_3.0, param_4.0, param_5.0, param_6.0, param_7.0, param_8.0, param_9.0), kind=kLoop, calls=fused_computation.1 + param_10 = pred[] parameter(10) + ROOT fusion.2 = f32[2,20,256]{2,0,1} fusion(param_0.0, param_1.0, param_2.0, param_3.0, fusion.1, param_5.0, param_10, param_7.0, param_8.0, param_9.0), kind=kLoop, calls=fused_computation.2 +} + )") + .ValueOrDie(); + EXPECT_FALSE(FusionMerger().Run(module.get()).ValueOrDie()); +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc index 0320496ea98..5a8265a53a6 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_algorithm_picker.cc @@ -115,6 +115,8 @@ static StatusOr> DoUncachedGemmAutotune( absl::optional first_algorithm; std::vector profile_results; + GpuGemmConfig config = GetGpuGemmConfig(gemm); + for (se::blas::AlgorithmType algorithm : algorithms) { // Make sure the output buffer always has the same value if we use // the bias parameter. @@ -129,8 +131,7 @@ static StatusOr> DoUncachedGemmAutotune( // for all algorithms if we're targeting < sm_50. But because we pass a // non-null ProfileResult, DoGemmWithAlgorithm should always return true, // and the actual success-ness is returned in ProfileResult::is_valid. 
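In the autotuning hunk here, the GEMM configuration is extracted from the instruction once, before the algorithm loop, and each per-algorithm run receives that config instead of the HloInstruction plus backend config. A hypothetical outline of the hoisting (names are stand-ins; the real GetGpuGemmConfig and RunGemm live in gemm_thunk.cc/h):

#include <cstdint>
#include <vector>

// Stand-in for the config built once from the instruction: shapes plus the
// backend config, all immutable across autotuning candidates.
struct GemmConfigSketch {};

bool RunGemmSketch(const GemmConfigSketch& config, int64_t algorithm) {
  (void)config;
  (void)algorithm;
  return true;  // real code launches the GEMM and fills a ProfileResult
}

void AutotuneSketch(const GemmConfigSketch& config,
                    const std::vector<int64_t>& algorithms) {
  // The config is hoisted out of the loop; only the algorithm override
  // changes per iteration.
  for (int64_t algorithm : algorithms) {
    RunGemmSketch(config, algorithm);
  }
}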
- CHECK(RunGemm(gemm, backend_config, lhs_buffer, rhs_buffer, output_buffer, - stream, + CHECK(RunGemm(config, lhs_buffer, rhs_buffer, output_buffer, stream, /*implements_whole_instruction=*/true, /*profile_index=*/-1, /*profiler=*/nullptr, diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index e55df0bb230..ea4f3951a3d 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -33,32 +33,40 @@ limitations under the License. namespace xla { namespace gpu { -GemmThunk::GemmThunk(ThunkInfo thunk_info, +GpuGemmConfig GetGpuGemmConfig(const HloInstruction *gemm) { + GpuGemmConfig config; + config.output_shape = gemm->shape(); + config.lhs_shape = gemm->operand(0)->shape(); + config.rhs_shape = gemm->operand(1)->shape(); + auto backend_config_or = gemm->backend_config(); + config.backend_config = std::move(backend_config_or.ValueOrDie()); + return config; +} + +GemmThunk::GemmThunk(ThunkInfo thunk_info, GpuGemmConfig &&config, const BufferAllocation::Slice &lhs_buffer, const BufferAllocation::Slice &rhs_buffer, const BufferAllocation::Slice &output_buffer, - bool implements_whole_instruction, - const GemmBackendConfig &backend_config) + bool implements_whole_instruction) : Thunk(Kind::kGemm, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), + config_(std::move(config)), lhs_buffer_(lhs_buffer), rhs_buffer_(rhs_buffer), output_buffer_(output_buffer), - implements_whole_instruction_(implements_whole_instruction), - backend_config_(backend_config) {} + implements_whole_instruction_(implements_whole_instruction) {} Status GemmThunk::ExecuteOnStream(const ExecuteParams ¶ms) { auto get_device_address = [&](const BufferAllocation::Slice &slice) { return params.buffer_allocations->GetDeviceAddress(slice); }; - VLOG(3) << "Running GEMM thunk on instruction: " << hlo_instruction_; + VLOG(3) << "Running GEMM thunk"; se::DeviceMemoryBase lhs_data = get_device_address(lhs_buffer_); se::DeviceMemoryBase rhs_data = get_device_address(rhs_buffer_); se::DeviceMemoryBase output_data = get_device_address(output_buffer_); - return RunGemm(hlo_instruction_, backend_config_, lhs_data, rhs_data, - output_data, params.stream, implements_whole_instruction_, - profile_index(), params.profiler); + return RunGemm(config_, lhs_data, rhs_data, output_data, params.stream, + implements_whole_instruction_, profile_index(), + params.profiler); } // This struct contains the metadata of a matrix, e.g., its base address and @@ -160,8 +168,7 @@ static bool DoGemmWithAlgorithm( .ok(); } -Status RunGemm(const HloInstruction *gemm, - const GemmBackendConfig &backend_config, +Status RunGemm(const GpuGemmConfig &gemm_config, se::DeviceMemoryBase lhs_buffer, se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, se::Stream *stream, bool implements_whole_instruction, @@ -170,14 +177,11 @@ Status RunGemm(const HloInstruction *gemm, se::blas::ProfileResult *profile_result, absl::optional algorithm) { VLOG(2) << "Executing a GemmThunk"; - CHECK(IsCublasGemm(*gemm)); - const Shape &output_shape = gemm->shape(); - const HloInstruction *lhs = gemm->operand(0); - const HloInstruction *rhs = gemm->operand(1); - - const Shape &lhs_shape = lhs->shape(); - const Shape &rhs_shape = rhs->shape(); + const Shape &output_shape = gemm_config.output_shape; + const Shape &lhs_shape = gemm_config.lhs_shape; + const Shape &rhs_shape = gemm_config.rhs_shape; + const GemmBackendConfig &backend_config = 
gemm_config.backend_config; const DotDimensionNumbers &dim_nums = backend_config.dot_dimension_numbers(); CHECK_EQ(dim_nums.lhs_batch_dimensions_size(), diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index 1a51a7d4e0c..9d6613dbe77 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -33,17 +33,26 @@ namespace gpu { // This class stores everything that StreamExecutor needs to launch a BLAS gemm. // It is generated by IrEmitter. -// + +struct GpuGemmConfig { + Shape lhs_shape; + Shape rhs_shape; + Shape output_shape; + GemmBackendConfig backend_config; +}; + +GpuGemmConfig GetGpuGemmConfig(const HloInstruction* gemm); + // This is thread-compatible. class GemmThunk : public Thunk { public: // Constructs a thunk that computes "output = (lhs rhs) * alpha" using // BLAS gemm (alpha is stored in the instruction GemmBackendConfig). - GemmThunk(ThunkInfo thunk_info, const BufferAllocation::Slice& lhs_buffer, + GemmThunk(ThunkInfo thunk_info, GpuGemmConfig&& config, + const BufferAllocation::Slice& lhs_buffer, const BufferAllocation::Slice& rhs_buffer, const BufferAllocation::Slice& output_buffer, - bool implements_whole_instruction, - const GemmBackendConfig& backend_config); + bool implements_whole_instruction); GemmThunk(const GemmThunk&) = delete; GemmThunk& operator=(const GemmThunk&) = delete; @@ -51,28 +60,27 @@ class GemmThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; + const GpuGemmConfig config_; const BufferAllocation::Slice lhs_buffer_; const BufferAllocation::Slice rhs_buffer_; const BufferAllocation::Slice output_buffer_; - bool implements_whole_instruction_; - GemmBackendConfig backend_config_; + const bool implements_whole_instruction_; }; // Run the given GEMM instruction `gemm` subject to the configuration -// in `backend_config` and the passed buffers. +// in `gemm_config` and the passed buffers. // // `implements_whole_instruction` is used for the default profiler creation // if the `profiler` is not supplied. False value indicates that the created // profiler will not specifically profile the `gemm` instruction. // // If `algorithm` is provided, it overrides the one specified in -// `backend_config`. +// `gemm_config.backend_config`. Status RunGemm( - const HloInstruction* gemm, const GemmBackendConfig& backend_config, - se::DeviceMemoryBase lhs_buffer, se::DeviceMemoryBase rhs_buffer, - se::DeviceMemoryBase output_buffer, se::Stream* stream, - bool implements_whole_instruction, absl::optional profile_index, + const GpuGemmConfig& gemm_config, se::DeviceMemoryBase lhs_buffer, + se::DeviceMemoryBase rhs_buffer, se::DeviceMemoryBase output_buffer, + se::Stream* stream, bool implements_whole_instruction, + absl::optional profile_index, HloExecutionProfiler* profiler = nullptr, se::blas::ProfileResult* profile_result = nullptr, absl::optional algorithm = absl::nullopt); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 77fcf2c59f7..feedff0e0b3 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -59,7 +59,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" #include "tensorflow/compiler/xla/service/gpu/gpu_sanitize_constant_names.h" #include "tensorflow/compiler/xla/service/gpu/gpu_scatter_expander.h" -#include "tensorflow/compiler/xla/service/gpu/horizontal_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h" #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h" @@ -91,6 +92,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/logistic_expander.h" +#include "tensorflow/compiler/xla/service/qr_expander.h" +#include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/rng_bit_generator_expander.h" #include "tensorflow/compiler/xla/service/rng_expander.h" #include "tensorflow/compiler/xla/service/slice_sinker.h" @@ -150,6 +153,8 @@ Status GpuCompiler::OptimizeHloModule( pipeline.AddPass(); pipeline.AddPass(); + // TODO(phawkins): replace QR decompositions with calls to cuSOLVER. + pipeline.AddPass(); pipeline.AddPass(); @@ -226,6 +231,7 @@ Status GpuCompiler::OptimizeHloModule( // pass.AddPass(); pass.AddPass(); + pass.AddPass(); pass.AddPass(); pass.AddPass(); } @@ -236,8 +242,7 @@ Status GpuCompiler::OptimizeHloModule( return IsMatrixMultiplication(dot) ? candidate_operands : TransposeFolding::OperandIndices{}; - }, - TransposeFolding::NeverFoldTranspose); + }); pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); @@ -301,12 +306,14 @@ Status GpuCompiler::OptimizeHloModule( TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); HloPassPipeline horizontal_fusion("horizontal_fusion"); - horizontal_fusion.AddPass(); + horizontal_fusion.AddPass(); + horizontal_fusion.AddPass(); horizontal_fusion.AddPass(/*is_layout_sensitive=*/true, /*only_fusion_computations=*/true); horizontal_fusion.AddPass(); TF_RETURN_IF_ERROR(horizontal_fusion.Run(hlo_module).status()); } + { HloPassPipeline pipeline("all_reduce_combiner"); pipeline.AddPass( @@ -483,7 +490,8 @@ static Status CompileModuleToLlvmIrImpl( int pointer_size, const HloProfileIndexMap* profile_index_map, std::unique_ptr* llvm_module, std::unique_ptr* buffer_assignment, - std::unique_ptr* thunk_schedule) { + std::unique_ptr* thunk_schedule, + std::vector* constants) { *llvm_module = absl::make_unique("", *llvm_context); (*llvm_module)->setTargetTriple(target_triple); @@ -516,7 +524,6 @@ static Status CompileModuleToLlvmIrImpl( DumpHloModuleIfEnabled(*hlo_module, **buffer_assignment, "after_optimizations"); - mlir::registerAllDialects(); mlir::MLIRContext mlir_context; IrEmitterContext ir_emitter_context( @@ -531,8 +538,6 @@ static Status CompileModuleToLlvmIrImpl( IrEmitterUnnested::Create(hlo_module->config(), entry_computation, &ir_emitter_context)); - TF_RETURN_IF_ERROR(ir_emitter->EmitConstantGlobals()); - { XLA_SCOPED_LOGGING_TIMER("GpuCompiler::RunBackend - IR emission"); @@ -581,6 +586,10 @@ static Status CompileModuleToLlvmIrImpl( *thunk_schedule = absl::make_unique( std::make_unique(std::move(thunk_sequence)), std::move(stream_assignment), std::move(thunk_to_hlo)); + + if (constants) { + *constants = std::move(ir_emitter_context.constants()); + } } return Status::OK(); @@ 
-612,6 +621,9 @@ StatusOr> GpuCompiler::RunBackend( stream_exec->GetDeviceDescription().threads_per_warp(); gpu_device_info.shared_memory_per_block = stream_exec->GetDeviceDescription().shared_memory_per_block(); + gpu_device_info.threads_per_core_limit = + stream_exec->GetDeviceDescription().threads_per_core_limit(); + gpu_device_info.core_count = stream_exec->GetDeviceDescription().core_count(); absl::optional cuda_compute_capability = [&]() -> absl::optional { @@ -646,12 +658,13 @@ StatusOr> GpuCompiler::RunBackend( std::unique_ptr llvm_module; std::unique_ptr buffer_assignment; std::unique_ptr thunk_schedule; + std::vector constants; TF_RETURN_IF_ERROR(CompileModuleToLlvmIrImpl( module.get(), &llvm_context, target_triple_, data_layout_, stream_exec->platform()->Name(), gpu_device_info, cuda_compute_capability, GetCanShareBuffer(), pointer_size_, profile_index_map.get(), &llvm_module, - &buffer_assignment, &thunk_schedule)); + &buffer_assignment, &thunk_schedule, &constants)); if (user_pre_optimization_hook_) { user_pre_optimization_hook_(*llvm_module); @@ -697,7 +710,7 @@ StatusOr> GpuCompiler::RunBackend( backend_result.first, backend_result.second, gpu_version, std::move(thunk_schedule), std::move(module), std::move(buffer_assignment), std::move(profile_printer), - std::move(profile_index_map)); + std::move(profile_index_map), std::move(constants)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); @@ -731,7 +744,7 @@ StatusOr> CompileModuleToLlvmIr( hlo_module, llvm_context, target_triple, data_layout, platform_name, gpu_device_info, cuda_compute_capability, DummyCanShareBufferFunction, pointer_size, /*profile_index_map=*/nullptr, &llvm_module, - &buffer_assignment, &thunk_schedule)); + &buffer_assignment, &thunk_schedule, nullptr)); return llvm_module; } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc index 8fb741323f3..925caadbb97 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_algorithm_picker.cc @@ -122,24 +122,28 @@ std::vector GetAlgorithms(CudnnConvKind kind, } StatusOr> GetMIOpenAlgorithms( - const HloCustomCallInstruction* conv, + const HloCustomCallInstruction* instr, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer, se::StreamExecutor* stream_exec, ScratchAllocator* scratch_allocator, se::Stream* stream) { std::vector algorithms; - TF_ASSIGN_OR_RETURN(se::dnn::ConvolutionKind kind, - GetDnnConvolutionKind(conv)); + TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(instr)); - TF_ASSIGN_OR_RETURN(se::dnn::DataType dtype, GetDnnDataType(conv)); + TF_ASSIGN_OR_RETURN(se::dnn::ConvolutionKind kind, + GetDNNConvKindFromCudnnConvKind(config.kind)); + + TF_ASSIGN_OR_RETURN(se::dnn::DataType dtype, + GetDNNDataTypeFromPrimitiveType(config.output_type)); TF_ASSIGN_OR_RETURN(GpuConvParams params, - GetGpuConvParams(conv, operand_buffers, result_buffer)); + GetGpuConvParams(config, operand_buffers, result_buffer)); bool succ = stream_exec->GetMIOpenConvolveAlgorithms( - kind, dtype, stream, params.input_descriptor, params.input_buf, - params.filter_descriptor, params.filter_buf, params.output_descriptor, - params.output_buf, params.conv_desc, scratch_allocator, &algorithms); + kind, dtype, stream, params.config.input_descriptor, params.input_buf, + 
params.config.filter_descriptor, params.filter_buf, + params.config.output_descriptor, params.output_buf, + params.config.conv_desc, scratch_allocator, &algorithms); DCHECK(succ); return algorithms; @@ -442,6 +446,8 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda( GetComputeCapability(stream_exec_), GetCudnnVersion(stream_exec_), blas_version, canonical_hlo); + TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(instr)); + for (const AlgorithmDesc& alg : GetAlgorithms(kind, stream_exec_)) { XLA_SCOPED_LOGGING_TIMER_LEVEL( absl::StrCat("CudnnConvAlgorithmPicker::PickBestAlgorithm algo ", @@ -465,7 +471,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheCuda( options.profile_result = &profile_result; options.algo_override = alg; Status launch_status = - RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer, + RunGpuConv(config, absl::MakeSpan(operand_buffers), result_buffer, &scratch_allocator, stream, options); if (!launch_status.ok()) { @@ -700,6 +706,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm( *result.mutable_run_time() = tensorflow::proto_utils::ToDurationProto( absl::Milliseconds(profile_result.elapsed_time_in_ms())); } else { + TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(instr)); for (const auto& miopen_alg : algorithms) { const auto& alg = miopen_alg.algorithm(); XLA_SCOPED_LOGGING_TIMER_LEVEL( @@ -717,7 +724,7 @@ GpuConvAlgorithmPicker::PickBestAlgorithmNoCacheRocm( options.algo_override = alg; options.scratch_size_override = miopen_alg.scratch_size(); Status launch_status = - RunGpuConv(instr, absl::MakeSpan(operand_buffers), result_buffer, + RunGpuConv(config, absl::MakeSpan(operand_buffers), result_buffer, &scratch_allocator, stream, options); if (!launch_status.ok()) { diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc index 5fa102ac785..94f9a96c0fe 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.cc @@ -313,7 +313,11 @@ bool GpuConvPaddingLegalization::CanonicalizeBackwardInputConvolution( new_backward_conv_window.mutable_dimensions(i)); } // Decreasing the padding by X *increases* the size of our output by X. - int64 dim = backward_conv_dnums.output_spatial_dimensions(i); + // Note that we have swapped input spatial dimensions with output spatial + // dimensions to be compatible with the cuDNN API, so + // input_spatial_dimensions(i) gives the i-th spatial dimension of the + // output. + int64 dim = backward_conv_dnums.input_spatial_dimensions(i); new_backward_conv_shape.set_dimensions( dim, new_backward_conv_shape.dimensions(dim) + std::abs(padding_low - padding_high)); @@ -353,7 +357,11 @@ bool GpuConvPaddingLegalization::CanonicalizeBackwardInputConvolution( for (size_t i = 0; i < backward_conv->window().dimensions_size(); ++i) { int64 padding_low = backward_conv->window().dimensions(i).padding_low(); int64 padding_high = backward_conv->window().dimensions(i).padding_high(); - int64 dim = backward_conv_dnums.output_spatial_dimensions(i); + // Note that we have swapped input spatial dimensions with output spatial + // dimensions to be compatible with the cuDNN API, so + // input_spatial_dimensions(i) gives the i-th spatial dimension of the + // output. 
+ int64 dim = backward_conv_dnums.input_spatial_dimensions(i); if (padding_low > padding_high) { // If the amount of low padding (of the old backward convolution) is // larger, we internally pad the low end of the activations and slice diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization_test.cc new file mode 100644 index 00000000000..c214486e18f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization_test.cc @@ -0,0 +1,97 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_conv_padding_legalization.h" + +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { +namespace { + +namespace op = xla::testing::opcode_matchers; +using ::testing::_; + +using GpuConvPaddingLegalizationTest = HloTestBase; + +TEST_F(GpuConvPaddingLegalizationTest, BackwardInputConvolve) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule convolution_module +ENTRY %convolution (operand f64[2,2,2,3]{3,2,1,0}) -> (f64[2,2,4,4]{3,2,1,0}, u8[0]) { + %operand = f64[2,2,2,3]{3,2,1,0} parameter(0) + %kernel = f64[2,3,2,3]{3,2,1,0} constant( + { + { /*i0=0*/ + { /*i1=0*/ + { 0.29629629629629628, 0.30246913580246915, 0.30864197530864196 }, + { 0.31481481481481483, 0.32098765432098764, 0.3271604938271605 } + }, + { /*i1=1*/ + { 0.25925925925925924, 0.26543209876543211, 0.27160493827160492 }, + { 0.27777777777777779, 0.2839506172839506, 0.29012345679012347 } + }, + { /*i1=2*/ + { 0.22222222222222221, 0.22839506172839505, 0.23456790123456789 }, + { 0.24074074074074073, 0.24691358024691357, 0.25308641975308643 } + } + }, + { /*i0=1*/ + { /*i1=0*/ + { 0.18518518518518517, 0.19135802469135801, 0.19753086419753085 }, + { 0.20370370370370369, 0.20987654320987653, 0.21604938271604937 } + }, + { /*i1=1*/ + { 0.14814814814814814, 0.15432098765432098, 0.16049382716049382 }, + { 0.16666666666666666, 0.1728395061728395, 0.17901234567901234 } + }, + { /*i2=2*/ + { 0.1111111111111111, 0.11728395061728394, 0.12345679012345678 }, + { 0.12962962962962962, 0.13580246913580246, 0.1419753086419753 } + } + } + }) + %reverse = f64[2,3,2,3]{3,2,1,0} reverse(%kernel), dimensions={0,1} + ROOT %custom-call = (f64[2,2,4,4]{3,2,1,0}, u8[0]{0}) custom-call(f64[2,2,2,3]{3,2,1,0} %operand, f64[2,3,2,3]{3,2,1,0} %reverse), 
window={size=2x3 stride=2x2 pad=0_0x0_1}, dim_labels=bf01_01io->b01f, custom_call_target="__cudnn$convBackwardInput", backend_config="{\"algorithm\":\"0\",\"tensor_ops_enabled\":false,\"conv_result_scale\":1,\"activation_mode\":\"0\",\"side_input_scale\":0}" +} + )") + .ValueOrDie(); + ASSERT_TRUE(GpuConvPaddingLegalization().Run(module.get()).ValueOrDie()); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Tuple(op::Slice(op::GetTupleElement( + op::CustomCall(kCudnnConvBackwardInputCallTarget, _, + op::Reverse(op::Constant())), + 0)), + op::GetTupleElement())); + auto slice = root->operand(0); + Shape expected_slice_shape = ShapeUtil::MakeShape(F64, {2, 2, 4, 4}); + EXPECT_TRUE(ShapeUtil::Equal(slice->shape(), expected_slice_shape)); + auto conv = slice->operand(0); + Shape expected_conv_shape = ShapeUtil::MakeShape(F64, {2, 2, 4, 5}); + EXPECT_TRUE(ShapeUtil::Equal(conv->shape(), expected_conv_shape)); +} + +} // anonymous namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc index 5cc5fa7d16d..e0ccbad3a01 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.cc @@ -79,16 +79,16 @@ Status RunGpuConvForward(GpuConvParams params, DeviceMemory filter_buf, DeviceMemory output_buf, AlgorithmConfig algorithm) { - if (params.conv_result_scale != 1) { + if (params.config.conv_result_scale != 1) { return InternalError( "StreamExecutor doesn't support scaled convolution: %lf.", - params.conv_result_scale); + params.config.conv_result_scale); } - stream->ThenConvolveWithAlgorithm( - params.input_descriptor, input_buf, params.filter_descriptor, filter_buf, - params.conv_desc, params.output_descriptor, &output_buf, - scratch_allocator, algorithm, options.profile_result); - return Status::OK(); + return stream->ConvolveWithAlgorithm( + params.config.input_descriptor, input_buf, + params.config.filter_descriptor, filter_buf, params.config.conv_desc, + params.config.output_descriptor, &output_buf, scratch_allocator, + algorithm, options.profile_result); } template @@ -103,13 +103,14 @@ Status RunGpuConvForwardActivation(GpuConvParams params, bias_desc.set_count(1) .set_height(1) .set_width(1) - .set_feature_map_count(params.output_descriptor.feature_map_count()) - .set_layout(params.output_descriptor.layout()); + .set_feature_map_count( + params.config.output_descriptor.feature_map_count()) + .set_layout(params.config.output_descriptor.layout()); se::DeviceMemory side_input(params.fusion->side_input_buf); // If there is no side input, use output as the side input. 
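The fused-convolution path below tolerates a missing side input: when no side-input buffer was provided, the side-input scale must be zero, and the output buffer is reused as the side input so the fused kernel still receives a valid operand. A self-contained sketch of that fallback (the buffer type is a placeholder; the real code uses se::DeviceMemory):

#include <stdexcept>

// Hypothetical buffer handle.
struct BufferSketch {
  void* ptr = nullptr;
  bool is_null() const { return ptr == nullptr; }
};

// Returns the side input to hand to the fused kernel. With no side-input
// buffer, the scale must be 0 and the output buffer is aliased instead,
// since scale * side_input then contributes nothing to the result.
BufferSketch ResolveSideInputSketch(BufferSketch side_input,
                                    BufferSketch output,
                                    double side_input_scale) {
  if (side_input.is_null()) {
    if (side_input_scale != 0.0) {
      throw std::runtime_error(
          "side input scale is nonzero but no side input buffer was given");
    }
    return output;
  }
  return side_input;
}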
if (side_input.is_null()) { - if (params.fusion->side_input_scale != 0) { + if (params.config.fusion->side_input_scale != 0) { return InternalError( "Side input scale is not 0, yet no side input buffer is " "provided"); @@ -123,15 +124,14 @@ Status RunGpuConvForwardActivation(GpuConvParams params, side_input = output_buf; } - stream->ThenFusedConvolveWithAlgorithm( - params.input_descriptor, input_buf, params.conv_result_scale, - params.filter_descriptor, filter_buf, params.conv_desc, side_input, - params.fusion->side_input_scale, bias_desc, - DeviceMemory(params.fusion->bias_buf), params.fusion->mode, - params.output_descriptor, &output_buf, scratch_allocator, algorithm, - options.profile_result); - - return Status::OK(); + return stream->FusedConvolveWithAlgorithm( + params.config.input_descriptor, input_buf, + params.config.conv_result_scale, params.config.filter_descriptor, + filter_buf, params.config.conv_desc, side_input, + params.config.fusion->side_input_scale, bias_desc, + DeviceMemory(params.fusion->bias_buf), + params.config.fusion->mode, params.config.output_descriptor, &output_buf, + scratch_allocator, algorithm, options.profile_result); } // StreamExecutor supports various data types via overloading, and the support @@ -152,31 +152,33 @@ Status RunGpuConvInternalImpl(GpuConvParams params, DeviceMemory filter_buf, DeviceMemory output_buf, AlgorithmConfig algorithm) { - switch (params.kind) { + switch (params.config.kind) { case CudnnConvKind::kForward: return RunGpuConvForward(params, scratch_allocator, stream, options, input_buf, filter_buf, output_buf, algorithm); case CudnnConvKind::kBackwardInput: - if (params.conv_result_scale != 1) { + if (params.config.conv_result_scale != 1) { return InternalError( "StreamExecutor doesn't support scaled convolution: %lf.", - params.conv_result_scale); + params.config.conv_result_scale); } - stream->ThenConvolveBackwardDataWithAlgorithm( - params.filter_descriptor, filter_buf, params.output_descriptor, - output_buf, params.conv_desc, params.input_descriptor, &input_buf, - scratch_allocator, algorithm, options.profile_result); + return stream->ConvolveBackwardDataWithAlgorithm( + params.config.filter_descriptor, filter_buf, + params.config.output_descriptor, output_buf, params.config.conv_desc, + params.config.input_descriptor, &input_buf, scratch_allocator, + algorithm, options.profile_result); break; case CudnnConvKind::kBackwardFilter: - if (params.conv_result_scale != 1) { + if (params.config.conv_result_scale != 1) { return InternalError( "StreamExecutor doesn't support scaled convolution: %lf.", - params.conv_result_scale); + params.config.conv_result_scale); } - stream->ThenConvolveBackwardFilterWithAlgorithm( - params.input_descriptor, input_buf, params.output_descriptor, - output_buf, params.conv_desc, params.filter_descriptor, &filter_buf, - scratch_allocator, algorithm, options.profile_result); + return stream->ConvolveBackwardFilterWithAlgorithm( + params.config.input_descriptor, input_buf, + params.config.output_descriptor, output_buf, params.config.conv_desc, + params.config.filter_descriptor, &filter_buf, scratch_allocator, + algorithm, options.profile_result); break; case CudnnConvKind::kForwardActivation: { return RunGpuConvForwardActivation( @@ -198,7 +200,7 @@ Status RunGpuConvInternalImpl(GpuConvParams params, DeviceMemory filter_buf, DeviceMemory output_buf, AlgorithmConfig algorithm) { - switch (params.kind) { + switch (params.config.kind) { case CudnnConvKind::kForward: return RunGpuConvForward(params, 
scratch_allocator, stream, options, input_buf, filter_buf, output_buf, algorithm); @@ -221,7 +223,7 @@ Status RunGpuConvImpl(const GpuConvParams& params, auto input_buf = se::DeviceMemory(params.input_buf); auto filter_buf = se::DeviceMemory(params.filter_buf); auto output_buf = se::DeviceMemory(params.output_buf); - AlgorithmConfig algorithm = params.algorithm; + AlgorithmConfig algorithm = params.config.algorithm; if (options.algo_override.has_value()) { algorithm = AlgorithmConfig(*options.algo_override); @@ -241,7 +243,8 @@ Status RunGpuConvImpl(const GpuConvParams& params, if (!stream->ok()) { return InternalError( "Unable to launch convolution with type %s and algorithm (%d, %s)", - CudnnConvKindToString(params.kind), algorithm.algorithm()->algo_id(), + CudnnConvKindToString(params.config.kind), + algorithm.algorithm()->algo_id(), algorithm.algorithm_no_scratch().has_value() ? absl::StrCat(algorithm.algorithm_no_scratch()->algo_id()) : "none"); @@ -251,95 +254,83 @@ Status RunGpuConvImpl(const GpuConvParams& params, } // anonymous namespace -StatusOr GetGpuConvParams( - const HloCustomCallInstruction* conv, - absl::Span operand_buffers, - se::DeviceMemoryBase result_buffer) { - GpuConvParams params; +StatusOr GetGpuConvConfig( + const HloCustomCallInstruction* cudnn_call) { + GpuConvConfig config; + + config.input_type = cudnn_call->operand(0)->shape().element_type(); + config.output_type = cudnn_call->shape().tuple_shapes(0).element_type(); TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config, - conv->backend_config()); - TF_ASSIGN_OR_RETURN(params.kind, GetCudnnConvKind(conv)); - const Shape* input_shape; - const Shape* filter_shape; - const Shape* output_shape; + cudnn_call->backend_config()); + TF_ASSIGN_OR_RETURN(config.kind, GetCudnnConvKind(cudnn_call)); // The third field is scratch size stored from conv_algorithm_picker // The operand is added to the shape field of the conv instruction // in GpuConvAlgorithmPicker::RunOnInstruction() call. 
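The rewrite in this hunk splits the old GetGpuConvParams into two stages: GetGpuConvConfig derives everything knowable from the custom-call instruction itself (kind, shapes, descriptors, algorithm, scales), and GetGpuConvParams later pairs that config with the runtime buffer addresses for a particular launch. A hypothetical outline of the two-stage shape (types and fields are stand-ins, not the real declarations):

#include <cstdint>

// Compile-time view: filled from the HLO custom call, no device pointers.
struct ConvConfigSketch {
  int kind = 0;            // forward / backward-input / backward-filter / ...
  int64_t algorithm = 0;   // chosen by the algorithm picker
  double result_scale = 1.0;
};

// Runtime view: the config plus the buffers for one launch.
struct ConvParamsSketch {
  ConvConfigSketch config;
  void* input_buf;
  void* filter_buf;
  void* output_buf;
};

ConvParamsSketch BindBuffersSketch(const ConvConfigSketch& config, void* input,
                                   void* filter, void* output) {
  // Which operand maps to which cuDNN tensor depends on config.kind in the
  // real code; the sketch keeps only the forward-convolution mapping.
  return ConvParamsSketch{config, input, filter, output};
}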
- params.algorithm = se::dnn::AlgorithmConfig( + config.algorithm = se::dnn::AlgorithmConfig( se::dnn::AlgorithmDesc(backend_config.algorithm(), backend_config.tensor_ops_enabled()), - conv->shape().tuple_shapes(1).dimensions(0)); - params.conv_result_scale = backend_config.conv_result_scale(); + cudnn_call->shape().tuple_shapes(1).dimensions(0)); + config.conv_result_scale = backend_config.conv_result_scale(); - switch (params.kind) { + Shape operand0_shape = cudnn_call->operand(0)->shape(); + Shape operand1_shape = cudnn_call->operand(1)->shape(); + Shape result_shape = cudnn_call->shape().tuple_shapes(0); + + switch (config.kind) { case CudnnConvKind::kForward: - input_shape = &conv->operand(0)->shape(); - filter_shape = &conv->operand(1)->shape(); - output_shape = &conv->shape().tuple_shapes(0); - params.input_buf = operand_buffers[0]; - params.filter_buf = operand_buffers[1]; - params.output_buf = result_buffer; + case CudnnConvKind::kForwardActivation: + config.input_shape = operand0_shape; + config.filter_shape = operand1_shape; + config.output_shape = result_shape; break; case CudnnConvKind::kBackwardInput: - input_shape = &conv->shape().tuple_shapes(0); - filter_shape = &conv->operand(1)->shape(); - output_shape = &conv->operand(0)->shape(); - params.input_buf = result_buffer; - params.filter_buf = operand_buffers[1]; - params.output_buf = operand_buffers[0]; + config.input_shape = result_shape; + config.filter_shape = operand1_shape; + config.output_shape = operand0_shape; break; case CudnnConvKind::kBackwardFilter: - input_shape = &conv->operand(0)->shape(); - filter_shape = &conv->shape().tuple_shapes(0); - output_shape = &conv->operand(1)->shape(); - params.input_buf = operand_buffers[0]; - params.filter_buf = result_buffer; - params.output_buf = operand_buffers[1]; + config.input_shape = operand0_shape; + config.filter_shape = result_shape; + config.output_shape = operand1_shape; break; - case CudnnConvKind::kForwardActivation: { - input_shape = &conv->operand(0)->shape(); - filter_shape = &conv->operand(1)->shape(); - output_shape = &conv->shape().tuple_shapes(0); - params.fusion.emplace(); - GpuConvParams::FusionParams& fusion = *params.fusion; - if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) { - return InternalError("Bad activation mode: %s", - backend_config.ShortDebugString()); - } - fusion.mode = static_cast( - backend_config.activation_mode()); - fusion.side_input_scale = backend_config.side_input_scale(); - params.input_buf = operand_buffers[0]; - params.filter_buf = operand_buffers[1]; - params.output_buf = result_buffer; - params.fusion->bias_buf = operand_buffers[2]; - if (operand_buffers.size() >= 4) { - params.fusion->side_input_buf = operand_buffers[3]; - } - } + default: + return InternalError("Unknown convolution kind"); } - const Window& window = conv->window(); + if (config.kind == CudnnConvKind::kForwardActivation) { + config.fusion.emplace(); + GpuConvConfig::FusionConfig& fusion = *config.fusion; + if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) { + return InternalError("Bad activation mode: %s", + backend_config.ShortDebugString()); + } + fusion.mode = + static_cast(backend_config.activation_mode()); + fusion.side_input_scale = backend_config.side_input_scale(); + } + + const Window& window = cudnn_call->window(); const ConvolutionDimensionNumbers& dnums = - conv->convolution_dimension_numbers(); + cudnn_call->convolution_dimension_numbers(); VLOG(3) << "Convolution Algorithm: " - << 
params.algorithm.algorithm()->algo_id(); + << config.algorithm.algorithm()->algo_id(); VLOG(3) << "tensor_ops_enabled: " - << params.algorithm.algorithm()->tensor_ops_enabled(); - VLOG(3) << "Convolution kind: " << CudnnConvKindToString(params.kind); - VLOG(3) << "input shape: " << ShapeUtil::HumanStringWithLayout(*input_shape); + << config.algorithm.algorithm()->tensor_ops_enabled(); + VLOG(3) << "Convolution kind: " << CudnnConvKindToString(config.kind); + VLOG(3) << "input shape: " + << ShapeUtil::HumanStringWithLayout(config.input_shape); VLOG(3) << "filter shape: " - << ShapeUtil::HumanStringWithLayout(*filter_shape); + << ShapeUtil::HumanStringWithLayout(config.filter_shape); VLOG(3) << "Output shape: " - << ShapeUtil::HumanStringWithLayout(*output_shape); + << ShapeUtil::HumanStringWithLayout(config.output_shape); VLOG(3) << "Window: { " << window.ShortDebugString() << " }"; VLOG(3) << "Dim nums: { " << dnums.ShortDebugString() << " }"; const int num_dimensions = window.dimensions_size(); - CHECK_LE(num_dimensions, 3) << conv->ToString(); + CHECK_LE(num_dimensions, 3) << cudnn_call->ToString(); // cuDNN does not support 1D convolutions. We therefore express 1D // convolutions as 2D convolutions where the first spatial dimension is 1. @@ -353,18 +344,18 @@ StatusOr GetGpuConvParams( window.dimensions_size() > 0 && window.dimensions()[0].window_reversal(); CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size()) - << conv->ToString(); + << cudnn_call->ToString(); CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size()) - << conv->ToString(); + << cudnn_call->ToString(); CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size()) - << conv->ToString(); + << cudnn_call->ToString(); for (const WindowDimension& dim : window.dimensions()) { - CHECK_EQ(dims_reversed, dim.window_reversal()) << conv->ToString(); - CHECK_EQ(dim.padding_low(), dim.padding_high()) << conv->ToString(); + CHECK_EQ(dims_reversed, dim.window_reversal()) << cudnn_call->ToString(); + CHECK_EQ(dim.padding_low(), dim.padding_high()) << cudnn_call->ToString(); CHECK_EQ(dim.base_dilation(), 1) << "cudnn does not support base dilation; it " "must be made explicit with a kPad: " - << conv->ToString(); + << cudnn_call->ToString(); } // cuDNN's convolution APIs support the BDYX layout for activations/output and @@ -373,12 +364,16 @@ StatusOr GetGpuConvParams( FilterLayout filter_dl; DataLayout output_dl; + const Shape* input_shape = &config.input_shape; + const Shape* filter_shape = &config.filter_shape; + const Shape* output_shape = &config.output_shape; + TF_ASSIGN_OR_RETURN(std::tie(input_dl, filter_dl, output_dl), XlaConvLayoutsToStreamExecutorLayouts( dnums, input_shape->layout(), filter_shape->layout(), output_shape->layout())); - BatchDescriptor& input_descriptor = params.input_descriptor; + BatchDescriptor& input_descriptor = config.input_descriptor; input_descriptor = BatchDescriptor(effective_num_dimensions); input_descriptor.set_layout(input_dl) .set_feature_map_count( @@ -391,7 +386,7 @@ StatusOr GetGpuConvParams( input_shape->dimensions(dnums.input_spatial_dimensions(dim))); } - FilterDescriptor& filter_descriptor = params.filter_descriptor; + FilterDescriptor& filter_descriptor = config.filter_descriptor; filter_descriptor = FilterDescriptor(effective_num_dimensions); filter_descriptor.set_layout(filter_dl) .set_input_feature_map_count( @@ -404,11 +399,11 @@ StatusOr GetGpuConvParams( filter_shape->dimensions(dnums.kernel_spatial_dimensions(dim))); } - params.conv_desc = 
ConvolutionDescriptor(effective_num_dimensions); - params.conv_desc.set_group_count(conv->feature_group_count()); - params.conv_desc.set_convolution_not_crosscorr(dims_reversed); + config.conv_desc = ConvolutionDescriptor(effective_num_dimensions); + config.conv_desc.set_group_count(cudnn_call->feature_group_count()); + config.conv_desc.set_convolution_not_crosscorr(dims_reversed); for (int dim = 0; dim < num_dimensions; ++dim) { - params.conv_desc + config.conv_desc .set_zero_padding( static_cast(effective_num_dimensions - dim - 1), window.dimensions(dim).padding_low()) @@ -420,7 +415,7 @@ StatusOr GetGpuConvParams( window.dimensions(dim).window_dilation()); } - BatchDescriptor& output_descriptor = params.output_descriptor; + BatchDescriptor& output_descriptor = config.output_descriptor; output_descriptor = BatchDescriptor(effective_num_dimensions); output_descriptor.set_layout(output_dl) .set_feature_map_count( @@ -437,32 +432,70 @@ StatusOr GetGpuConvParams( input_descriptor.set_spatial_dim(static_cast(dim), 1); output_descriptor.set_spatial_dim(static_cast(dim), 1); filter_descriptor.set_spatial_dim(static_cast(dim), 1); - params.conv_desc.set_zero_padding(static_cast(dim), 0) + config.conv_desc.set_zero_padding(static_cast(dim), 0) .set_filter_stride(static_cast(dim), 1); } + return config; +} + +StatusOr GetGpuConvParams( + const GpuConvConfig& config, + absl::Span operand_buffers, + se::DeviceMemoryBase result_buffer) { + GpuConvParams params; + params.config = config; + + switch (config.kind) { + case CudnnConvKind::kForward: + case CudnnConvKind::kForwardActivation: + params.input_buf = operand_buffers[0]; + params.filter_buf = operand_buffers[1]; + params.output_buf = result_buffer; + break; + case CudnnConvKind::kBackwardInput: + params.input_buf = result_buffer; + params.filter_buf = operand_buffers[1]; + params.output_buf = operand_buffers[0]; + break; + case CudnnConvKind::kBackwardFilter: + params.input_buf = operand_buffers[0]; + params.filter_buf = result_buffer; + params.output_buf = operand_buffers[1]; + break; + } + + if (config.kind == CudnnConvKind::kForwardActivation) { + params.fusion.emplace(); + GpuConvParams::FusionParams& fusion = *params.fusion; + fusion.bias_buf = operand_buffers[2]; + if (operand_buffers.size() >= 4) { + fusion.side_input_buf = operand_buffers[3]; + } + } + return params; } -Status RunGpuConv(const HloCustomCallInstruction* conv, +Status RunGpuConv(const gpu::GpuConvConfig& config, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer, se::DeviceMemoryBase scratch_buf, se::Stream* stream, RunConvOptions options) { ScratchBufAllocator scratch_allocator(scratch_buf); - return RunGpuConv(conv, operand_buffers, result_buffer, &scratch_allocator, + return RunGpuConv(config, operand_buffers, result_buffer, &scratch_allocator, stream, options); } -Status RunGpuConv(const HloCustomCallInstruction* conv, +Status RunGpuConv(const gpu::GpuConvConfig& config, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer, se::ScratchAllocator* scratch_allocator, se::Stream* stream, RunConvOptions options) { TF_ASSIGN_OR_RETURN(GpuConvParams params, - GetGpuConvParams(conv, operand_buffers, result_buffer)); + GetGpuConvParams(config, operand_buffers, result_buffer)); - PrimitiveType input_primitive_type = conv->operand(0)->shape().element_type(); + PrimitiveType input_primitive_type = config.input_type; switch (input_primitive_type) { case F16: return RunGpuConvImpl( @@ -474,8 +507,7 @@ Status RunGpuConv(const HloCustomCallInstruction* conv, 
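// [Editorial sketch, not part of the diff] Minimal example of how the split API
// introduced above might be used: GetGpuConvConfig is called once while the HLO
// custom call is still available, and only the config plus device buffers are
// needed at execution time. Names are placeholders and error handling is
// elided.
Status RunConvExample(const HloCustomCallInstruction* custom_call,
                      absl::Span<se::DeviceMemoryBase> operand_buffers,
                      se::DeviceMemoryBase result_buffer,
                      se::DeviceMemoryBase scratch_buffer, se::Stream* stream) {
  // Derive the static description (shapes, layouts, algorithm, fusion) once.
  TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(custom_call));
  // Bind buffers and launch; RunGpuConv calls GetGpuConvParams internally.
  return RunGpuConv(config, operand_buffers, result_buffer, scratch_buffer,
                    stream);
}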
return RunGpuConvImpl(params, scratch_allocator, stream, options); case S8: { - PrimitiveType output_primitive_type = - conv->shape().tuple_shapes(0).element_type(); + PrimitiveType output_primitive_type = config.output_type; switch (output_primitive_type) { case F32: return RunGpuConvImpl(params, scratch_allocator, @@ -484,12 +516,11 @@ Status RunGpuConv(const HloCustomCallInstruction* conv, return RunGpuConvImpl(params, scratch_allocator, stream, options); default: - return Unimplemented("Unimplemented convolution %s", - conv->ToString()); + return Unimplemented("Unimplemented convolution"); } } default: - return Unimplemented("Unimplemented convolution %s", conv->ToString()); + return Unimplemented("Unimplemented convolution"); } } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h b/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h index 3b8ce0f0f1c..5d27e6d6da7 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_CONV_RUNNER_H_ #include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" @@ -25,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/dnn.h" namespace xla { namespace gpu { @@ -40,10 +42,10 @@ struct RunConvOptions { absl::optional scratch_size_override; }; -// Implementation struct exposed for debugging and log analysis. -struct GpuConvParams { - // Here are the fields related to cuDNN's fused convolution. The result thus - // is defined as: +// Structure to describe static properties of a GPU convolution. +struct GpuConvConfig { + // Field related to cuDNN's fused convolution are in FusionConfig & + // FusionParams structures. The result thus is defined as: // activation(conv_result_scale * conv(x, w) + // side_input_scale * side_input + broadcast(bias)) // @@ -54,23 +56,39 @@ struct GpuConvParams { // added to the final results. // // side_input_buf, if valid, must have the same shape as the output buffer. - struct FusionParams { + struct FusionConfig { se::dnn::ActivationMode mode; double side_input_scale; + }; + + PrimitiveType input_type; + PrimitiveType output_type; + CudnnConvKind kind; + se::dnn::AlgorithmConfig algorithm; + double conv_result_scale; + + se::dnn::BatchDescriptor input_descriptor; + se::dnn::FilterDescriptor filter_descriptor; + se::dnn::BatchDescriptor output_descriptor; + se::dnn::ConvolutionDescriptor conv_desc; + + Shape input_shape; + Shape filter_shape; + Shape output_shape; + absl::optional fusion; +}; + +// Implementation struct exposed for debugging and log analysis. 
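// [Editorial sketch, not part of the diff] For a conv + bias + ReLU fusion with
// no side input, the static fields of GpuConvConfig would be populated roughly
// as below (the values and the kRelu enum name are illustrative assumptions):
GpuConvConfig fused_example;
fused_example.kind = CudnnConvKind::kForwardActivation;
fused_example.conv_result_scale = 1.0;
fused_example.fusion.emplace();
fused_example.fusion->mode = se::dnn::ActivationMode::kRelu;  // assumed name
fused_example.fusion->side_input_scale = 0.0;  // side input not used
// The runner then computes relu(1.0 * conv(x, w) + 0.0 * side_input + bias),
// matching the formula in the comment above; the bias and side-input buffers
// themselves live in GpuConvParams::FusionParams, bound at execution time.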
+struct GpuConvParams { + GpuConvConfig config; + struct FusionParams { se::DeviceMemoryBase bias_buf; se::DeviceMemoryBase side_input_buf; // nullable }; - CudnnConvKind kind; - se::dnn::BatchDescriptor input_descriptor; - se::dnn::FilterDescriptor filter_descriptor; - se::dnn::BatchDescriptor output_descriptor; se::DeviceMemoryBase input_buf; se::DeviceMemoryBase filter_buf; se::DeviceMemoryBase output_buf; - se::dnn::ConvolutionDescriptor conv_desc; - se::dnn::AlgorithmConfig algorithm; - double conv_result_scale; absl::optional fusion; }; @@ -89,21 +107,24 @@ struct GpuConvParams { // allocator and take note of how much memory is used. The next time you call // the same conv, you can provide an explicitly preallocated scratch buffer of // that size, if you like. -Status RunGpuConv(const HloCustomCallInstruction* conv, +Status RunGpuConv(const GpuConvConfig& conv_config, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer, se::DeviceMemoryBase scratch_buf, se::Stream* stream, RunConvOptions = {}); -Status RunGpuConv(const HloCustomCallInstruction* conv, +Status RunGpuConv(const GpuConvConfig& conv_config, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer, se::ScratchAllocator* scratch_allocator, se::Stream* stream, RunConvOptions = {}); +StatusOr GetGpuConvConfig( + const HloCustomCallInstruction* cudnn_call); + // Implementation details exposed for debugging and log analysis. StatusOr GetGpuConvParams( - const HloCustomCallInstruction* conv, + const GpuConvConfig& conv_config, absl::Span operand_buffers, se::DeviceMemoryBase result_buffer); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_device_info.h b/tensorflow/compiler/xla/service/gpu/gpu_device_info.h index 7352bad1a66..afb773c4527 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_device_info.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_device_info.h @@ -32,6 +32,8 @@ struct GpuDeviceInfo { int threads_per_block_limit; int threads_per_warp; int shared_memory_per_block; + int threads_per_core_limit; + int core_count; }; } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 726f1963545..1a0d1e0beb6 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -60,14 +60,16 @@ GpuExecutable::GpuExecutable( std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map) + std::unique_ptr hlo_profile_index_map, + std::vector globals) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)), text_(text), binary_(binary), gpu_version_(gpu_version), thunk_schedule_(std::move(thunk_schedule)), - assignment_(std::move(assignment)) { + assignment_(std::move(assignment)), + constants_(std::move(globals)) { CHECK(has_module() && assignment_); GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(), assignment_); @@ -280,28 +282,23 @@ GpuExecutable::ResolveConstantGlobals(se::Stream* stream) { se::ModuleHandle module_handle; TF_RETURN_IF_ERROR(executor->LoadModule(module_spec, &module_handle)); - for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); - ++i) { - const BufferAllocation& allocation = assignment_->GetAllocation(i); - if (allocation.is_constant()) { - TF_ASSIGN_OR_RETURN( - se::DeviceMemoryBase global, - executor->GetUntypedSymbol( - 
llvm_ir::ConstantBufferAllocationToGlobalName(allocation), - module_handle)); - VLOG(3) << "Resolved global " - << llvm_ir::ConstantBufferAllocationToGlobalName(allocation) - << " to " << global.opaque(); - InsertOrDie(&globals, i, global); + for (const auto& info : constants_) { + const Literal& literal = info.content; - const Literal& literal = - llvm_ir::LiteralForConstantAllocation(allocation); - CHECK(literal.shape().IsArray()); - if (!ShouldEmitLiteralInLlvmIr(literal)) { - VLOG(3) << "H2D memcpy for constant with shape " - << ShapeUtil::HumanString(literal.shape()); - stream->ThenMemcpy(&global, literal.untyped_data(), allocation.size()); - } + TF_ASSIGN_OR_RETURN(auto global, executor->GetUntypedSymbol( + info.symbol_name, module_handle)); + VLOG(3) << "Resolved global " << info.symbol_name << " to " + << global.opaque(); + + CHECK(literal.shape().IsArray()); + if (!ShouldEmitLiteralInLlvmIr(literal)) { + VLOG(3) << "H2D memcpy for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + stream->ThenMemcpy(&global, literal.untyped_data(), literal.size_bytes()); + } + + if (info.allocation_index != -1) { + InsertOrDie(&globals, info.allocation_index, global); } } @@ -334,7 +331,11 @@ StatusOr GpuExecutable::BufferForAllocation( } return registered_buffer; } else if (allocation.is_constant()) { - return FindOrDie(*globals, arg_idx); + auto it = globals->find(arg_idx); + if (it == globals->end()) { + return se::DeviceMemoryBase(); + } + return it->second; } else { // Allocate each allocation that might escape, or is the temp buffer. CHECK(allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()); @@ -449,8 +450,7 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); const Shape& root_shape = root->shape(); auto device_ordinal = executor->device_ordinal(); - ExecutionOutput result(/*on_host_shape=*/root->shape(), - /*on_device_shape=*/root->shape(), memory_allocator, + ExecutionOutput result(/*on_device_shape=*/root->shape(), memory_allocator, device_ordinal); TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations, diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 516fa9b269a..613880fd44b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -49,6 +49,12 @@ namespace gpu { // This is an immutable data type after initialization, and thus thread safe. class GpuExecutable : public Executable { public: + struct ConstantInfo { + std::string symbol_name; + xla::Literal content; + int allocation_index = -1; + }; + // We need to share ownership of hlo_module and assignment with profiler to // safely keep a reference to these objects during tracing period, thus they // are passed as shared pointers. 
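// [Editorial sketch, not part of the diff] Each constant emitted as an LLVM
// global is expected to be described by one ConstantInfo entry; a hypothetical
// example of how the compiler side might populate the vector passed to the
// constructor:
std::vector<GpuExecutable::ConstantInfo> constants;
GpuExecutable::ConstantInfo info;
info.symbol_name = "buffer_for_constant_7";  // name of the emitted global
info.content = LiteralUtil::CreateR1<float>({1.0f, 2.0f, 3.0f});
info.allocation_index = 7;  // -1 when no buffer allocation refers to it
constants.push_back(std::move(info));
// ResolveConstantGlobals() above then looks the symbol up with
// GetUntypedSymbol(), copies the literal to the device if it was not emitted
// directly in the IR, and records the allocation_index -> global mapping.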
@@ -58,7 +64,8 @@ class GpuExecutable : public Executable { std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, - std::unique_ptr hlo_profile_index_map); + std::unique_ptr hlo_profile_index_map, + std::vector constants); ~GpuExecutable() override; int64 SizeOfGeneratedCodeInBytes() const override; @@ -169,6 +176,8 @@ class GpuExecutable : public Executable { std::map module_globals_ TF_GUARDED_BY(module_handle_mutex_); + std::vector constants_; + TF_DISALLOW_COPY_AND_ASSIGN(GpuExecutable); }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index bb4184ff76f..b69b32c17c5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -143,29 +143,27 @@ bool IsInputFusibleReduction(const HloInstruction& instr) { IsReductionFromOrToContiguousDimensions(instr); } +const HloInstruction* GetRealHeroForMultiOutputFusion( + const HloInstruction& instr) { + if (instr.opcode() != HloOpcode::kFusion) { + return &instr; + } + auto fused_expression_root = instr.fused_expression_root(); + if (!instr.IsMultiOutputFusion()) { + return fused_expression_root; + } + // If possible, we want to pick a reduction-from-or-to-contiguous-dims + // operand of the fusion root, because it has the most constraints. + for (const auto* inst : fused_expression_root->operands()) { + if (IsReductionFromOrToContiguousDimensions(*inst)) { + return inst; + } + } + return fused_expression_root->operands()[0]; +} + bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, const HloInstruction& instr2) { - // Returns the instructions that determines the emitter used for lowering, - // sometimes referred to as "the real hero". - auto get_real_hero = - [&](const HloInstruction* instr) -> const HloInstruction* { - if (instr->opcode() != HloOpcode::kFusion) { - return instr; - } - auto fused_expression_root = instr->fused_expression_root(); - if (!instr->IsMultiOutputFusion()) { - return fused_expression_root; - } - // If possible, we want to pick a reduction-to-vector operand of the - // fusion root, because it has the most constraints. - for (const auto* inst : fused_expression_root->operands()) { - if (IsReductionFromOrToContiguousDimensions(*inst)) { - return inst; - } - } - return fused_expression_root->operands()[0]; - }; - // Multi-output fusion kernels share a common parallel loop. The loop // dimensions are determined by instruction shapes. auto get_loop_shape = [&](const HloInstruction* element_instr) { @@ -181,8 +179,8 @@ bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1, // root ops should have equal output shapes. An exception are // reduction-to-vector ops. Here the input shapes of the reduction (first // operand shape) and the reduction dimensions need to match. 
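// [Editorial note, not part of the diff] The rule applied below, restated as a
// small helper: the loop shape both fusions must agree on is the reduction
// *input* shape when the real hero is a reduction-from-or-to-contiguous-dims
// op, and the hero's own output shape otherwise.
auto loop_shape_of = [](const HloInstruction* hero) -> const Shape& {
  return IsReductionFromOrToContiguousDimensions(*hero)
             ? hero->operand(0)->shape()
             : hero->shape();
};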
- auto* instr_1 = get_real_hero(&instr1); - auto* instr_2 = get_real_hero(&instr2); + auto* instr_1 = GetRealHeroForMultiOutputFusion(instr1); + auto* instr_2 = GetRealHeroForMultiOutputFusion(instr2); if (IsReductionFromOrToContiguousDimensions(*instr_1) && IsReductionFromOrToContiguousDimensions(*instr_2) && !AreFusedReductionOutputsConsistent({instr_1, instr_2}, instr_1)) { @@ -240,28 +238,42 @@ bool IsLoopFusible(const HloInstruction& instr) { instr.opcode() == HloOpcode::kTranspose); } -bool IsFusible(const HloInstruction& instr) { - return IsInputFusible(instr) || IsLoopFusible(instr); -} - bool IsProducerConsumerFusible(const HloInstruction& producer, const HloInstruction& consumer) { - if (!IsLoopFusible(producer) || !IsFusible(consumer)) { + if (!IsLoopFusible(producer)) { + VLOG(5) << "Producer " << producer.name() << " is not loop-fusible"; return false; } + + if (!IsInputFusible(consumer) && !IsLoopFusible(consumer)) { + VLOG(5) << "Consumer " << consumer.name() + << "is not input-fusible and not loop-fusible"; + return false; + } + // Skip multiple output fusion. It's not yet supported. if (producer.IsMultiOutputFusion()) { + VLOG(5) << "Producer " << producer.name() + << " is not fusible as it is a multi-output fusion"; return false; } + if (CreatesNestedLoop(producer, consumer)) { + VLOG(5) << "Fusing " << producer.name() << " into " << consumer.name() + << " creates nested loop"; return false; } + // Do not fuse into reduce input fusions if the resulting kernel would suffer // from poor data locality (due to unfriendly input layouts). if (IsInputFusibleReduction(consumer) && !LayoutsAreReduceInputFusionFriendly(producer, consumer)) { + VLOG(5) << "Layout of " << producer.name() + << " is not fusion-friendly for consumer reduction " + << consumer.name(); return false; } + // Fuse scalar constants into loop fusion nodes. This reduces the number of // parameters and makes matching scalar broadcasts easier. // @@ -270,10 +282,14 @@ bool IsProducerConsumerFusible(const HloInstruction& producer, // but fused constants are handled by shrared CPU/GPU code and always emitted // in the IR/PTX. The external constant representation makes for faster // compiles and significantly smaller assembly code. - if (producer.opcode() == HloOpcode::kConstant) { - return ShapeUtil::IsEffectiveScalar(producer.shape()) && - consumer.opcode() == HloOpcode::kFusion; + if (producer.opcode() == HloOpcode::kConstant && + (!ShapeUtil::IsEffectiveScalar(producer.shape()) || + consumer.opcode() != HloOpcode::kFusion)) { + VLOG(5) << "Not fusing constant " << producer.name() << " into " + << consumer.name(); + return false; } + return true; } @@ -347,8 +363,13 @@ static int64 SharedMemoryUsage(const HloInstruction& instr) { // This limit is also often good for performance. In a fusion with many // operands, each GPU thread likely has to do a lot of work, and so possibly // uses a lot of registers, thus limiting occupancy. +// +// If the fusion is a producer/consumer fusion and instr1 is the +// consumer and instr2 is the producer, set is_consumer_producer_fusion +// to true to enable more fusion. 
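// [Editorial, self-contained illustration; not part of the diff] The budget the
// function below enforces, reduced to plain integers (the real code counts
// distinct HloInstruction operands and output buffers of the merged fusion):
constexpr int kMaxOperandsAndOutputsExample = 64;

bool WouldBeTooLargeExample(int merged_operand_count, int num_outputs,
                            int consumer_operand_count,
                            bool is_consumer_producer_fusion) {
  // A producer/consumer fusion that does not add operands compared to the
  // consumer alone cannot get bigger, so it is always accepted.
  if (is_consumer_producer_fusion &&
      merged_operand_count <= consumer_operand_count) {
    return false;
  }
  return merged_operand_count + num_outputs > kMaxOperandsAndOutputsExample;
}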
bool FusionWouldBeTooLarge(const HloInstruction& instr1, - const HloInstruction& instr2) { + const HloInstruction& instr2, + bool is_consumer_producer_fusion) { if (SharedMemoryUsage(instr1) + SharedMemoryUsage(instr2) > kSharedMemoryBudgetInBytes) { VLOG(5) << "Shared memory usage of fusion of " << instr1.ToString() @@ -404,6 +425,17 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, // producer -> consumer relationship. operands.erase(&instr1); operands.erase(&instr2); + + // If we generate the same numbers of inputs and outputs as + // before, it won't be bigger after fusion. So accept the fusion. + // As this is a consumer_producer fusion, this does not change the + // consumer numbers of output. So no need to check it. + if (is_consumer_producer_fusion && + operands.size() <= instr1.operands().size()) { + return false; + } + + // Does the new fusion have more operands and outputs than the max? return operands.size() + num_output_buffers > kMaxOperandsAndOutputsPerFusion; } @@ -490,5 +522,24 @@ HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& /*producer*/, : HloInstruction::FusionKind::kLoop; } +bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr, + const HloInstruction& consumer) { + return absl::c_all_of(instr.users(), [&](const HloInstruction* user) { + if (user->opcode() == HloOpcode::kGetTupleElement) { + // Skip GTE. + return IsConsumerTheOnlyNonRootUser(*user, consumer); + } + if (user == &consumer) { + // `user` is `consumer`. + return true; + } + if (user == user->parent()->root_instruction()) { + // Consumed by ROOT. + return true; + } + return false; + }); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h index e2a42ecb0a3..9fa098a3394 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h @@ -26,11 +26,6 @@ namespace gpu { constexpr int64 kMaxOperandsAndOutputsPerFusion = 64; -// Whether 'instr' can occur inside fusions, i.e. whether it is a candidate -// for being fused. Note that further restrictions apply, e.g. Scatter must -// be the root of an input fusion. -bool IsFusible(const HloInstruction& instr); - bool IsInputFusible(const HloInstruction& instr); bool IsLoopFusible(const HloInstruction& instr); @@ -64,14 +59,23 @@ bool IsInputFusibleScatter(const HloInstruction& instr); // Determines whether the combination of `instr1` and `instr2` into a (possibly // multi-output) fusion would be "too large" -- i.e., have more operands and // outputs than is allowed or occupy too much shared memory. +// If the fusion is a producer/consumer fusion and instr1 is the +// consumer and instr2 is the producer, set consumer_producer_fusion +// to true to enable more fusion. bool FusionWouldBeTooLarge(const HloInstruction& instr1, - const HloInstruction& instr2); + const HloInstruction& instr2, + bool is_consumer_producer_fusion = false); // Check if fusing producer and consumer will generate a nested loop, e.g. both // producer and consumer are `reduce-window` HLO instructions. bool CreatesNestedLoop(const HloInstruction& producer, const HloInstruction& consumer); +// Returns the instruction that determines the emitter used for lowering, +// sometimes referred to as "the real hero". +const HloInstruction* GetRealHeroForMultiOutputFusion( + const HloInstruction& instr); + // Whether instruction shapes are compatible for multi-output fusion, i.e. 
// whether the emitters support lowering the resulting fusion. // This function works for both, sibling and producer-consumer multi-output @@ -101,6 +105,10 @@ bool IsFusibleAsMultiOutputFusionRoot(const HloInstruction& instr); HloInstruction::FusionKind ChooseFusionKind(const HloInstruction& producer, const HloInstruction& consumer); +// Returns whether `consumer` is the only non-root user of `instr`. +bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr, + const HloInstruction& consumer); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc index 1f83ec71984..e73c4885e9e 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.cc @@ -90,6 +90,15 @@ void HloExecutionProfiler::FinishHloComputation( } } +void HloExecutionProfiler::FinishHloComputation( + absl::optional profile_index) { + if (do_profile_) { + profile_->SetCyclesTakenBy( + profile_index.value(), + GetCyclesTaken(&timers_, sub_streams_, stream_, clock_rate_ghz_)); + } +} + void HloExecutionProfiler::StartHloInstruction() { if (do_profile_) { InitAndStartTimer(&timers_, stream_); diff --git a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h index 1189143e3f9..860fa167790 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h +++ b/tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h @@ -53,6 +53,11 @@ class HloExecutionProfiler { // the time that the computation took to execute in the profile. void FinishHloComputation(const HloComputation* computation); + // If profiling is enabled stops the timer for a (sub)computation with the + // given profile index and records the time that the computation took to + // execute in the profile. + void FinishHloComputation(absl::optional profile_index); + // If profiling is enabled, starts a per-operation timer. void StartHloInstruction(); diff --git a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc index 332db83b6ad..26a22005dae 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_to_ir_bindings.cc @@ -83,6 +83,8 @@ void HloToIrBindings::EmitBasePointersForHlos( if (non_io_hlo->opcode() == HloOpcode::kConstant) { llvm::Value* global_for_constant = module_->getGlobalVariable( llvm_ir::ConstantHloToGlobalName(*non_io_hlo)); + CHECK(global_for_constant) + << llvm_ir::ConstantHloToGlobalName(*non_io_hlo); BindHloToIrValue(*non_io_hlo, global_for_constant); } else { llvm::Type* pointee_type = diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc new file mode 100644 index 00000000000..9287f9a92b7 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.cc @@ -0,0 +1,167 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h" + +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/core/platform/errors.h" + +namespace xla { +namespace gpu { + +namespace { + +// Gets the representative input shape of the multi-output fusion. +Shape GetInputShapeForMultiOutputFusion(const HloInstruction& instr) { + // Get the HLO that determines the emitter used for lowering. + const HloInstruction* real_hero = GetRealHeroForMultiOutputFusion(instr); + if (real_hero->operands().empty()) { + // Simply return an empty shape if the representative node has no input + // operands. + return Shape(); + } else { + return real_hero->operand(0)->shape(); + } +} + +class HorizontalInputFusionImpl { + public: + explicit HorizontalInputFusionImpl(HloComputation* computation) + : computation_(computation) {} + + ~HorizontalInputFusionImpl() {} + + StatusOr Run(); + + private: + HloComputation* computation_; +}; // HorizontalInputFusionImpl + +// Compares one-by-one the dimensions of `shape_a` and `shape_b` from left to +// right. +bool CompareShapeDimsFromLeftToRight(const Shape& shape_a, + const Shape& shape_b) { + if (shape_a.rank() != shape_b.rank()) { + return shape_a.rank() < shape_b.rank(); + } + auto dims_a = shape_a.dimensions(); + auto dims_b = shape_b.dimensions(); + for (size_t i = 0; i < dims_a.size(); ++i) { + if (dims_a[i] != dims_b[i]) { + return dims_a[i] < dims_b[i]; + } + } + return true; +} + +std::vector FindAndSortFusionCandidates( + HloInstruction* consumer) { + absl::flat_hash_set fusion_instr_set; + for (auto opnd : consumer->operands()) { + HloInstruction* predecessor = opnd->LatestNonGteAncestor(); + // Find out the input fusion instructions whose only consumer is `consumer`. + // This guarantees that fusing these candidates will never create cycles, as + // there is no back edge. + if (IsReduceInputFusion(*predecessor) && + IsConsumerTheOnlyNonRootUser(*predecessor, *consumer)) { + fusion_instr_set.insert(predecessor); + } + } + + std::vector fusion_instrs; + fusion_instrs.insert(fusion_instrs.end(), fusion_instr_set.begin(), + fusion_instr_set.end()); + + std::sort(fusion_instrs.begin(), fusion_instrs.end(), + [&](const HloInstruction* a, const HloInstruction* b) { + Shape shape_a = GetInputShapeForMultiOutputFusion(*a); + Shape shape_b = GetInputShapeForMultiOutputFusion(*b); + if (!ShapeUtil::EqualIgnoringElementType(shape_a, shape_b)) { + // Sort shapes according to dimensions, so that the same input + // shapes will be placed adjacent each other. + return CompareShapeDimsFromLeftToRight(shape_a, shape_b); + } + // Sort `fusion_instrs` according to instruction counts, because + // we'd like to fuse together computations of similar sizes. 
+ return a->fused_instruction_count() < + b->fused_instruction_count(); + }); + + return fusion_instrs; +} + +StatusOr HorizontalInputFusionImpl::Run() { + bool changed = false; + XLA_VLOG_LINES(3, computation_->ToString()); + + // Using def-to-use order is sound since we do not modify users. + std::vector def_to_use_order = + computation_->MakeInstructionPostOrder(); + for (auto consumer : def_to_use_order) { + auto candidates = FindAndSortFusionCandidates(consumer); + if (candidates.empty()) { + continue; + } + + size_t fusion_anchor_id = 0; + for (size_t j = 1; j < candidates.size(); ++j) { + HloInstruction* fusion_anchor = candidates[fusion_anchor_id]; + HloInstruction* fused = candidates[j]; + if (ShapesCompatibleForMultiOutputFusion(*fusion_anchor, *fused) && + !FusionWouldBeTooLarge(*fusion_anchor, *fused)) { + VLOG(3) << "Fuse " << fused->ToString() << " into " + << fusion_anchor->ToString(); + fusion_anchor->MergeFusionInstructionIntoMultiOutput(fused); + changed = true; + } else { + // Update the `fusion_anchor_id` since `fused` is either not + // compatible or not beneficial to be fused with current fusion anchor. + VLOG(3) << j - fusion_anchor_id - 1 << " instructions are fused."; + fusion_anchor_id = j; + } + } + } + + return changed; +} + +} // namespace + +StatusOr GpuHorizontalInputFusion::RunOnComputation( + HloComputation* computation) { + HorizontalInputFusionImpl horizontal_fusion_impl(computation); + return horizontal_fusion_impl.Run(); +} + +StatusOr GpuHorizontalInputFusion::Run(HloModule* module) { + bool changed = false; + VLOG(2) << "Run horizontal input fusion."; + for (auto* comp : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(changed, RunOnComputation(comp)); + } + + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h new file mode 100644 index 00000000000..85313d03412 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h @@ -0,0 +1,57 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_ + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/platform/macros.h" + +namespace xla { +namespace gpu { + +// This optimization pass horizontally fuses kInput fusions to both reduce the +// kernel launch overhead and increase parallelism degree. See +// GpuHorizontalFusion for general description and motivation about horizontal +// fusion. 
GpuHorizontalFusion deals with kLoop fusions while this pass deals +// with kInput fusions. +// +// Following GpuHorizontalFusion, a simple yet effective heuristic is used +// to search the fusion candidates while avoiding creating cycles. That is, +// we simply search for fusion candidates by looking for instructions whose +// outputs are all consumed by the same instruction. This catches the typical +// target cases; often, the candidate instructions are just consumed by the +// ROOT tuple of the entry computation. +class GpuHorizontalInputFusion : public HloModulePass { + public: + GpuHorizontalInputFusion() {} + + absl::string_view name() const override { + return "gpu_horizontal_input_fusion"; + } + + StatusOr Run(HloModule* module) override; + + private: + StatusOr RunOnComputation(HloComputation*); +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_INPUT_FUSION_H_ diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc new file mode 100644 index 00000000000..8ecfbb5a8d2 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/horizontal_input_fusion_test.cc @@ -0,0 +1,216 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/horizontal_input_fusion.h" + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" + +namespace xla { +namespace gpu { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class HorizontalInputFusionTest : public GpuCodegenTest {}; + +TEST_F(HorizontalInputFusionTest, BasicTest) { + auto module = ParseAndReturnVerifiedModule(R"( + HloModule BasicTest + + %add_f16 { + %x = f16[] parameter(0) + %y = f16[] parameter(1) + ROOT %add = f16[] add(%x, %y) + } + + fused_computation.1 { + arg.1 = f16[1024]{0} parameter(0) + constant0 = f16[] constant(0) + ROOT reduce1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16 + } + + fused_computation.2 { + arg.1 = f16[1024]{0} parameter(0) + constant0 = f16[] constant(0) + ROOT reduce1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16 + } + + ENTRY entry_computation { + arg.1 = f16[1024]{0} parameter(0) + arg.2 = f16[1024]{0} parameter(1) + fusion.1 = f16[] fusion(arg.1), kind=kInput, calls=fused_computation.1 + fusion.2 = f16[] fusion(arg.2), kind=kInput, calls=fused_computation.2 + ROOT tuple.1 = (f16[], f16[]) tuple(fusion.1, fusion.2) + } +)") + .ValueOrDie(); + + EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie()); + + const HloInstruction* entry_root = + module->entry_computation()->root_instruction(); + EXPECT_THAT(entry_root, op::Tuple((op::GetTupleElement(op::Fusion())), + (op::GetTupleElement(op::Fusion())))); + + const HloInstruction* fusion = entry_root->operand(0)->operand(0); + ASSERT_TRUE(fusion->IsMultiOutputFusion()); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Reduce(), op::Reduce())); +} + +TEST_F(HorizontalInputFusionTest, ManyInputFusions) { + auto module = CreateNewVerifiedModule(); + + HloComputation* reduce_computation; + { + auto embedded_builder = HloComputation::Builder("add"); + auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "lhs")); + auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "rhs")); + embedded_builder.AddInstruction( + HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs)); + reduce_computation = + module->AddEmbeddedComputation(embedded_builder.Build()); + } + + HloComputation::Builder builder(TestName()); + std::vector var_outs; + auto input_shape = ShapeUtil::MakeShape(F32, {1024, 1024}); + auto output_shape = ShapeUtil::MakeShape(F32, {1024}); + for (int64 i = 0; i < 130; ++i) { + // %fused_computation.3 (param_0: f32[1024,1024], param_1: f32[]) -> + // f32[1024] { + // %param_0 = f32[1024,1024]{1,0} parameter(0) + // %param_1 = f32[] parameter(1) + // %broadcast = f32[1024,1024]{1,0} broadcast(f32[] %param_1), + // dimensions={} + // %multiply = f32[1024,1024]{1,0} + // multiply(f32[1024,1024]{1,0} %param_0, f32[1024,1024]{1,0} + // %broadcast) + // %constant0 = f32[] constant(0) + // ROOT %reduce = f32[1024]{0} + // reduce(f32[1024,1024]{1,0} %multiply, f32[] %constant0), 
+ // dimensions={1}, to_apply=%add + // } + HloInstruction* param_var_in = builder.AddInstruction( + HloInstruction::CreateParameter(i * 2 + 0, input_shape, "var.in")); + HloInstruction* param_alpha = + builder.AddInstruction(HloInstruction::CreateParameter( + i * 2 + 1, ShapeUtil::MakeShape(F32, {}), "alpha")); + auto alpha_broadcasted = builder.AddInstruction( + HloInstruction::CreateBroadcast(input_shape, param_alpha, {})); + auto mul = builder.AddInstruction(HloInstruction::CreateBinary( + input_shape, HloOpcode::kMultiply, param_var_in, alpha_broadcasted)); + HloInstruction* const0 = builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + auto reduce = builder.AddInstruction(HloInstruction::CreateReduce( + output_shape, mul, const0, {1}, reduce_computation)); + var_outs.push_back(reduce); + } + builder.AddInstruction(HloInstruction::CreateTuple(var_outs)); + module->AddEntryComputation(builder.Build()); + + // Verify that horizontal fusion is kicked in. Check that there are multiple + // `reduce` instructions fused into the same fusion. 6 is just a randomly + // picked number as we don't exactly know how large the fusion will be + // created due to the `FusionWouldBeTooLarge` constraint. + CompileAndVerifyIr(module->Clone(), R"(CHECK: reduce-group-6)", + /*match_optimized_ir=*/false); + + // Testing with the entire gpu optimization pipeline. + EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(HorizontalInputFusionTest, MultiOutputFusionTest) { + // This tests the below pattern. One known issue is that gtes (to fusions) can + // be removed after their producer fusions are merged. In the below case, gte2 + // and gte6 will be gone if Fusion2 is fused into Fusion1. + // + // Fusion1 Fusion2 + // | | | | + // | gte1 gte2 | + // | | | | + // | Fusion3 | + // | | | | + // gte3 gte4 gte5 gte6 + // \ | | / + // =====ROOT===== + // + auto module = ParseAndReturnVerifiedModule(R"( + HloModule MultiOutputFusionTest + + %add_f16 { + %x = f16[] parameter(0) + %y = f16[] parameter(1) + ROOT %add = f16[] add(%x, %y) + } + + fused_computation.1 { + arg.1 = f16[1024]{0} parameter(0) + constant0 = f16[] constant(0) + reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16 + add.0 = f16[1024] add(arg.1, arg.1) + ROOT tuple.1 = (f16[], f16[1024]) tuple(reduce.1, add.0) + } + + fused_computation.2 { + arg.1 = f16[1024]{0} parameter(0) + constant0 = f16[] constant(0) + reduce.1 = f16[] reduce(arg.1, constant0), dimensions={0}, to_apply=%add_f16 + add.0 = f16[1024] add(arg.1, arg.1) + ROOT tuple.1 = (f16[], f16[1024]) tuple(reduce.1, add.0) + } + + fused_computation.3 { + arg.0 = f16[1024]{0} parameter(0) + arg.1 = f16[1024]{0} parameter(1) + add.0 = f16[1024] add(arg.0, arg.1) + mul.0 = f16[1024] multiply(arg.0, arg.1) + ROOT tuple.1 = (f16[1024], f16[1024]) tuple(add.0, mul.0) + } + + ENTRY entry_computation { + arg.1 = f16[1024]{0} parameter(0) + arg.2 = f16[1024]{0} parameter(1) + fusion.1 = (f16[],f16[1024]) fusion(arg.1), kind=kInput, calls=fused_computation.1 + fusion.2 = (f16[],f16[1024]) fusion(arg.2), kind=kInput, calls=fused_computation.2 + gte.3 = f16[] get-tuple-element(fusion.1), index=0 + gte.1 = f16[1024]{0} get-tuple-element(fusion.1), index=1 + gte.2 = f16[1024]{0} get-tuple-element(fusion.2), index=1 + gte.6 = f16[] get-tuple-element(fusion.2), index=0 + fusion.3 = (f16[1024],f16[1024]) fusion(gte.1, gte.2), + kind=kLoop, calls=fused_computation.3 + gte.4 = f16[1024] get-tuple-element(fusion.3), index=0 + 
gte.5 = f16[1024]{0} get-tuple-element(fusion.3), index=1 + ROOT tuple.1 = (f16[], f16[1024]{0}, f16[], f16[1024]{0}) + tuple(gte.3, gte.4, gte.5, gte.6) + } +)") + .ValueOrDie(); + + EXPECT_TRUE(GpuHorizontalInputFusion().Run(module.get()).ValueOrDie()); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.cc similarity index 93% rename from tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc rename to tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.cc index 6d663c66b50..9d1e0533a91 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.cc @@ -13,13 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/horizontal_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h" #include #include "absl/container/flat_hash_set.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" #include "tensorflow/compiler/xla/service/hlo_creation_utils.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/util/env_var.h" @@ -66,12 +67,12 @@ PrimitiveType GetUniqueOutputTypeOfFusion(const HloInstruction& instr) { return first_output_type; } -class HorizontalFusionImpl { +class HorizontalLoopFusionImpl { public: - explicit HorizontalFusionImpl(HloComputation* computation) + explicit HorizontalLoopFusionImpl(HloComputation* computation) : computation_(computation) {} - ~HorizontalFusionImpl() {} + ~HorizontalLoopFusionImpl() {} StatusOr Run(); @@ -113,7 +114,7 @@ class HorizontalFusionImpl { }; HloComputation* computation_; -}; // HorizontalFusionImpl +}; // HorizontalLoopFusionImpl bool IsFusionSupported(const HloInstruction& instr) { // Support only kLoop fusion now. @@ -137,25 +138,6 @@ bool IsFusionSupported(const HloInstruction& instr) { return true; } -bool IsConsumerTheOnlyNonRootUser(const HloInstruction& instr, - const HloInstruction& consumer) { - return absl::c_all_of(instr.users(), [&](const HloInstruction* user) { - if (user->opcode() == HloOpcode::kGetTupleElement) { - // Skip GTE. - return IsConsumerTheOnlyNonRootUser(*user, consumer); - } else if (user == &consumer) { - // `user` is `consumer`. - return true; - } else if (user == user->parent()->root_instruction()) { - // Consumed by ROOT is always fine, since it is impossible to create - // cycles through ROOT. - return true; - } else { - return false; - } - }); -} - // Returns whether `instr` is a profitable candidate to be horizontally fused. // Since the primary benefit of horizontal fusion comes from reducing the // kernel launch overhead, we want to exclude the instructions with @@ -221,7 +203,7 @@ bool HasOnlyRowMajorLayout(const HloInstruction& fusion_instr) { return true; } -void HorizontalFusionImpl::FusionCandidates::Initialize( +void HorizontalLoopFusionImpl::FusionCandidates::Initialize( HloInstruction* consumer) { // First, find out all fusion instructions. We will filter out // unsupported/non-profitable cases below. @@ -275,7 +257,7 @@ void HorizontalFusionImpl::FusionCandidates::Initialize( // Gets a next span of fusion instructions to be fused. 
absl::Span -HorizontalFusionImpl::FusionCandidates::GetNextSpanOfFusions() { +HorizontalLoopFusionImpl::FusionCandidates::GetNextSpanOfFusions() { if (pos_ >= fusion_instrs_.size()) { return absl::Span(); } @@ -333,7 +315,7 @@ HorizontalFusionImpl::FusionCandidates::GetNextSpanOfFusions() { return absl::MakeSpan(fusion_instrs_).subspan(left, right - left); } -Status HorizontalFusionImpl::CreateFusedComputation( +Status HorizontalLoopFusionImpl::CreateFusedComputation( absl::Span fused_fusion_instrs, std::unique_ptr* uniq_computation, std::vector* bound_operands) { @@ -441,7 +423,7 @@ Status HorizontalFusionImpl::CreateFusedComputation( return Status::OK(); } -Status HorizontalFusionImpl::Fuse( +Status HorizontalLoopFusionImpl::Fuse( absl::Span fused_fusion_instrs) { // Fuse fused_fusion_instrs and replace them with the new fused computation. std::unique_ptr uniq_computation; @@ -483,7 +465,7 @@ Status HorizontalFusionImpl::Fuse( return Status::OK(); } -StatusOr HorizontalFusionImpl::Run() { +StatusOr HorizontalLoopFusionImpl::Run() { bool changed = false; XLA_VLOG_LINES(3, computation_->ToString()); @@ -492,7 +474,7 @@ StatusOr HorizontalFusionImpl::Run() { computation_->MakeInstructionPostOrder(); for (size_t i = 0; i < def_to_use_order.size(); ++i) { auto consumer = def_to_use_order[i]; - HorizontalFusionImpl::FusionCandidates fusion_candidates(consumer); + HorizontalLoopFusionImpl::FusionCandidates fusion_candidates(consumer); while (true) { auto fusions = fusion_candidates.GetNextSpanOfFusions(); if (fusions.empty()) { @@ -512,13 +494,13 @@ StatusOr HorizontalFusionImpl::Run() { } // namespace -StatusOr GpuHorizontalFusion::RunOnComputation( +StatusOr GpuHorizontalLoopFusion::RunOnComputation( HloComputation* computation) { - HorizontalFusionImpl horizontal_fusion_impl(computation); + HorizontalLoopFusionImpl horizontal_fusion_impl(computation); return horizontal_fusion_impl.Run(); } -StatusOr GpuHorizontalFusion::Run(HloModule* module) { +StatusOr GpuHorizontalLoopFusion::Run(HloModule* module) { bool changed = false; VLOG(2) << "Run horizontal fusion."; for (auto* comp : module->MakeNonfusionComputations()) { diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.h b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h similarity index 91% rename from tensorflow/compiler/xla/service/gpu/horizontal_fusion.h rename to tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h index 9a804949b1c..3824c5df352 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_fusion.h +++ b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_FUSION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_FUSION_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_LOOP_FUSION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_LOOP_FUSION_H_ #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -94,11 +94,13 @@ namespace gpu { // output dims of the concatenate will be used as the kernel launch dims. // Instruction bitcasts can be used for Reshape2 and Reshape3 as long as the // outputs of Mul and Add are row-major. 
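// [Editorial sketch, not part of the diff] Assumed pipeline wiring, mirroring
// how the tests in this change exercise the passes: both horizontal fusion
// passes can be appended after the regular producer/consumer fusion passes.
// Includes and the surrounding function returning Status are elided.
HloPassPipeline horizontal("horizontal-fusion");
horizontal.AddPass<GpuHorizontalLoopFusion>();
horizontal.AddPass<GpuHorizontalInputFusion>();
horizontal.AddPass<HloDCE>();  // clean up fusions left dead by the merge
TF_ASSIGN_OR_RETURN(bool changed, horizontal.Run(module));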
-class GpuHorizontalFusion : public HloModulePass { +class GpuHorizontalLoopFusion : public HloModulePass { public: - GpuHorizontalFusion() {} + GpuHorizontalLoopFusion() {} - absl::string_view name() const override { return "gpu_horizontal_fusion"; } + absl::string_view name() const override { + return "gpu_horizontal_loop_fusion"; + } StatusOr Run(HloModule* module) override; @@ -109,4 +111,4 @@ class GpuHorizontalFusion : public HloModulePass { } // namespace gpu } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_FUSION_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_HORIZONTAL_LOOP_FUSION_H_ diff --git a/tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion_test.cc similarity index 93% rename from tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc rename to tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion_test.cc index bad589964ff..8091330cd47 100644 --- a/tensorflow/compiler/xla/service/gpu/horizontal_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion_test.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/gpu/horizontal_fusion.h" +#include "tensorflow/compiler/xla/service/gpu/horizontal_loop_fusion.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/fusion_merger.h" @@ -37,9 +37,9 @@ namespace { namespace op = xla::testing::opcode_matchers; -class HorizontalFusionTest : public HloTestBase {}; +class HorizontalLoopFusionTest : public HloTestBase {}; -TEST_F(HorizontalFusionTest, BasicTest) { +TEST_F(HorizontalLoopFusionTest, BasicTest) { auto module = ParseAndReturnVerifiedModule(R"( HloModule BasicTest @@ -70,7 +70,7 @@ TEST_F(HorizontalFusionTest, BasicTest) { )") .ValueOrDie(); - EXPECT_TRUE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(GpuHorizontalLoopFusion().Run(module.get()).ValueOrDie()); EXPECT_TRUE(HloDCE().Run(module.get()).ValueOrDie()); const HloInstruction* entry_root = @@ -88,7 +88,7 @@ TEST_F(HorizontalFusionTest, BasicTest) { } // Horizontal fusion should not be triggered as fusion will create cycles. 
-TEST_F(HorizontalFusionTest, NegativeTestForCycle) { +TEST_F(HorizontalLoopFusionTest, NegativeTestForCycle) { auto module = ParseAndReturnVerifiedModule(R"( HloModule NegativeTestForCycle @@ -122,10 +122,10 @@ TEST_F(HorizontalFusionTest, NegativeTestForCycle) { )") .ValueOrDie(); - EXPECT_FALSE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); + EXPECT_FALSE(GpuHorizontalLoopFusion().Run(module.get()).ValueOrDie()); } -TEST_F(HorizontalFusionTest, NegativeTestForIncompatibleTypes) { +TEST_F(HorizontalLoopFusionTest, NegativeTestForIncompatibleTypes) { auto module = ParseAndReturnVerifiedModule(R"( HloModule NegativeTestForIncompatibleTypes @@ -158,10 +158,10 @@ TEST_F(HorizontalFusionTest, NegativeTestForIncompatibleTypes) { )") .ValueOrDie(); - EXPECT_FALSE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); + EXPECT_FALSE(GpuHorizontalLoopFusion().Run(module.get()).ValueOrDie()); } -TEST_F(HorizontalFusionTest, HorizontalFusionAfterVerticalFusion) { +TEST_F(HorizontalLoopFusionTest, HorizontalLoopFusionAfterVerticalFusion) { auto module = ParseAndReturnVerifiedModule(R"( HloModule MergeSharedFusionInstruction @@ -190,7 +190,7 @@ TEST_F(HorizontalFusionTest, HorizontalFusionAfterVerticalFusion) { fusion.AddPass(/*may_duplicate=*/false); fusion.AddPass(/*may_duplicate=*/true); EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); - EXPECT_TRUE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(GpuHorizontalLoopFusion().Run(module.get()).ValueOrDie()); VLOG(2) << "Dump after horizontal fusion:"; VLOG(2) << module->ToString(); @@ -198,7 +198,7 @@ TEST_F(HorizontalFusionTest, HorizontalFusionAfterVerticalFusion) { EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), ErrorSpec{0, 0})); } -TEST_F(HorizontalFusionTest, GradientDescentOptimizerLike) { +TEST_F(HorizontalLoopFusionTest, GradientDescentOptimizerLike) { HloComputation::Builder builder(TestName()); std::vector var_outs; @@ -229,7 +229,7 @@ TEST_F(HorizontalFusionTest, GradientDescentOptimizerLike) { EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{0, 0})); } -TEST_F(HorizontalFusionTest, FusingDifferentOutputs) { +TEST_F(HorizontalLoopFusionTest, FusingDifferentOutputs) { auto module = ParseAndReturnVerifiedModule(R"( HloModule HeterogeneousMultiOutputFusions @@ -280,7 +280,7 @@ TEST_F(HorizontalFusionTest, FusingDifferentOutputs) { )") .ValueOrDie(); - EXPECT_TRUE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); + EXPECT_TRUE(GpuHorizontalLoopFusion().Run(module.get()).ValueOrDie()); EXPECT_TRUE(HloDCE().Run(module.get()).ValueOrDie()); VLOG(2) << "Dump after horizontal fusion:"; @@ -289,7 +289,7 @@ TEST_F(HorizontalFusionTest, FusingDifferentOutputs) { EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), ErrorSpec{0, 0})); } -TEST_F(HorizontalFusionTest, RMSPropLike) { +TEST_F(HorizontalLoopFusionTest, RMSPropLike) { HloComputation::Builder builder(TestName()); std::vector all_outputs; @@ -364,7 +364,7 @@ TEST_F(HorizontalFusionTest, RMSPropLike) { EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1.0e-5, 1.0e-5})); } -TEST_F(HorizontalFusionTest, NegativeTestForDynamicUpdateSlice) { +TEST_F(HorizontalLoopFusionTest, NegativeTestForDynamicUpdateSlice) { auto module = ParseAndReturnVerifiedModule(R"( HloModule NegativeTestForDynamicUpdateSlice @@ -400,7 +400,7 @@ TEST_F(HorizontalFusionTest, NegativeTestForDynamicUpdateSlice) { })") .ValueOrDie(); - EXPECT_FALSE(GpuHorizontalFusion().Run(module.get()).ValueOrDie()); + 
EXPECT_FALSE(GpuHorizontalLoopFusion().Run(module.get()).ValueOrDie()); } } // namespace diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc index 5fe459a70bc..dc3a0c788ac 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.cc @@ -26,14 +26,13 @@ InfeedThunk::InfeedThunk( ThunkInfo thunk_info, const ShapeTree& infeed_slices) : Thunk(Kind::kInfeed, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), infeed_slices_(infeed_slices) {} Status InfeedThunk::ExecuteOnStream(const ExecuteParams& params) { auto& stream = *params.stream; auto& buffer_allocations = *params.buffer_allocations; - VLOG(2) << "Infeeding to GPU: " << hlo_instruction_->ToString(); + VLOG(2) << "Infeeding to GPU"; auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h index ab410661ba1..ec33235c466 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_thunk.h @@ -43,7 +43,6 @@ class InfeedThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; const ShapeTree infeed_slices_; }; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index b994ead17ca..b90e4d85f80 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -60,18 +60,22 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer, // Output fusions are not currently supported on GPUs. if (producer->opcode() == HloOpcode::kFusion) { + VLOG(4) << "Producer " << producer->name() << " is a fusion op"; return false; } // Cost condition: not fuse (simple, expensive producers) and (consumers who // reuse operand elements). - if (producer->opcode() != HloOpcode::kFusion && - consumer->ReusesOperandElements(operand_index) && - is_expensive(*producer)) { + if (producer->opcode() != HloOpcode::kFusion && is_expensive(*producer) && + ReusesOperandElements(consumer, operand_index)) { + VLOG(4) << "Do not fuse simple, expensive producer " << producer->name() + << " and consumer which reuses operand elements."; return false; } if (!IsProducerConsumerFusible(*producer, *consumer) || !InstructionFusion::ShouldFuse(consumer, operand_index)) { + VLOG(4) << "Producer " << producer->name() + << " is not fusible or should not be fused."; return false; } return true; @@ -87,7 +91,8 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, auto producer = consumer->operand(operand_index); // The following checks are potentially expensive. 
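The cost condition above avoids fusing a simple but expensive producer into a consumer that reuses operand elements, because fusion re-evaluates the producer expression at every consumer read. A standalone back-of-the-envelope sketch (toy element count and reuse factor, not taken from the patch):

#include <cstdio>

int main() {
  const long long n = 1 << 20;   // elements produced by the expensive op
  const int reuse_factor = 8;    // times the consumer reads each element
  // Unfused: the producer result is materialized once, then read 8 times.
  const long long unfused_evals = n;
  // Fused: the producer expression is re-evaluated at every consumer read.
  const long long fused_evals = n * reuse_factor;
  std::printf("producer evaluations: unfused=%lld fused=%lld\n",
              unfused_evals, fused_evals);
  return 0;
}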
- if (FusionWouldBeTooLarge(*consumer, *producer)) { + if (FusionWouldBeTooLarge(*consumer, *producer, + /*is_consumer_producer_fusion=*/true)) { VLOG(5) << "Fusion of (" << producer->ToString() << ") into (" << consumer->ToString() << ") would be too large"; return false; @@ -107,8 +112,12 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, fusion_node_evaluations_.emplace(consumer, FusionNodeIndexingEvaluation(consumer)); } - return !fusion_node_evaluations_.at(consumer).AverageCodeDuplicationTooHigh( - producer); + if (fusion_node_evaluations_.at(consumer).CodeDuplicationTooHigh(producer)) { + VLOG(5) << "Fusion of " << producer->name() << " into " << consumer->name() + << " would result in overly large code duplication."; + return false; + } + return true; } bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer, diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc index 154612824ef..4f4409ab896 100644 --- a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc @@ -35,8 +35,9 @@ XLAThunksDialect::XLAThunksDialect(MLIRContext *context) >(); } +} // namespace xla_thunks + #define GET_OP_CLASSES #include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.cc.inc" -} // namespace xla_thunks } // namespace mlir diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h index ede9adb9ab1..bc0da6a8fc8 100644 --- a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h @@ -33,10 +33,11 @@ class XLAThunksDialect : public Dialect { static StringRef getDialectNamespace() { return "xla_thunks"; } }; +} // namespace xla_thunks + #define GET_OP_CLASSES #include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h.inc" -} // namespace xla_thunks } // namespace mlir #endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_XLA_THUNKS_OPS_H_ diff --git a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td index 38602550864..eb203e6917d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td +++ b/tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.td @@ -21,12 +21,6 @@ limitations under the License. 
include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/IR/OpBase.td" -class LLVMPointerTo - : ContainerType().isPointerTy()">, - "$_self.cast<::mlir::LLVM::LLVMType>().getPointerElementTy()", - "LLVM pointer">; - def XLAThunks_Dialect : Dialect { let name = "xla_thunks"; let cppNamespace = "xla_thunks"; @@ -45,12 +39,12 @@ def AllocationSlice : StructAttr<"AllocationSlice", XLAThunks_Dialect, [ def MemzeroThunkOp : ThunkOp<"execute_memzero_thunk"> { let arguments = (ins - LLVMPointerTo>:$execute_params, + LLVM_PointerTo:$execute_params, AllocationSlice:$allocation_slice ); let results = (outs I<1>:$ok, - LLVMPointerTo>:$error_message + LLVM_PointerTo:$error_message ); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 9d4ec358bd3..7743d19497d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -472,39 +472,6 @@ StatusOr GetCudnnConvKind( return InternalError("Unexpected call target: %s", target); } -StatusOr GetDnnConvolutionKind( - const HloCustomCallInstruction* instr) { - absl::string_view target = instr->custom_call_target(); - if (target == kCudnnConvForwardCallTarget) { - return se::dnn::ConvolutionKind::FORWARD; - } - if (target == kCudnnConvBackwardInputCallTarget) { - return se::dnn::ConvolutionKind::BACKWARD_DATA; - } - if (target == kCudnnConvBackwardFilterCallTarget) { - return se::dnn::ConvolutionKind::BACKWARD_FILTER; - } - return InternalError("Unexpected call target: %s", target); -} - -StatusOr GetDnnDataType( - const HloCustomCallInstruction* conv) { - PrimitiveType output_primitive_type = - conv->shape().tuple_shapes(0).element_type(); - switch (output_primitive_type) { - case F16: - return se::dnn::ToDataType::value; - case F32: - return se::dnn::ToDataType::value; - case F64: - return se::dnn::ToDataType::value; - default: - break; - } - return InternalError("Unsupported convolution datatype : %s", - conv->ToString()); -} - string CudnnConvKindToString(CudnnConvKind kind) { switch (kind) { case CudnnConvKind::kForward: diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h index 6f731b2936f..a782eb3f507 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h @@ -55,12 +55,6 @@ enum class CudnnConvKind { StatusOr GetCudnnConvKind(const HloCustomCallInstruction* instr); -StatusOr GetDnnConvolutionKind( - const HloCustomCallInstruction* instr); - -StatusOr GetDnnDataType( - const HloCustomCallInstruction* conv); - // Converts a CudnnConvKind value to a string. string CudnnConvKindToString(CudnnConvKind kind); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 31203b9c5f0..2215881271c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -30,12 +30,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" #include "tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h" #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h" #include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" @@ -98,6 +100,64 @@ Status IrEmitter::DefaultAction(HloInstruction* hlo) { .MakeElementGenerator(hlo, operand_to_generator)); } +Status IrEmitter::EmitConstants(const HloComputation& computation, + bool lookup_indices) { + for (HloInstruction* instr : computation.instructions()) { + if (instr->opcode() != HloOpcode::kConstant) { + continue; + } + Literal& literal = *Cast(instr)->mutable_literal(); + const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal); + llvm::ArrayType* global_type = + llvm::ArrayType::get(b_.getInt8Ty(), literal.size_bytes()); + llvm::Constant* initializer = + should_emit_initializer + ? llvm_ir::ConvertLiteralToIrConstant(literal, module_) + : llvm::ConstantAggregateZero::get(global_type); + if (should_emit_initializer) { + VLOG(3) << "Emitted initializer for constant with shape " + << ShapeUtil::HumanString(literal.shape()); + } + + // These globals will be looked up by name by GpuExecutable so we need to + // give them an external linkage. Not all of their uses are visible in + // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that + // merely preserves their names (like available_externally), we also need + // to ensure that they stick around even if they're "unused". + // + // We may have to be more more clever here in the future if we notice that + // we're keeping around too many globals because of their linkage. 
+ unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace( + *ir_emitter_context_->llvm_module()); + + std::string global_name = llvm_ir::ConstantHloToGlobalName(*instr); + + llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( + global_type, /*isConstant=*/should_emit_initializer, + llvm::GlobalValue::ExternalLinkage, + /*Initializer=*/initializer, global_name, + /*TLMode=*/llvm::GlobalValue::NotThreadLocal, + /*AddressSpace=*/global_address_space, + /*isExternallyInitialized=*/false); + global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes)); + ir_emitter_context_->llvm_module()->getGlobalList().push_back( + global_for_const); + + GpuExecutable::ConstantInfo info; + info.symbol_name = global_name; + info.content = literal.Clone(); + if (lookup_indices) { + auto maybe_slice = + ir_emitter_context_->buffer_assignment().GetUniqueSlice(instr, {}); + if (maybe_slice.ok()) { + info.allocation_index = maybe_slice.ValueOrDie().index(); + } + } + ir_emitter_context_->constants().push_back(std::move(info)); + } + return Status::OK(); +} + Status IrEmitter::HandleConstant(HloInstruction* constant) { return Status::OK(); } @@ -175,10 +235,12 @@ Status IrEmitter::EmitCallToNestedComputation( llvm::Function*& emitted_function = computation_to_ir_function_[&nested_computation]; if (emitted_function == nullptr) { - IrEmitterNested ir_emitter_nested(hlo_module_config_, nested_computation, - ir_emitter_context_); - TF_RETURN_IF_ERROR(ir_emitter_nested.CodegenNestedComputation()); - emitted_function = ir_emitter_nested.GetEmittedFunction(); + TF_ASSIGN_OR_RETURN( + auto ir_emitter_nested, + IrEmitterNested::Create(hlo_module_config_, nested_computation, + ir_emitter_context_)); + TF_RETURN_IF_ERROR(ir_emitter_nested->CodegenNestedComputation()); + emitted_function = ir_emitter_nested->GetEmittedFunction(); } // Operands are in default address space for non-AMDGPU target. diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index 50e9f06ef08..1a387528220 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -105,6 +105,12 @@ class IrEmitter : public DfsHloVisitorWithDefault, llvm::IRBuilder<>* builder() { return &b_; } + // Emits constants to generated LLVM IR, and also populate related + // inforamtion to ir_emitter_context for large-constant initializations. If + // `lookup_indices` is true, the allocation index associated with the constant + // is also populated. + Status EmitConstants(const HloComputation& computation, bool lookup_indices); + protected: // Constructs an IrEmitter with the given IrEmitter context. // ir_emitter_context is owned by the caller and should outlive the IrEmitter diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h index 7d5a8d032e6..34b93ca5b3f 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_context.h @@ -17,14 +17,19 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_IR_EMITTER_CONTEXT_H_ #include "llvm/IR/Module.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" namespace xla { namespace gpu { + // IrEmitterContext encapsulates common (mutable and immutable) data structures // used by both IrEmitterNested and IrEmitterUnnested, such as the buffer // assignment and the name uniquer. @@ -44,7 +49,11 @@ class IrEmitterContext { cuda_compute_capability_(cuda_compute_capability), profile_index_map_(profile_index_map), mlir_context_(mlir_context), - llvm_module_(llvm_module) {} + llvm_module_(llvm_module) { + mlir_context_ + ->loadDialect(); + } // Disallow copy and assign. IrEmitterContext(const IrEmitterContext&) = delete; IrEmitterContext& operator=(const IrEmitterContext&) = delete; @@ -64,6 +73,8 @@ class IrEmitterContext { llvm::Module* llvm_module() { return llvm_module_; } NameUniquer* name_uniquer() { return &name_uniquer_; } + std::vector& constants() { return constants_; } + private: const HloModule* hlo_module_; const BufferAssignment* buffer_assignment_; @@ -74,6 +85,7 @@ class IrEmitterContext { mlir::MLIRContext* mlir_context_; llvm::Module* llvm_module_; NameUniquer name_uniquer_; + std::vector constants_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc index e96c5f05e60..5fc091ed8e7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc @@ -41,6 +41,16 @@ IrEmitterNested::IrEmitterNested(const HloModuleConfig& hlo_module_config, : IrEmitter(hlo_module_config, ir_emitter_context, /*is_nested=*/true), nested_computation_(nested_computation) {} +StatusOr> IrEmitterNested::Create( + const HloModuleConfig& hlo_module_config, + const HloComputation& nested_computation, + IrEmitterContext* ir_emitter_context) { + std::unique_ptr emitter(new IrEmitterNested( + hlo_module_config, nested_computation, ir_emitter_context)); + TF_RETURN_IF_ERROR(emitter->EmitConstants(nested_computation, false)); + return emitter; +} + // Nested function serves the same purpose on GPU as a thread-local function on // a CPU. Status IrEmitterNested::CodegenNestedComputation() { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h index ce825851bcc..8ed76cabcda 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.h @@ -39,12 +39,11 @@ namespace gpu { // class IrEmitterNested : public IrEmitter { public: - // Constructs an LLVM IR emitter for a nested HLO computation. `function` is - // the containing IR function this emitter produces IR to. See - // IrEmitter::IrEmitter for the meanings of other arguments. 
- IrEmitterNested(const HloModuleConfig& hlo_module_config, - const HloComputation& nested_computation, - IrEmitterContext* ir_emitter_context); + static StatusOr> Create( + const HloModuleConfig& hlo_module_config, + const HloComputation& nested_computation, + IrEmitterContext* ir_emitter_context); + IrEmitterNested(const IrEmitterNested&) = delete; IrEmitterNested& operator=(const IrEmitterNested&) = delete; @@ -62,6 +61,13 @@ class IrEmitterNested : public IrEmitter { Status CodegenNestedComputation(); private: + // Constructs an LLVM IR emitter for a nested HLO computation. `function` is + // the containing IR function this emitter produces IR to. See + // IrEmitter::IrEmitter for the meanings of other arguments. + IrEmitterNested(const HloModuleConfig& hlo_module_config, + const HloComputation& nested_computation, + IrEmitterContext* ir_emitter_context); + const HloComputation& nested_computation_; llvm::Function* emitted_function_; }; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index f88c70b1a33..b94a7458df2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -41,6 +41,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" +#include "tensorflow/compiler/mlir/utils/name_utils.h" #include "tensorflow/compiler/mlir/xla/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h" #include "tensorflow/compiler/mlir/xla/type_to_shape.h" @@ -90,6 +91,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/union_find.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -141,7 +143,7 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk, llvm::LLVMContext& llvm_context = llvm_module->getContext(); llvm::ConstantInt* threads_per_block_ir_value = llvm::ConstantInt::get( llvm::IntegerType::get(llvm_context, /*NumBits=*/32), - launch_dims.threads_per_block()); + launch_dims.thread_counts_per_block().x); // Our launch bounds are exact, so we can specify them as reqntidx rather than // maxntidx. 
nvvm_annotations_node->addOperand(llvm::MDNode::get( @@ -151,24 +153,22 @@ void UpdateLaunchDimensions(const LaunchDimensions& launch_dims, Thunk* thunk, llvm::ConstantAsMetadata::get(threads_per_block_ir_value)})); } -const BufferAllocation* GetAllocation( - mlir::BlockArgument func_arg, const BufferAssignment& buffer_assignment) { +int64_t GetAllocationIndex(mlir::BlockArgument func_arg) { auto func_op = mlir::cast(func_arg.getParentRegion()->getParentOp()); - int64 allocation_index = func_op - .getArgAttrOfType( - func_arg.getArgNumber(), "lmhlo.alloc") - .getValue() - .getSExtValue(); - return &buffer_assignment.GetAllocation(allocation_index); + return func_op + .getArgAttrOfType(func_arg.getArgNumber(), + "lmhlo.alloc") + .getValue() + .getSExtValue(); } StatusOr GetAllocationSliceForMlir( - mlir::Value v, const BufferAssignment& buffer_assignment) { + mlir::Value v, absl::Span allocations) { int64 size = v.getType().cast().getSizeInBits() / 8; if (auto arg = v.dyn_cast()) { - return BufferAllocation::Slice(GetAllocation(arg, buffer_assignment), 0, + return BufferAllocation::Slice(&allocations[GetAllocationIndex(arg)], 0, size); } @@ -185,8 +185,8 @@ StatusOr GetAllocationSliceForMlir( } if (auto view = mlir::dyn_cast(op)) { return BufferAllocation::Slice( - GetAllocation(view.source().cast(), - buffer_assignment), + &allocations[GetAllocationIndex( + view.source().cast())], mlir::cast(view.byte_shift().getDefiningOp()) .value() .cast() @@ -202,12 +202,29 @@ StatusOr GetAllocationSliceForMlir( "StaticMemRefCastOp(ViewOp(arg))"); } -absl::string_view GetHloName(mlir::Operation* op) { - if (auto attr = op->getAttrOfType("name")) { - auto ref = attr.getValue(); - return absl::string_view(ref.data(), ref.size()); +StatusOr> GetMlirBufferSlices( + mlir::Operation* op, mlir::OperandRange operands, + absl::Span allocations) { + const auto buffer_is_written = [op](mlir::Value operand) { + llvm::SmallVector effects; + mlir::cast(op).getEffectsOnValue(operand, + effects); + return absl::c_any_of( + effects, [](const mlir::MemoryEffects::EffectInstance& instance) { + return mlir::isa(instance.getEffect()); + }); + }; + + std::vector slices; + for (mlir::Value operand : operands) { + slices.emplace_back(); + auto& slice = slices.back(); + TF_ASSIGN_OR_RETURN(slice.buffer_slice, + GetAllocationSliceForMlir(operand, allocations)); + slice.written = buffer_is_written(operand); + slice.shape = TypeToShape(operand.getType()); } - return ""; + return slices; } } // namespace @@ -229,6 +246,7 @@ StatusOr> IrEmitterUnnested::Create( auto emitter = std::unique_ptr(new IrEmitterUnnested( hlo_module_config, hlo_computation, ir_emitter_context)); TF_RETURN_IF_ERROR(emitter->lhlo_scratch_emitter_.Initialize()); + TF_RETURN_IF_ERROR(emitter->EmitConstants(*hlo_computation, true)); return std::move(emitter); } @@ -387,6 +405,62 @@ llvm::Type* GetIndexTypeForKernel(const HloInstruction* hlo, int64 launch_size, return b->getInt32Ty(); } +// The same as GetIndexTypeForKernel, but works with MLIR ops. 
+llvm::Type* GetIndexTypeForKernelFromMlir(mlir::Operation* op, + int64 launch_size, + llvm::IRBuilder<>* b) { + auto shape_in_range = [&](const Shape& s) { + bool in_range = true; + ShapeUtil::ForEachSubshape(s, [&](const Shape& sub_shape, + const ShapeIndex& /*index*/) { + if (sub_shape.IsArray() && !IsInt32(ShapeUtil::ElementsIn(sub_shape))) { + in_range = false; + } + }); + + return in_range; + }; + + llvm::Type* i64_ty = b->getInt64Ty(); + // Check launch dimension + if (!IsInt32(launch_size)) { + return i64_ty; + } + + // Check the size of result tensors + for (auto result : op->getResults()) { + if (!shape_in_range(TypeToShape(result.getType()))) { + return i64_ty; + } + } + + auto hlo_shape_in_range = [&](mlir::Value operand) -> bool { + return shape_in_range(TypeToShape(operand.getType())); + }; + + // Check the size of input tensors + if (!absl::c_all_of(op->getOperands(), hlo_shape_in_range)) { + return i64_ty; + } + + // Check the size of the internal result tensors + if (auto fusion = mlir::cast(op)) { + auto result = fusion.region().walk([&](mlir::Operation* op) { + for (mlir::Value result : op->getResults()) { + if (!hlo_shape_in_range(result)) { + return mlir::WalkResult::interrupt(); + } + } + return mlir::WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return i64_ty; + } + } + + return b->getInt32Ty(); +} + // Gets the input shape of the ROOT slices, which will be used as the kernel // launch dims. The slice input fusion requires the input shapes of the ROOT // slices to be the same although the (slice) output shapes can be different. @@ -1366,13 +1440,6 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { TF_ASSIGN_OR_RETURN(auto sort_op, lhlo_scratch_emitter_.EmitSortOp(sort)); result.op = sort_op; - result.name = GetHloName(sort_op); - // The name in sort op has no semantics, and it's for debug only. If the name - // doesn't exist, we should use a namer (e.g. count-based). - // TODO(timshen): use a namer instead of relying on the HloInstruction names. - if (result.name.empty()) { - result.name = sort->name(); - } const auto& buffer_assignment = ir_emitter_context_->buffer_assignment(); auto& slice = result.extra_slice; TF_ASSIGN_OR_RETURN(slice.buffer_slice, @@ -1382,74 +1449,57 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) { result.thunk_info = GetThunkInfo(sort); - return EmitMlirSort(result); + return EmitSortFromMlir(result); } -Status IrEmitterUnnested::EmitMlirSort(MlirEmitterInput input) { - const auto& buffer_assignment = ir_emitter_context_->buffer_assignment(); - auto sort_op = mlir::cast(input.op); - - int operand_count = sort_op.operands().size(); - std::vector operand_shapes(operand_count); - std::vector slices; - std::vector output_shapes(sort_op.output().size()); - - for (int i = 0; i < operand_count; i++) { - operand_shapes[i] = - TypeToShape(sort_op.operands()[i].getType().cast()); - } - - // Craft n + 1 slices, where the first n are output parameters, and the last - // is the on-device tuple storage. We don't need n operands because sorting - // kernels are always in-place. 
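As a standalone illustration of the index-type selection implemented in GetIndexTypeForKernelFromMlir above (simplified, with made-up sizes): 64-bit indices are chosen as soon as the launch size or any array's element count no longer fits in int32.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

static bool IsInt32(int64_t x) {
  return x >= std::numeric_limits<int32_t>::min() &&
         x <= std::numeric_limits<int32_t>::max();
}

int main() {
  const int64_t launch_size = int64_t{1} << 20;
  // Element counts of the kernel's inputs/outputs; one exceeds 2^31 - 1.
  const std::vector<int64_t> element_counts = {int64_t{3} << 30, 1024};
  bool use_i64 = !IsInt32(launch_size);
  for (int64_t count : element_counts) {
    use_i64 = use_i64 || !IsInt32(count);
  }
  std::printf("kernel index type: %s\n", use_i64 ? "i64" : "i32");  // i64
  return 0;
}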
- for (int i = 0; i < operand_count; i++) { - output_shapes[i] = - TypeToShape(sort_op.output()[i].getType().cast()); - MlirBufferSlice slice; - TF_ASSIGN_OR_RETURN( - slice.buffer_slice, - GetAllocationSliceForMlir(sort_op.output()[i], buffer_assignment)); - slice.written = true; - slice.shape = operand_shapes[i]; - slices.push_back(slice); - } - slices.push_back(input.extra_slice); +Status IrEmitterUnnested::EmitSortFromMlir(MlirEmitterInput mlir_input) { + absl::Span allocations( + ir_emitter_context_->buffer_assignment().Allocations()); + auto sort_op = mlir::cast(mlir_input.op); + std::string name = mlir::GetNameFromLoc(sort_op.getLoc()); + TF_ASSIGN_OR_RETURN( + std::vector operands, + GetMlirBufferSlices(sort_op, sort_op.operands(), allocations)); + TF_ASSIGN_OR_RETURN( + std::vector outputs, + GetMlirBufferSlices(sort_op, sort_op.output(), allocations)); + outputs.push_back(mlir_input.extra_slice); std::vector> thunks; - Shape keys_shape = operand_shapes[0]; - int64 dimension_to_sort = sort_op.dimension().getSExtValue(); - for (int64 i = 0; i < operand_count; ++i) { + Shape keys_shape = operands[0].shape; + int64 dimension_to_sort = sort_op.dimension(); + for (int64 i = 0; i < operands.size(); ++i) { // We assume that the layout of all involved operands and outputs is the // same. TF_RET_CHECK( - LayoutUtil::LayoutsInShapesEqual(keys_shape, operand_shapes[i])); + LayoutUtil::LayoutsInShapesEqual(keys_shape, operands[i].shape)); TF_RET_CHECK( - LayoutUtil::LayoutsInShapesEqual(keys_shape, output_shapes[i])); + LayoutUtil::LayoutsInShapesEqual(keys_shape, outputs[i].shape)); // If possible, we share buffers. If that is not possible, we need to copy // the values, because the emitter does the sorting in-place. TF_ASSIGN_OR_RETURN( auto destination_buffer, - GetAllocationSliceForMlir(sort_op.output()[i], buffer_assignment)); + GetAllocationSliceForMlir(sort_op.output()[i], allocations)); TF_ASSIGN_OR_RETURN( auto source_address, - GetAllocationSliceForMlir(sort_op.operands()[i], buffer_assignment)); + GetAllocationSliceForMlir(sort_op.operands()[i], allocations)); if (destination_buffer != source_address) { // TODO(b/26783907): Figure out why we never seem to share buffers for // key/value sort. - VLOG(2) << input.name << " requires initial D2D copy for operand " << i; + VLOG(2) << name << " requires initial D2D copy for operand " << i; thunks.push_back(absl::make_unique( Thunk::ThunkInfo(), /*source_address=*/source_address, /*destination_buffer=*/destination_buffer, - /*mem_size=*/ShapeUtil::ByteSizeOf(operand_shapes[i]))); + /*mem_size=*/ShapeUtil::ByteSizeOf(operands[i].shape))); } } uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort); int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound); - VLOG(2) << input.name << " requires " << num_stages << " stages."; + VLOG(2) << name << " requires " << num_stages << " stages."; CHECK_GE(1ULL << num_stages, dimension_to_sort_bound); CHECK_LT(1ULL << (num_stages - 1), dimension_to_sort_bound); @@ -1513,10 +1563,10 @@ Status IrEmitterUnnested::EmitMlirSort(MlirEmitterInput input) { // we have not enough threads, or not enough shared memory. Also it does not // give a speedup if the tile size is < 128. 
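A minimal standalone sketch of the stage-count arithmetic checked above: num_stages is the ceiling of log2 of the sort-dimension bound, so 2^num_stages is the first power of two that covers it (the example bound of 300 is arbitrary).

#include <cassert>
#include <cstdint>
#include <cstdio>

static int Log2Ceiling(uint64_t n) {
  int stages = 0;
  while ((uint64_t{1} << stages) < n) ++stages;
  return stages;
}

int main() {
  const uint64_t dimension_to_sort_bound = 300;
  const int num_stages = Log2Ceiling(dimension_to_sort_bound);            // 9
  assert((uint64_t{1} << num_stages) >= dimension_to_sort_bound);         // 512 >= 300
  assert((uint64_t{1} << (num_stages - 1)) < dimension_to_sort_bound);    // 256 < 300
  std::printf("bound=%llu needs %d bitonic sort stages\n",
              static_cast<unsigned long long>(dimension_to_sort_bound),
              num_stages);
  return 0;
}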
int64 total_shared_memory_needed = 0; - for (int64 i = 0; i < operand_count; ++i) { + for (int64 i = 0; i < operands.size(); ++i) { total_shared_memory_needed += kTileSize * - ShapeUtil::ByteSizeOfPrimitiveType(operand_shapes[i].element_type()); + ShapeUtil::ByteSizeOfPrimitiveType(operands[i].shape.element_type()); } bool no_tiling = kTileSize < 128 || @@ -1529,7 +1579,7 @@ Status IrEmitterUnnested::EmitMlirSort(MlirEmitterInput input) { "kTileSize=%d < 128, " "kThreadsPerBlock=%d > threads_per_block_limit=%d, " "total_shared_memory_needed=%d > shared_memory_per_block=%d", - input.name, (no_tiling ? "won't" : "will"), kTileSize, kThreadsPerBlock, + name, (no_tiling ? "won't" : "will"), kTileSize, kThreadsPerBlock, ir_emitter_context_->gpu_device_info().threads_per_block_limit, total_shared_memory_needed, ir_emitter_context_->gpu_device_info().shared_memory_per_block); @@ -1537,32 +1587,32 @@ Status IrEmitterUnnested::EmitMlirSort(MlirEmitterInput input) { uint64 num_blocks = CeilOfRatio(num_iterations, kThreadsPerBlock); LaunchDimensions tiled_launch_dimensions(num_blocks, kThreadsPerBlock); VLOG(2) << absl::StreamFormat("%s launch dims: %d blocks, %d threads/block", - input.name, num_blocks, kThreadsPerBlock); + name, num_blocks, kThreadsPerBlock); std::vector ir_arrays; auto emit_kernel = [&](absl::Span xor_masks) { VLOG(2) << absl::StreamFormat( - "%s uses kernel for xor masks [%s]", input.name, + "%s uses kernel for xor masks [%s]", name, absl::StrJoin(xor_masks, ", ", [](std::string* out, int64 xor_mask) { absl::StrAppendFormat(out, "0x%x", xor_mask); })); - thunks.push_back(BuildKernelThunkForMlir(input.name, Thunk::ThunkInfo(), - slices, &ir_arrays)); + thunks.push_back( + BuildKernelThunkForMlir(name, Thunk::ThunkInfo(), outputs, &ir_arrays)); LaunchDimensions launch_dimensions = xor_masks.size() > 1 ? tiled_launch_dimensions : standard_launch_dimensions; UpdateLaunchDimensions(launch_dimensions, thunks.back().get(), ir_emitter_context_->llvm_module()); std::vector values_arrays; - values_arrays.reserve(operand_count); - for (int64 i = 0; i < operand_count; ++i) { + values_arrays.reserve(operands.size()); + for (int64 i = 0; i < operands.size(); ++i) { values_arrays.push_back(ir_arrays[i]); } TF_ASSIGN_OR_RETURN( const HloComputation* comparator, GetOrCreateSubComputationFromRegion(&sort_op.comparator())); return llvm_ir::EmitSortInPlace( - dimension_to_sort, values_arrays, IrName(input.name), xor_masks, &b_, + dimension_to_sort, values_arrays, IrName(name), xor_masks, &b_, launch_dimensions, xor_masks.size() > 1 ? num_iterations_in_sort_dim : standard_num_iterations_in_sort_dim, @@ -1595,17 +1645,16 @@ Status IrEmitterUnnested::EmitMlirSort(MlirEmitterInput input) { TF_RETURN_IF_ERROR(emit_kernel(xor_masks)); } VLOG(2) << absl::StreamFormat( - "%s requires %d thunks (including any D2D copies)", input.name, - thunks.size()); + "%s requires %d thunks (including any D2D copies)", name, thunks.size()); - AddThunkToThunkSequence( - absl::make_unique(input.thunk_info, std::move(thunks))); - if (operand_count > 1) { + AddThunkToThunkSequence(absl::make_unique( + mlir_input.thunk_info, std::move(thunks))); + if (operands.size() > 1) { // Emit the tuple as part of the last stage of sorting. // We are currently in the block sorted.in_bounds.after. 
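The tiling decision above can be pictured with a standalone calculation; the tile size, thread count, and device limits below are placeholders rather than XLA's actual constants.

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t kTileSize = 2048;                     // assumed tile size
  const int64_t kThreadsPerBlock = 1024;              // assumed thread count
  const int64_t threads_per_block_limit = 1024;       // assumed device limit
  const int64_t shared_memory_per_block = 48 * 1024;  // assumed 48 KiB
  // Key/value sort with two f32 operands: 4 bytes per element, per operand.
  const int64_t total_shared_memory_needed = 2 * kTileSize * 4;  // 16 KiB
  const bool no_tiling = kTileSize < 128 ||
                         kThreadsPerBlock > threads_per_block_limit ||
                         total_shared_memory_needed > shared_memory_per_block;
  std::printf("tiled comparison loops used: %s\n", no_tiling ? "no" : "yes");
  return 0;
}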
b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator()); llvm_ir::EmitTuple( - ir_arrays[operand_count], + ir_arrays.back(), absl::MakeSpan(ir_arrays).subspan(0, ir_arrays.size() - 1), &b_); } return Status::OK(); @@ -1624,9 +1673,10 @@ Status IrEmitterUnnested::HandleReplicaId(HloInstruction* hlo) { } Status IrEmitterUnnested::HandleCollectivePermute(HloInstruction* hlo) { + CollectivePermuteConfig config = GetCollectivePermuteConfig(hlo); AddThunkToThunkSequence(absl::make_unique( - GetThunkInfo(hlo), GetAllocationSlice(*hlo->operand(0)), - GetAllocationSlice(*hlo))); + GetThunkInfo(hlo), std::move(config), + GetAllocationSlice(*hlo->operand(0)), GetAllocationSlice(*hlo))); return Status::OK(); } @@ -1658,9 +1708,10 @@ Status IrEmitterUnnested::HandleAllReduce(HloInstruction* crs) { *crs, crs->shape().IsTuple() ? ShapeIndex({i}) : ShapeIndex({})); tuple_element_buffers.push_back(buffers[i].destination_buffer); } + NcclAllReduceConfig config = + GetNcclAllReduceConfig(crs, hlo_module_config_.replica_count()); auto all_reduce_thunk = absl::make_unique( - GetThunkInfo(crs), - /*replica_count=*/hlo_module_config_.replica_count(), + GetThunkInfo(crs), std::move(config), /*buffers=*/std::move(buffers)); if (crs->shape().IsTuple()) { std::vector> thunks; @@ -2252,11 +2303,19 @@ StatusOr> IrEmitterUnnested::BuildWhileThunk( IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); + const auto* index_map = ir_emitter_context_->profile_index_map(); + absl::optional condition_profile_index, body_profile_index; + if (index_map) { + condition_profile_index = index_map->GetProfileIndexFor(*condition); + body_profile_index = index_map->GetProfileIndexFor(*body); + } + return std::unique_ptr(new WhileThunk( GetThunkInfo(hlo), GetAllocationSlice(*condition->root_instruction()), // cond result ir_emitter_condition->ConsumeThunkSequence(), - ir_emitter_body->ConsumeThunkSequence())); + ir_emitter_body->ConsumeThunkSequence(), condition_profile_index, + body_profile_index)); } StatusOr> IrEmitterUnnested::BuildForThunk( @@ -2272,8 +2331,15 @@ StatusOr> IrEmitterUnnested::BuildForThunk( IrEmitterUnnested::Create(hlo_module_config_, body, ir_emitter_context_)); TF_RETURN_IF_ERROR(body->Accept(ir_emitter_body.get())); + const auto* index_map = ir_emitter_context_->profile_index_map(); + absl::optional body_profile_index; + if (index_map) { + body_profile_index = index_map->GetProfileIndexFor(*body); + } + return std::unique_ptr(new ForThunk( - GetThunkInfo(hlo), loop_limit, ir_emitter_body->ConsumeThunkSequence())); + GetThunkInfo(hlo), loop_limit, ir_emitter_body->ConsumeThunkSequence(), + body_profile_index)); } StatusOr> IrEmitterUnnested::BuildConditionalThunk( @@ -2285,7 +2351,15 @@ StatusOr> IrEmitterUnnested::BuildConditionalThunk( std::vector branch_operands; std::vector branch_thunks; - for (int j = 0; j < hlo->branch_count(); ++j) { + std::vector> branch_profile_indices; + + int branch_count = hlo->branch_count(); + branch_thunks.reserve(branch_count); + branch_profile_indices.reserve(branch_count); + + const auto* index_map = ir_emitter_context_->profile_index_map(); + + for (int j = 0; j < branch_count; ++j) { branch_operands.emplace_back(GetAllocationSlice(*hlo->operand(j + 1))); HloComputation* branch_computation = hlo->branch_computation(j); TF_ASSIGN_OR_RETURN( @@ -2294,17 +2368,25 @@ StatusOr> IrEmitterUnnested::BuildConditionalThunk( ir_emitter_context_)); TF_CHECK_OK(branch_computation->Accept(ir_emitter.get())); 
branch_thunks.push_back(std::move(*ir_emitter->ConsumeThunkSequence())); + + absl::optional profile_index; + if (index_map) { + profile_index = index_map->GetProfileIndexFor(*branch_computation); + } + branch_profile_indices.push_back(profile_index); } + ConditionalThunkConfig config = GetConditionalThunkConfig( + hlo, std::move(branch_thunks), std::move(branch_profile_indices)); return std::unique_ptr(new ConditionalThunk( - GetThunkInfo(hlo), GetAllocationSlice(*hlo->operand(0)), branch_operands, - std::move(branch_thunks))); + GetThunkInfo(hlo), std::move(config), + GetAllocationSlice(*hlo->operand(0)), branch_operands)); } Status IrEmitterUnnested::EmitTargetElementLoopInThunk( const HloInstruction& hlo, const llvm_ir::ElementGenerator& element_generator, KernelThunk* thunk, - int unroll_factor) { + int unroll_factor, bool few_waves) { VLOG(3) << bindings_.ToString(); bool multi_output = hlo.shape().IsTuple(); @@ -2315,7 +2397,8 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk( << ShapeUtil::HumanStringWithLayout(hlo.shape()) << " for unroll_factor " << unroll_factor; LaunchDimensions launch_dimensions = CalculateLaunchDimensions( - element_shape, ir_emitter_context_->gpu_device_info(), unroll_factor); + element_shape, ir_emitter_context_->gpu_device_info(), unroll_factor, + few_waves); UpdateLaunchDimensions(launch_dimensions, thunk, ir_emitter_context_->llvm_module()); if (!multi_output) { @@ -2401,8 +2484,27 @@ Status IrEmitterUnnested::EmitTargetElementLoop( std::unique_ptr kernel_thunk = BuildKernelThunk(&hlo, /*implements_whole_instruction=*/true); + + // Check if we want to schedule grid size that has fewer SM waves. + // This speed up computations in some cases. + bool few_waves = false; + auto few_waves_allow_instr = [](const HloInstruction* instr) { + return instr->IsElementwise() || instr->opcode() == HloOpcode::kParameter || + // We need to make the codegen broadcast aware before enabling + // more broadcast pattern. 
+ (instr->opcode() == HloOpcode::kBroadcast && + instr->dimensions().empty()); + }; + if (hlo.opcode() == HloOpcode::kFusion) { + few_waves = + absl::c_all_of(hlo.fused_instructions_computation()->instructions(), + few_waves_allow_instr); + } else { + few_waves = few_waves_allow_instr(&hlo); + } + Status emit_status = EmitTargetElementLoopInThunk( - hlo, body_emitter, kernel_thunk.get(), unroll_factor); + hlo, body_emitter, kernel_thunk.get(), unroll_factor, few_waves); thunk_sequence_.emplace_back(std::move(kernel_thunk)); return emit_status; @@ -2886,7 +2988,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction( current_output); llvm::Value* warp_id = b_.CreateUDiv(thread_id_info.thread_id_x, constant(kWarpSize)); - ksl.If(is_zero(thread_id_info.lane_id), [&] { + ksl.If("intra_warp_reduce_write", is_zero(thread_id_info.lane_id), [&] { llvm::Value* shmem_output_addr = shared_to_global(b_.CreateInBoundsGEP( shared_cache, {b_.getInt32(0), constant(j), warp_id})); @@ -2894,7 +2996,7 @@ void IrEmitterUnnested::EmitEpilogueForReduction( }); EmitSyncThreads(); - ksl.If(is_zero(warp_id), [&] { + ksl.If("inter_warp_reduce", is_zero(warp_id), [&] { llvm::Value* block_accum_addr = shared_to_global(b_.CreateInBoundsGEP( shared_cache, {b_.getInt32(0), constant(j), thread_id_info.lane_id})); @@ -2914,10 +3016,11 @@ void IrEmitterUnnested::EmitEpilogueForReduction( EmitFullWarpShuffleDownLoopForReduce( reducers[i], element_type, /*block_accum_addr*/ selected_value); - ksl.If(is_zero(thread_id_info.thread_id_x), [&] { - TF_CHECK_OK(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, block_accum_addr)); - }); + ksl.If("reduction_atomic_update", is_zero(thread_id_info.thread_id_x), + [&] { + TF_CHECK_OK(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, block_accum_addr)); + }); }); } else { @@ -2952,10 +3055,11 @@ void IrEmitterUnnested::EmitEpilogueForReduction( b_.CreateICmpULT(thread_id_info.thread_id_x, tiling_kernel_info.output_tile_bounds[kDimY])); - ksl.If(b_.CreateAnd(has_output, is_zero(thread_id_info.lane_id)), [&] { - TF_CHECK_OK(EmitAtomicOperationForNestedComputation( - *reducers[i], output_address, shmem_transposed_addr)); - }); + ksl.If("reduction_atomic_update", + b_.CreateAnd(has_output, is_zero(thread_id_info.lane_id)), [&] { + TF_CHECK_OK(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, shmem_transposed_addr)); + }); } } } @@ -2991,6 +3095,28 @@ void IrEmitterUnnested::EmitPrintfWithThreadId( }); } +namespace { + +// Obtains the corresponding index of the out_instr in the outputs of the +// `unnested_hlo`. 
+ShapeIndex CreateShapeIndexForOutputInstruction( + const HloInstruction& unnested_hlo, const HloInstruction& out_instr) { + if (!unnested_hlo.IsMultiOutputFusion()) { + return ShapeIndex({}); + } + const auto& all_outputs = unnested_hlo.fused_expression_root()->operands(); + for (size_t i = 0; i < all_outputs.size(); ++i) { + if (all_outputs[i] == &out_instr) { + return ShapeIndex({static_cast(i)}); + } + } + LOG(FATAL) << " Fusion root does not contain output instruction; " + << " fusion: " << unnested_hlo.ToString() + << ", output instruction: " << out_instr.ToString(); +} + +} // namespace + void IrEmitterUnnested::EmitTileElementForReduction( HloInstruction* unnested_hlo, const Shape& reduction_operand_shape, absl::Span output_instructions, @@ -2998,7 +3124,6 @@ void IrEmitterUnnested::EmitTileElementForReduction( const ReductionCodegenInfo& reduction_info, absl::Span reducers, int64 x_iter_num) { VLOG(10) << "Emit tile element for reduce " << unnested_hlo->ToString(); - bool returns_tuple = output_instructions.size() > 1; int partial_result_index = reduction_info.IsRowReduction() ? 0 : x_iter_num; InlinedVector input_gens; @@ -3015,7 +3140,8 @@ void IrEmitterUnnested::EmitTileElementForReduction( for (int i = 0, e = output_instructions.size(); i != e; ++i) { const HloInstruction* inst = output_instructions[i]; - ShapeIndex idx = returns_tuple ? ShapeIndex({i}) : ShapeIndex({}); + ShapeIndex idx = + CreateShapeIndexForOutputInstruction(*unnested_hlo, *inst); if (IsReductionFromOrToContiguousDimensions(*inst)) { input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0))); } else { @@ -3748,71 +3874,41 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( reduction_dimensions.is_row_reduction); } -Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( +void IrEmitterUnnested::EmitIRForReduction( HloInstruction* unnested_hlo, - absl::Span output_instructions) { - bool returns_tuple = output_instructions.size() > 1; - VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString(); - + absl::Span output_instructions, + ReductionCodegenInfo* reduction_info, const Shape& input_shape) { std::vector reduce_instructions; InlinedVector reduction_output_shape_indices; InlinedVector reducers; - - // Build an initializer thunk to initialize each reduction output. - std::vector> thunks; - for (int i = 0; i < output_instructions.size(); ++i) { + for (size_t i = 0; i < output_instructions.size(); ++i) { if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { continue; } HloInstruction* output_instruction = output_instructions[i]; reduce_instructions.push_back(output_instruction); - ShapeIndex idx = returns_tuple ? ShapeIndex({i}) : ShapeIndex({}); - reduction_output_shape_indices.push_back(idx); + reduction_output_shape_indices.push_back( + CreateShapeIndexForOutputInstruction(*unnested_hlo, + *output_instruction)); reducers.push_back(output_instruction->to_apply()); - - TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, - BuildInitializerThunk(unnested_hlo, idx)); - thunks.push_back(std::move(initializer_thunk)); } + CHECK(reduce_instructions.size() != 0) + << " expect at least one reduce instructions."; - const HloInstruction* first_reduce = reduce_instructions.at(0); - if (output_instructions.size() > 1) { - if (!AreFusedReductionOutputsConsistent(output_instructions, - first_reduce)) { - return InternalError("Inconsistent reduction fusion outputs"); - } - } - - // Build a kernel thunk to compute all the outputs. 
- std::unique_ptr kernel_thunk = - BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false); - - const Shape& input_shape = first_reduce->operand(0)->shape(); - // The layout of a reduction input is either set by LayoutAssignment for - // unnested kReduce or by InstructionFusion for fused kReduce. - CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion " - "doesn't set the input layout of " - << first_reduce->ToString(); - - ReductionCodegenInfo reduction_info = - ComputeReductionCodegenInfo(unnested_hlo, first_reduce); const KernelMappingScheme& mapping_scheme = - reduction_info.GetKernelMappingScheme(); + reduction_info->GetKernelMappingScheme(); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); - VLOG(3) << "Launch dimensions of " << unnested_hlo->name() - << ": number of blocks: " << mapping_scheme.GetNumberOfBlocks() - << " - threads per block: " << mapping_scheme.GetThreadsPerBlock(); llvm::Type* index_ty = GetIndexTypeForKernel( unnested_hlo, launch_dimensions.launch_bound(), &b_); - EmitPrologueForReduction(unnested_hlo, &reduction_info, reduce_instructions, + EmitPrologueForReduction(unnested_hlo, reduction_info, reduce_instructions, index_ty); EmitElementFunction emit_reduction_tile = [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, int64 x_iter_num) { EmitTileElementForReduction(unnested_hlo, input_shape, - output_instructions, index, reduction_info, + output_instructions, index, *reduction_info, reducers, x_iter_num); }; @@ -3821,70 +3917,185 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( [&](const ThreadIdInfo& thread_id_info, const IrArray::Index& index, const string& loop_name, llvm::Value* tile_height, llvm::Value* tile_width, KernelSupportLibrary* ksl) { - EmitTile(reduction_info.GetKernelMappingScheme(), index, loop_name, ksl, - thread_id_info, tile_height, tile_width, emit_reduction_tile); + EmitTile(reduction_info->GetKernelMappingScheme(), index, loop_name, + ksl, thread_id_info, tile_height, tile_width, + emit_reduction_tile); }); - EmitEpilogueForReduction(index_ty, unnested_hlo, reduction_info, + EmitEpilogueForReduction(index_ty, unnested_hlo, *reduction_info, reduce_instructions, reduction_output_shape_indices, reducers, tiling_kernel_info); +} +namespace { + +// Returns whether the `instr` is either a constant, a scalar, or a +// broadcasted constant/scalar. +bool IsBroadcastedConstantOrScalar(const HloInstruction& instr) { + return instr.IsConstant() || ShapeUtil::IsScalar(instr.shape()) || + (HloOpcode::kBroadcast == instr.opcode() && + (instr.operand(0)->IsConstant() || + ShapeUtil::IsScalar(instr.operand(0)->shape()))); +} + +// Divides output_instructions into groups. Different groups will be executed +// in parallel. Generally speaking, we'd like to run the reduce instructions +// in parallel without incurring too much recomputation overhead. The current +// heuristic is to place reduce instructions who share nothing or only +// (broadcasted) scalars/constants into different groups; otherwise, they are +// placed in the same group. Non-reduce instructions always go with the reduce +// instructions into the same group so long as they share any predecessors. 
+std::vector> DivideOutputInstructionsIntoGroups( + HloInstruction* unnested_hlo, + absl::Span output_instructions) { + CHECK(!output_instructions.empty()); + if (output_instructions.size() == 1) { + return {{output_instructions[0]}}; + } + + std::vector> disjoint_sets( + output_instructions.size()); + for (size_t i = 0; i < output_instructions.size(); ++i) { + disjoint_sets[i].Get() = output_instructions[i]; + } + + std::unique_ptr reachability_map = + HloReachabilityMap::Build(unnested_hlo->fused_instructions_computation()); + for (auto* instr : unnested_hlo->fused_instructions()) { + std::vector reached_output_ids; + for (size_t oid = 0; oid < output_instructions.size(); ++oid) { + if (HloOpcode::kReduce == output_instructions[oid]->opcode() && + (IsBroadcastedConstantOrScalar(*instr))) { + // Do not group output reduce instructions through broadcasted + // constants or scalars, as the recomputation should be acceptable. + VLOG(3) << "Skip broadcasted constant or scalar " << instr->ToString(); + continue; + } + // Now group output instructions if they have common predecessors. + if (reachability_map->IsReachable(instr, output_instructions[oid])) { + VLOG(3) << "Reaching " << output_instructions[oid]->ToString() + << " from " << instr->ToString(); + reached_output_ids.push_back(oid); + } + } + for (size_t j = 1; j < reached_output_ids.size(); ++j) { + disjoint_sets[reached_output_ids[0]].Merge( + &disjoint_sets[reached_output_ids[j]]); + } + } + // Place output instructions in the same set into the same group. + absl::flat_hash_map> groups; + for (size_t oid = 0; oid < output_instructions.size(); ++oid) { + groups[disjoint_sets[oid].Get()].push_back(output_instructions.at(oid)); + } + + std::vector> ret; + absl::c_for_each( + groups, [&](auto& iter) { ret.emplace_back(std::move(iter.second)); }); + return ret; +} + +} // namespace + +Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( + HloInstruction* unnested_hlo, + absl::Span output_instructions) { + bool returns_tuple = output_instructions.size() > 1; + VLOG(10) << "Emitting reduction to vector " << unnested_hlo->ToString(); + + // Build an initializer thunk to initialize each reduction output. + std::vector> thunks; + for (int i = 0; i < output_instructions.size(); ++i) { + if (!IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { + continue; + } + + ShapeIndex idx = returns_tuple ? ShapeIndex({i}) : ShapeIndex({}); + TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, + BuildInitializerThunk(unnested_hlo, idx)); + thunks.push_back(std::move(initializer_thunk)); + } + + // Build a kernel thunk to compute all the outputs. + const HloInstruction* first_reduce = nullptr; + for (int i = 0; i < output_instructions.size(); ++i) { + if (IsReductionFromOrToContiguousDimensions(*output_instructions[i])) { + first_reduce = output_instructions[i]; + break; + } + } + CHECK(first_reduce); + if (output_instructions.size() > 1) { + if (!AreFusedReductionOutputsConsistent(output_instructions, + first_reduce)) { + return InternalError("Inconsistent reduction fusion outputs"); + } + } + const Shape& input_shape = first_reduce->operand(0)->shape(); + // The layout of a reduction input is either set by LayoutAssignment for + // unnested kReduce or by InstructionFusion for fused kReduce. + CHECK(input_shape.has_layout()) << "LayoutAssignment or InstructionFusion " + "doesn't set the input layout of " + << first_reduce->ToString(); + + // Group output instructions. Each group will be executed in parallel. 
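The grouping heuristic above can be sketched standalone with a small union-find: outputs reached from a common non-trivial predecessor are merged into one group, while outputs that only share broadcasted scalars or constants stay separate. The three-output example below is hypothetical.

#include <cstdio>
#include <map>
#include <numeric>
#include <vector>

struct UnionFind {
  std::vector<int> parent;
  explicit UnionFind(int n) : parent(n) {
    std::iota(parent.begin(), parent.end(), 0);
  }
  int Find(int x) { return parent[x] == x ? x : parent[x] = Find(parent[x]); }
  void Merge(int a, int b) { parent[Find(a)] = Find(b); }
};

int main() {
  // reaches[i] lists the fusion outputs reachable from the i-th shared,
  // non-scalar predecessor. Outputs 0 and 1 share a predecessor; output 2
  // only shares broadcasted scalars and is therefore never merged.
  const std::vector<std::vector<int>> reaches = {{0, 1}, {2}};
  UnionFind uf(3);
  for (const auto& outs : reaches) {
    for (size_t j = 1; j < outs.size(); ++j) uf.Merge(outs[0], outs[j]);
  }
  std::map<int, std::vector<int>> groups;
  for (int out = 0; out < 3; ++out) groups[uf.Find(out)].push_back(out);
  std::printf("%zu parallel reduction groups\n", groups.size());  // prints 2
  return 0;
}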
+ std::vector> instr_groups = + DivideOutputInstructionsIntoGroups(unnested_hlo, output_instructions); + VLOG(2) << StrCat("Generate in ", instr_groups.size(), " groups for ", + unnested_hlo->ToString()); + std::unique_ptr kernel_thunk = + BuildKernelThunk(unnested_hlo, /*implements_whole_instruction=*/false); + KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); + for (size_t i = 0; i < instr_groups.size(); ++i) { + // Create a new ReductionCodegenInfo instance as it contains states for + // code generation per reduction group. For now, let's always use the very + // first reduce as representative to construct ReductionCodegenInfo, since + // all the reductions are required to have the same shape and layout as + // verified by `AreFusedReductionOutputsConsistent()`. We can loosen the + // constraint later when the needs arise. + ReductionCodegenInfo reduction_info = + ComputeReductionCodegenInfo(unnested_hlo, first_reduce); + auto emit_reduction_func = [&] { + EmitIRForReduction(unnested_hlo, instr_groups[i], &reduction_info, + input_shape); + }; + // Use raw block_id_y to select the i-th parallel reduction to run. Using + // block_id_y instead of block_id_x simplifies the index calculation + // for reduction code generation as the block_id_y is orthogonal to + // the indices used within the reductions. + llvm::CallInst* raw_block_id_y = gpu::EmitCallToTargetIntrinsic( + gpu::TargetIntrinsicID::kBlockIdy, {}, {}, &b_); + llvm_ir::AddRangeMetadata(0, instr_groups.size(), + llvm::cast(raw_block_id_y)); + llvm::Value* guarding_cond = + b_.CreateICmpEQ(raw_block_id_y, b_.getInt32(i)); + ksl.If(StrCat("reduce-group-", i), guarding_cond, emit_reduction_func); + } + ReductionCodegenInfo reduction_info = + ComputeReductionCodegenInfo(unnested_hlo, first_reduce); + const KernelMappingScheme& mapping_scheme = + reduction_info.GetKernelMappingScheme(); + // block_y_count is set to instr_groups.size(), so that each reduction group + // can be run in parallel by a different BlockIdy. + LaunchDimensions launch_dimensions( + {/*x=*/mapping_scheme.GetNumberOfBlocks(), + /*y=*/static_cast(instr_groups.size()), + /*z=*/1}, + {/*x=*/mapping_scheme.GetThreadsPerBlock(), /*y=*/1, /*z=*/1}); + VLOG(3) << "Launch dimensions of " << unnested_hlo->name() + << ": number of blocks: " << mapping_scheme.GetNumberOfBlocks() + << " - threads per block: " << mapping_scheme.GetThreadsPerBlock(); UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), ir_emitter_context_->llvm_module()); thunks.push_back(std::move(kernel_thunk)); - auto sequential_thunk = absl::make_unique( - GetThunkInfo(unnested_hlo), std::move(thunks)); + std::unique_ptr sequential_thunk = + absl::make_unique(GetThunkInfo(unnested_hlo), + std::move(thunks)); AddThunkToThunkSequence(std::move(sequential_thunk)); return Status::OK(); } -Status IrEmitterUnnested::EmitConstantGlobals() { - for (const BufferAllocation& allocation : - ir_emitter_context_->buffer_assignment().Allocations()) { - if (!allocation.is_constant()) { - continue; - } - - const Literal& literal = llvm_ir::LiteralForConstantAllocation(allocation); - const bool should_emit_initializer = ShouldEmitLiteralInLlvmIr(literal); - llvm::ArrayType* global_type = - llvm::ArrayType::get(b_.getInt8Ty(), allocation.size()); - llvm::Constant* initializer = - should_emit_initializer - ? 
llvm_ir::ConvertLiteralToIrConstant(literal, module_) - : llvm::ConstantAggregateZero::get(global_type); - if (should_emit_initializer) { - VLOG(3) << "Emitted initializer for constant with shape " - << ShapeUtil::HumanString(literal.shape()); - } - - // These globals will be looked up by name by GpuExecutable so we need to - // give them an external linkage. Not all of their uses are visible in - // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that - // merely preserves their names (like available_externally), we also need - // to ensure that they stick around even if they're "unused". - // - // We may have to be more more clever here in the future if we notice that - // we're keeping around too many globals because of their linkage. - unsigned global_address_space = llvm_ir::GetGlobalMemoryAddressSpace( - *ir_emitter_context_->llvm_module()); - llvm::GlobalVariable* global_for_const = new llvm::GlobalVariable( - global_type, /*isConstant=*/should_emit_initializer, - llvm::GlobalValue::ExternalLinkage, - /*Initializer=*/initializer, - llvm_ir::ConstantBufferAllocationToGlobalName(allocation), - /*TLMode=*/llvm::GlobalValue::NotThreadLocal, - /*AddressSpace=*/global_address_space, - /*isExternallyInitialized=*/false); - global_for_const->setAlignment(llvm::Align(kConstantBufferAlignBytes)); - ir_emitter_context_->llvm_module()->getGlobalList().push_back( - global_for_const); - } - - return Status::OK(); -} - // Emits code for slices based on the below structure. An if statement with // a guarding condition is generated for each ROOT slice. // diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index b9146dd8fae..5cc5e206167 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -58,7 +58,6 @@ struct MlirBufferSlice : public BufferSlice { struct MlirEmitterInput { mlir::Operation* op; - absl::string_view name; Thunk::ThunkInfo thunk_info; MlirBufferSlice extra_slice; }; @@ -161,7 +160,7 @@ class IrEmitterUnnested : public IrEmitter, Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; Status HandleSort(HloInstruction* sort) override; - Status EmitMlirSort(MlirEmitterInput input); + Status EmitSortFromMlir(MlirEmitterInput mlir_input); Status HandleTriangularSolve(HloInstruction* hlo) override; Status HandleTupleSelect(HloInstruction* tuple_select) override; Status HandleAllReduce(HloInstruction* crs) override; @@ -178,10 +177,7 @@ class IrEmitterUnnested : public IrEmitter, // `unroll_factor` is greater than one. Status EmitTargetElementLoopInThunk( const HloInstruction& hlo, const llvm_ir::ElementGenerator& body_emitter, - KernelThunk* thunk, int unroll_factor); - - // Emits LLVM global variables corresponding to constant instructions. - Status EmitConstantGlobals(); + KernelThunk* thunk, int unroll_factor, bool few_waves = false); Status Postprocess(HloInstruction* hlo) override; @@ -372,6 +368,16 @@ class IrEmitterUnnested : public IrEmitter, // } // ``` // + // Moreover, a heuristic is implemented to divide the reduce instructions + // into groups for parallelization (see `DivideOutputInstructionsIntoGroups` + // for details about the heuristic.) Reduce instructions in the same group + // will run sequentially while different groups will run in parallel. 
+ // + // We use raw block_id_y to select the reduce groups for execution without + // complicating the index calculation in the code generation of the reduce + // instructions. In other words, a block_id_y is assigned to a group and so + // different groups can be run in parallel. + // + // output_instructions: Output instructions in the computation: instruction + // itself if it's not a fusion, fusion root if fusion is not multi-output, and + // elements of the fusion multi-output tuple otherwise. @@ -404,11 +410,10 @@ class IrEmitterUnnested : public IrEmitter, // the process. `scatter` may be fused, scatter indices are taken from // `scatter_indices_gen`, updates from `updates_gen`. The output buffer is // expected to have the operand values in it already. If unique_indices - // is false, we will use an atomic update. Using false for unique_indices - // is safe only when it is guaranteed that there are no duplicate - // indices. - // When using unique_indices=true, it is the caller's responsibility to - // ensure there is no overlap. + // is false, we will use an atomic update. Using true for unique_indices + // behaves properly only when it is guaranteed that the indices to be + // updated do not overlap. The caller is responsible for ensuring this is + // the case. Status EmitScatter(Thunk* thunk, HloInstruction* scatter, const llvm_ir::ElementGenerator& scatter_indices_gen, const llvm_ir::ElementGenerator& updates_gen); @@ -519,6 +524,12 @@ class IrEmitterUnnested : public IrEmitter, absl::Span<HloComputation* const> reducers, const TilingKernelInfo& tiling_kernel_info); + // Emits code for reductions in the output_instructions. + void EmitIRForReduction(HloInstruction* unnested_hlo, + absl::Span<HloInstruction* const> output_instructions, + ReductionCodegenInfo* reduction_info, + const Shape& input_shape); + // For each reducer, emits the shuffle-down loop to accumulate the partial + result to the global result. 
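[Editor's note] The revised EmitScatter comment above says an atomic update is emitted whenever unique_indices is false. A toy C++ stand-in for the generated update, only to illustrate why duplicate indices require the atomic path; none of these names exist in XLA.

    #include <atomic>
    #include <cstdint>
    #include <vector>

    // When indices may collide, the combine step must be atomic or concurrent
    // updates from different threads are lost; when the caller guarantees
    // unique indices, a plain read-modify-write is enough.
    void ScatterAddExample(std::vector<std::atomic<int32_t>>& output,
                           const std::vector<int64_t>& indices,
                           const std::vector<int32_t>& updates,
                           bool unique_indices) {
      for (size_t i = 0; i < indices.size(); ++i) {
        std::atomic<int32_t>& slot = output[indices[i]];
        if (unique_indices) {
          slot.store(slot.load() + updates[i]);  // safe only without duplicates
        } else {
          slot.fetch_add(updates[i]);            // atomic; duplicates are fine
        }
      }
    }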
void EmitFullWarpShuffleDownLoopForAllReduces( diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index 19fef37db7e..6c138258aa0 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -115,9 +115,8 @@ Status KernelThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); - return ExecuteKernelOnStream(*kernel, buffer_args, - launch_dimensions.threads_per_block(), - launch_dimensions.block_count(), params.stream); + return ExecuteKernelOnStream(*kernel, buffer_args, launch_dimensions, + params.stream); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc b/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc index 3668a521ec7..5dbbb2d65da 100644 --- a/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc +++ b/tensorflow/compiler/xla/service/gpu/launch_dimensions.cc @@ -26,8 +26,11 @@ namespace gpu { std::ostream& operator<<(std::ostream& out, const LaunchDimensions& launch_dims) { - out << absl::StrFormat("[block: %d, thread: %d]", launch_dims.block_count(), - launch_dims.threads_per_block()); + LaunchDimensions::Dim3D block_counts = launch_dims.block_counts(); + LaunchDimensions::Dim3D thread_counts = launch_dims.thread_counts_per_block(); + out << absl::StrFormat("[block: {%d, %d, %d}, thread: {%d, %d, %d}]", + block_counts.x, block_counts.y, block_counts.z, + thread_counts.x, thread_counts.y, thread_counts.z); return out; } @@ -53,7 +56,7 @@ static int64 ThreadsPerBlockLimit(GpuDeviceInfo gpu_device_info) { // Calculates the launch dimensions used to invoke `hlo`. LaunchDimensions CalculateLaunchDimensions(const Shape& shape, GpuDeviceInfo gpu_device_info, - int unroll_factor) { + int unroll_factor, bool few_waves) { int64 num_elements = ShapeUtil::ElementsIn(shape); if (num_elements <= 1) { return LaunchDimensions(); @@ -87,6 +90,11 @@ LaunchDimensions CalculateLaunchDimensions(const Shape& shape, } int64 block_count = CeilOfRatio(num_elements, threads_per_block); + if (few_waves) { + threads_per_block = std::min(threads_per_block, int64{128}); + block_count = gpu_device_info.core_count * + (gpu_device_info.threads_per_core_limit / threads_per_block); + } VLOG(2) << absl::StrFormat( "Initialized the block count to ceil(# of elements / threads per " "block) = ceil(%d/%d) = %d", diff --git a/tensorflow/compiler/xla/service/gpu/launch_dimensions.h b/tensorflow/compiler/xla/service/gpu/launch_dimensions.h index 1a5a9d618e4..1472141a80e 100644 --- a/tensorflow/compiler/xla/service/gpu/launch_dimensions.h +++ b/tensorflow/compiler/xla/service/gpu/launch_dimensions.h @@ -29,24 +29,37 @@ namespace gpu { // number of threads per block. class LaunchDimensions { public: + struct Dim3D { + int64 x, y, z; + }; + // The default constructor creates a launch dimension that indicate // single-threaded execution. 
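[Editor's note] The few_waves path added to CalculateLaunchDimensions above caps the block size at 128 threads and sizes the grid from the device description instead of the element count. A small sketch with made-up device numbers (80 cores, 2048 resident threads per core); the values are assumptions for illustration only.

    #include <algorithm>
    #include <cstdint>

    // Mirrors the few_waves arithmetic above with hypothetical device limits.
    int64_t FewWavesBlockCount() {
      const int64_t core_count = 80;
      const int64_t threads_per_core_limit = 2048;
      int64_t threads_per_block = 256;  // whatever the normal path picked
      threads_per_block = std::min<int64_t>(threads_per_block, 128);
      // 80 * (2048 / 128) = 1280 blocks: enough to occupy the device for a
      // few waves regardless of element count; leftover elements are handled
      // by the in-kernel loop that ParallelLoopEmitter::EmitLoop adds.
      return core_count * (threads_per_core_limit / threads_per_block);
    }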
- LaunchDimensions() : block_count_(1), threads_per_block_(1) {} + LaunchDimensions() + : block_counts_({1, 1, 1}), thread_counts_per_block_({1, 1, 1}) {} - LaunchDimensions(int64 block_count, int64 threads_per_block) - : block_count_(block_count), threads_per_block_(threads_per_block) {} + LaunchDimensions(int64 block_x_count, int64 thread_x_count_per_block) + : block_counts_({block_x_count, 1, 1}), + thread_counts_per_block_({thread_x_count_per_block, 1, 1}) {} - bool IsSinglethreaded() const { - return block_count_ == 1 && threads_per_block_ == 1; + LaunchDimensions(const Dim3D& block_counts, + const Dim3D& thread_counts_per_block) + : block_counts_(block_counts), + thread_counts_per_block_(thread_counts_per_block) {} + + Dim3D block_counts() const { return block_counts_; } + + Dim3D thread_counts_per_block() const { return thread_counts_per_block_; } + + int64 launch_bound() const { + return block_counts_.x * thread_counts_per_block_.x * block_counts_.y * + thread_counts_per_block_.y * block_counts_.z * + thread_counts_per_block_.z; } - int64 block_count() const { return block_count_; } - int64 threads_per_block() const { return threads_per_block_; } - int64 launch_bound() const { return block_count() * threads_per_block(); } - private: - int64 block_count_; - int64 threads_per_block_; + Dim3D block_counts_; + Dim3D thread_counts_per_block_; }; std::ostream& operator<<(std::ostream& out, @@ -54,7 +67,8 @@ std::ostream& operator<<(std::ostream& out, LaunchDimensions CalculateLaunchDimensions(const Shape& shape, GpuDeviceInfo gpu_device_info, - int unroll_factor = 1); + int unroll_factor = 1, + bool few_waves = false); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index c3ef02a04f2..eb6291172fe 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 04af67a70b9..36b676565b5 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -149,7 +149,7 @@ std::unique_ptr GetTargetMachine( } llvm::TargetOptions target_options = - llvm::codegen::InitTargetOptionsFromCodeGenFlags(); + llvm::codegen::InitTargetOptionsFromCodeGenFlags(llvm::Triple()); // Set the verbose assembly options. 
target_options.MCOptions.AsmVerbose = false; diff --git a/tensorflow/compiler/xla/service/gpu/memset_thunk.h b/tensorflow/compiler/xla/service/gpu/memset_thunk.h index 8a1890a0769..fb18b7041b7 100644 --- a/tensorflow/compiler/xla/service/gpu/memset_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/memset_thunk.h @@ -55,7 +55,7 @@ class Memset32BitValueThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - uint32 value_; + const uint32 value_; const BufferAllocation::Slice dest_; }; diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc index fcbd9e760c6..fa73ac261f8 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc @@ -30,6 +30,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" +#include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/types.h" @@ -104,6 +105,13 @@ HloInstruction* SelectPreferredFusionCandidate( std::vector GetProducerConsumerMultiOutputFusionCandidates( const HloInstruction* producer, const HloReachabilityMap& reachability) { std::vector fusion_candidates; + // If there is only one user, and it is not a multi-output fusion node, this + // fusion possibility was already considered and rejected by the FusionMerger + // pass. No need to try again! + if (producer->user_count() == 1 && + !producer->users()[0]->IsMultiOutputFusion()) { + return fusion_candidates; + } for (HloInstruction* consumer : producer->users()) { VLOG(3) << "Looking at producer " << producer->name() << " and its consumer " << consumer->name(); @@ -141,6 +149,16 @@ std::vector GetProducerConsumerMultiOutputFusionCandidates( << " would be too large of a fusion."; continue; } + // Make sure the emitter can codegen the fusion op efficiently. We currently + // can have exponential time/memory requirements for emitting certain fusion + // ops, in which case we don't want to fuse. + // TODO(b/119692968): Remove this once fixed in the emitter. + if (FusedIrEmitter::IsFusedIrEmitterInefficient(consumer, producer)) { + VLOG(3) << "Fusion of " << producer->name() << " into " + << consumer->name() + << " would result in overly large code duplication."; + continue; + } fusion_candidates.push_back(consumer); } return fusion_candidates; diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc index 4d269665b42..6cb66290a9a 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc @@ -798,6 +798,86 @@ TEST_F(MultiOutputFusionTest, MultiOutputFusionDUS) { // Check that we don't fuse too many reductions together. 
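[Editor's note] The SharedMemoryBudget test below exercises the comment above: merging too many reductions into one multi-output fusion would exceed the per-block shared-memory budget. The sketch only shows the flavor of such a check; the 32x32 float tile per output and the 48 KB cap are assumed numbers, not the pass's actual accounting.

    #include <cstdint>

    // Back-of-the-envelope only: one staging tile per reduction output must
    // fit under a fixed per-block shared-memory cap.
    bool FitsSharedMemoryBudget(int64_t num_reduce_outputs) {
      constexpr int64_t kBytesPerOutput = 32 * 32 * sizeof(float);  // 4 KB
      constexpr int64_t kBudgetBytes = 48 * 1024;                   // assumed cap
      return num_reduce_outputs * kBytesPerOutput <= kBudgetBytes;
    }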
TEST_F(MultiOutputFusionTest, SharedMemoryBudget) { auto module = ParseAndReturnVerifiedModule(absl::StrCat(kModulePrefix, R"( + fused_computation0 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation1 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation2 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation3 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation4 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation5 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation6 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation7 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation8 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } + fused_computation9 { + p0 = f32[64,64] parameter(0) + p1 = f32[64,64] parameter(1) + p2 = f32[] parameter(2) + add = f32[64,64] add(p0, p1) + ROOT reduce = f32[64] reduce(f32[64,64] add, f32[] p2), dimensions={0}, + to_apply=scalar_add_computation + } ENTRY computation { zero = f32[] constant(0) param0 = f32[64,64] parameter(0) @@ -810,36 +890,16 @@ TEST_F(MultiOutputFusionTest, SharedMemoryBudget) { param7 = f32[64,64] parameter(7) param8 = f32[64,64] parameter(8) param9 = f32[64,64] parameter(9) - add0 = f32[64,64] add(param0, param1) - add1 = f32[64,64] add(param1, param2) - add2 = f32[64,64] add(param2, param3) - add3 = f32[64,64] add(param3, param4) - add4 = f32[64,64] add(param4, param5) - add5 = f32[64,64] add(param5, param6) - add6 = f32[64,64] add(param6, param7) - add7 = f32[64,64] add(param7, param8) - add8 = f32[64,64] add(param8, param9) - add9 = f32[64,64] add(param9, param0) - out0 = f32[64] reduce(f32[64,64] add0, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out1 = f32[64] reduce(f32[64,64] add1, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out2 = f32[64] 
reduce(f32[64,64] add2, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out3 = f32[64] reduce(f32[64,64] add3, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out4 = f32[64] reduce(f32[64,64] add4, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out5 = f32[64] reduce(f32[64,64] add5, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out6 = f32[64] reduce(f32[64,64] add6, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out7 = f32[64] reduce(f32[64,64] add7, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out8 = f32[64] reduce(f32[64,64] add8, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation - out9 = f32[64] reduce(f32[64,64] add9, f32[] zero), dimensions={0}, - to_apply=scalar_add_computation + out0 = f32[64] fusion(param0, param1, zero), kind=kInput, calls=fused_computation0 + out1 = f32[64] fusion(param1, param2, zero), kind=kInput, calls=fused_computation1 + out2 = f32[64] fusion(param2, param3, zero), kind=kInput, calls=fused_computation2 + out3 = f32[64] fusion(param3, param4, zero), kind=kInput, calls=fused_computation3 + out4 = f32[64] fusion(param4, param5, zero), kind=kInput, calls=fused_computation4 + out5 = f32[64] fusion(param5, param6, zero), kind=kInput, calls=fused_computation5 + out6 = f32[64] fusion(param6, param7, zero), kind=kInput, calls=fused_computation6 + out7 = f32[64] fusion(param7, param8, zero), kind=kInput, calls=fused_computation7 + out8 = f32[64] fusion(param8, param9, zero), kind=kInput, calls=fused_computation8 + out9 = f32[64] fusion(param9, param0, zero), kind=kInput, calls=fused_computation9 ROOT out = (f32[64], f32[64], f32[64], f32[64], f32[64], f32[64], f32[64], f32[64], f32[64]) tuple(f32[64] out0, f32[64] out1, f32[64] out2, f32[64] out3, f32[64] out4, f32[64] out5, f32[64] out6, f32[64] out7, f32[64] out8, f32[64] out9) } )")) @@ -849,5 +909,165 @@ TEST_F(MultiOutputFusionTest, SharedMemoryBudget) { EXPECT_EQ(2, CountMultiOutputFusions(module.get())); } +TEST_F(MultiOutputFusionTest, NoFusionToAvoidCodeDuplication) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule module + +and.reduce_sub_computation { + x = pred[] parameter(0) + y = pred[] parameter(1) + ROOT and = pred[] and(x, y) +} + +fused_computation.1 { + param_4.658 = f32[2,20,256]{2,0,1} parameter(4) + slice.1385 = f32[2,1,256]{2,0,1} slice(param_4.658), slice={[0:2], [11:12], [0:256]} + constant.6847 = s32[] constant(0) + broadcast.4823 = s32[3]{0} broadcast(constant.6847), dimensions={} + param_9.415 = s32[3]{0} parameter(9) + compare.700 = pred[3]{0} compare(broadcast.4823, param_9.415), direction=LE + constant.6846 = pred[] constant(true) + reduce.221 = pred[] reduce(compare.700, constant.6846), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2933 = pred[2,1,256]{2,0,1} broadcast(reduce.221), dimensions={} + param_5.528 = f32[2,512]{1,0} parameter(5) + slice.1384 = f32[2,256]{1,0} slice(param_5.528), slice={[0:2], [0:256]} + bitcast.341 = f32[2,1,256]{2,0,1} bitcast(slice.1384) + constant.5418 = f32[] constant(0) + broadcast.3227 = f32[2,1,256]{2,0,1} broadcast(constant.5418), dimensions={} + select.173 = f32[2,1,256]{2,0,1} select(broadcast.2933, bitcast.341, broadcast.3227) + add.573 = f32[2,1,256]{2,0,1} add(slice.1385, select.173) + param_0.299 = s32[] parameter(0) + constant.5157 = s32[] constant(11) + dynamic-update-slice.189 = f32[2,20,256]{2,0,1} dynamic-update-slice(param_4.658, add.573, param_0.299, constant.5157, param_0.299) + 
slice.1383 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.189), slice={[0:2], [10:11], [0:256]} + constant.6800 = s32[] constant(0) + broadcast.4803 = s32[3]{0} broadcast(constant.6800), dimensions={} + param_8.484 = s32[3]{0} parameter(8) + compare.681 = pred[3]{0} compare(broadcast.4803, param_8.484), direction=LE + constant.6798 = pred[] constant(true) + reduce.203 = pred[] reduce(compare.681, constant.6798), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2932 = pred[2,1,256]{2,0,1} broadcast(reduce.203), dimensions={} + param_3.1169 = f32[2,512]{1,0} parameter(3) + slice.1382 = f32[2,256]{1,0} slice(param_3.1169), slice={[0:2], [0:256]} + bitcast.340 = f32[2,1,256]{2,0,1} bitcast(slice.1382) + select.172 = f32[2,1,256]{2,0,1} select(broadcast.2932, bitcast.340, broadcast.3227) + add.572 = f32[2,1,256]{2,0,1} add(slice.1383, select.172) + constant.5154 = s32[] constant(10) + dynamic-update-slice.188 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.189, add.572, param_0.299, constant.5154, param_0.299) + slice.1381 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.188), slice={[0:2], [9:10], [0:256]} + constant.6794 = s32[] constant(0) + broadcast.4801 = s32[3]{0} broadcast(constant.6794), dimensions={} + param_7.478 = s32[3]{0} parameter(7) + compare.679 = pred[3]{0} compare(broadcast.4801, param_7.478), direction=LE + constant.6793 = pred[] constant(true) + reduce.201 = pred[] reduce(compare.679, constant.6793), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2930 = pred[2,1,256]{2,0,1} broadcast(reduce.201), dimensions={} + param_2.1685 = f32[2,512]{1,0} parameter(2) + slice.1380 = f32[2,256]{1,0} slice(param_2.1685), slice={[0:2], [0:256]} + bitcast.339 = f32[2,1,256]{2,0,1} bitcast(slice.1380) + select.171 = f32[2,1,256]{2,0,1} select(broadcast.2930, bitcast.339, broadcast.3227) + add.571 = f32[2,1,256]{2,0,1} add(slice.1381, select.171) + constant.5153 = s32[] constant(9) + dynamic-update-slice.187 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.188, add.571, param_0.299, constant.5153, param_0.299) + slice.1379 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.187), slice={[0:2], [8:9], [0:256]} + constant.6788 = s32[] constant(0) + broadcast.4799 = s32[3]{0} broadcast(constant.6788), dimensions={} + param_6.495 = s32[3]{0} parameter(6) + compare.677 = pred[3]{0} compare(broadcast.4799, param_6.495), direction=LE + constant.6786 = pred[] constant(true) + reduce.199 = pred[] reduce(compare.677, constant.6786), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2929 = pred[2,1,256]{2,0,1} broadcast(reduce.199), dimensions={} + param_1.1408 = f32[2,512]{1,0} parameter(1) + slice.1378 = f32[2,256]{1,0} slice(param_1.1408), slice={[0:2], [0:256]} + bitcast.338 = f32[2,1,256]{2,0,1} bitcast(slice.1378) + select.170 = f32[2,1,256]{2,0,1} select(broadcast.2929, bitcast.338, broadcast.3227) + add.570 = f32[2,1,256]{2,0,1} add(slice.1379, select.170) + constant.5152 = s32[] constant(8) + ROOT dynamic-update-slice.186 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.187, add.570, param_0.299, constant.5152, param_0.299) +} + +fused_computation.2 { + param_4.655 = f32[2,20,256]{2,0,1} parameter(4) + slice.1369 = f32[2,1,256]{2,0,1} slice(param_4.655), slice={[0:2], [7:8], [0:256]} + param_6.483 = pred[] parameter(6) + broadcast.2927 = pred[2,1,256]{2,0,1} broadcast(param_6.483), dimensions={} + param_5.525 = f32[2,512]{1,0} parameter(5) + slice.1368 = f32[2,256]{1,0} slice(param_5.525), 
slice={[0:2], [0:256]} + bitcast.333 = f32[2,1,256]{2,0,1} bitcast(slice.1368) + constant.5415 = f32[] constant(0) + broadcast.3225 = f32[2,1,256]{2,0,1} broadcast(constant.5415), dimensions={} + select.161 = f32[2,1,256]{2,0,1} select(broadcast.2927, bitcast.333, broadcast.3225) + add.549 = f32[2,1,256]{2,0,1} add(slice.1369, select.161) + param_0.265 = s32[] parameter(0) + constant.5151 = s32[] constant(7) + dynamic-update-slice.185 = f32[2,20,256]{2,0,1} dynamic-update-slice(param_4.655, add.549, param_0.265, constant.5151, param_0.265) + slice.1367 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.185), slice={[0:2], [6:7], [0:256]} + constant.6782 = s32[] constant(0) + broadcast.4797 = s32[3]{0} broadcast(constant.6782), dimensions={} + param_9.391 = s32[3]{0} parameter(9) + compare.675 = pred[3]{0} compare(broadcast.4797, param_9.391), direction=LE + constant.6781 = pred[] constant(true) + reduce.197 = pred[] reduce(compare.675, constant.6781), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2926 = pred[2,1,256]{2,0,1} broadcast(reduce.197), dimensions={} + param_3.1167 = f32[2,512]{1,0} parameter(3) + slice.1366 = f32[2,256]{1,0} slice(param_3.1167), slice={[0:2], [0:256]} + bitcast.332 = f32[2,1,256]{2,0,1} bitcast(slice.1366) + select.160 = f32[2,1,256]{2,0,1} select(broadcast.2926, bitcast.332, broadcast.3225) + add.548 = f32[2,1,256]{2,0,1} add(slice.1367, select.160) + constant.5150 = s32[] constant(6) + dynamic-update-slice.184 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.185, add.548, param_0.265, constant.5150, param_0.265) + slice.1365 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.184), slice={[0:2], [5:6], [0:256]} + constant.6776 = s32[] constant(0) + broadcast.4794 = s32[3]{0} broadcast(constant.6776), dimensions={} + param_8.464 = s32[3]{0} parameter(8) + compare.673 = pred[3]{0} compare(broadcast.4794, param_8.464), direction=LE + constant.6775 = pred[] constant(true) + reduce.195 = pred[] reduce(compare.673, constant.6775), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2925 = pred[2,1,256]{2,0,1} broadcast(reduce.195), dimensions={} + param_2.1684 = f32[2,512]{1,0} parameter(2) + slice.1364 = f32[2,256]{1,0} slice(param_2.1684), slice={[0:2], [0:256]} + bitcast.331 = f32[2,1,256]{2,0,1} bitcast(slice.1364) + select.159 = f32[2,1,256]{2,0,1} select(broadcast.2925, bitcast.331, broadcast.3225) + add.547 = f32[2,1,256]{2,0,1} add(slice.1365, select.159) + constant.5149 = s32[] constant(5) + dynamic-update-slice.183 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.184, add.547, param_0.265, constant.5149, param_0.265) + slice.1363 = f32[2,1,256]{2,0,1} slice(dynamic-update-slice.183), slice={[0:2], [4:5], [0:256]} + constant.6770 = s32[] constant(0) + broadcast.4792 = s32[3]{0} broadcast(constant.6770), dimensions={} + param_7.458 = s32[3]{0} parameter(7) + compare.671 = pred[3]{0} compare(broadcast.4792, param_7.458), direction=LE + constant.6769 = pred[] constant(true) + reduce.193 = pred[] reduce(compare.671, constant.6769), dimensions={0}, to_apply=and.reduce_sub_computation + broadcast.2924 = pred[2,1,256]{2,0,1} broadcast(reduce.193), dimensions={} + param_1.1405 = f32[2,512]{1,0} parameter(1) + slice.1362 = f32[2,256]{1,0} slice(param_1.1405), slice={[0:2], [0:256]} + bitcast.330 = f32[2,1,256]{2,0,1} bitcast(slice.1362) + select.158 = f32[2,1,256]{2,0,1} select(broadcast.2924, bitcast.330, broadcast.3225) + add.546 = f32[2,1,256]{2,0,1} add(slice.1363, select.158) + constant.5148 = s32[] 
constant(4) + ROOT dynamic-update-slice.182 = f32[2,20,256]{2,0,1} dynamic-update-slice(dynamic-update-slice.183, add.546, param_0.265, constant.5148, param_0.265) +} + +ENTRY main { + param_0.0 = s32[] parameter(0) + param_1.0 = f32[2,512]{1,0} parameter(1) + param_2.0 = f32[2,512]{1,0} parameter(2) + param_3.0 = f32[2,512]{1,0} parameter(3) + param_4.0 = f32[2,20,256]{2,1,0} parameter(4) + param_5.0 = f32[2,512]{1,0} parameter(5) + param_6.0 = s32[3]{0} parameter(6) + param_7.0 = s32[3]{0} parameter(7) + param_8.0 = s32[3]{0} parameter(8) + param_9.0 = s32[3]{0} parameter(9) + fusion.1 = f32[2,20,256]{2,0,1} fusion(param_0.0, param_1.0, param_2.0, param_3.0, param_4.0, param_5.0, param_6.0, param_7.0, param_8.0, param_9.0), kind=kLoop, calls=fused_computation.1 + param_10 = pred[] parameter(10) + fusion.2 = f32[2,20,256]{2,0,1} fusion(param_0.0, param_1.0, param_2.0, param_3.0, fusion.1, param_5.0, param_10, param_7.0, param_8.0, param_9.0), kind=kLoop, calls=fused_computation.2 + ROOT root = (f32[2,20,256]{2,0,1}, f32[2,20,256]{2,0,1}) tuple(fusion.1, fusion.2) +} + )") + .ValueOrDie(); + EXPECT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie()); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc index 25ab9a7ce6e..b13f71c5a13 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.cc @@ -514,11 +514,40 @@ void RendezvousNcclAllReduce::CleanupImpl(std::shared_ptr handle, // header. In particular, this stores the thunk's cache of all NcclCliques it's // ever used. This causes those cliques to stay alive as long as the thunk // lives, which is how we avoid expensive reinitialization of NCCL cliques. 
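[Editor's note] The NcclAllReduceThunk changes that follow replace the stored HloInstruction pointer with an NcclAllReduceConfig captured at construction time. A simplified sketch of that pattern with fake types; none of these are the real XLA classes.

    #include <cstdint>
    #include <utility>
    #include <vector>

    struct FakeInstr {                 // stand-in for HloInstruction
      std::vector<int> operand_types;
      int64_t channel_id;
    };

    struct FakeAllReduceConfig {       // stand-in for NcclAllReduceConfig
      std::vector<int> operand_types;
      int64_t op_id;
    };

    // Everything the runtime needs is copied out of the instruction once,
    // so the thunk never dereferences the HLO graph at execution time.
    FakeAllReduceConfig MakeConfig(const FakeInstr& instr) {
      return FakeAllReduceConfig{instr.operand_types, instr.channel_id};
    }

    class FakeAllReduceThunk {
     public:
      explicit FakeAllReduceThunk(FakeAllReduceConfig config)
          : config_(std::move(config)) {}
      int64_t op_id() const { return config_.op_id; }  // used at execute time

     private:
      const FakeAllReduceConfig config_;
    };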
-struct NcclAllReduceThunk::AuxData { +struct NcclAllReduceConfig::AuxData { tensorflow::mutex mu; absl::flat_hash_set> cliques TF_GUARDED_BY(mu); }; +NcclAllReduceConfig::NcclAllReduceConfig(NcclAllReduceConfig&&) = default; +NcclAllReduceConfig::~NcclAllReduceConfig() = default; + +NcclAllReduceConfig GetNcclAllReduceConfig(const HloInstruction* instr, + int64 replica_count) { + NcclAllReduceConfig config; + config.operand_count = instr->operands().size(); + config.operand_element_type.reserve(config.operand_count); + for (int i = 0; i < config.operand_count; i++) { + config.operand_element_type.push_back( + instr->operand(i)->shape().element_type()); + } + config.replica_count = replica_count; + config.replica_groups = instr->replica_groups(); + auto reduction_kind = MatchReductionComputation(instr->to_apply()); + CHECK(reduction_kind.has_value()); + config.reduction_kind = reduction_kind.value(); + + if (instr->channel_id().has_value()) { + config.collective_op_kind = RendezvousKey::kCrossModule; + config.op_id = instr->channel_id().value(); + } else { + config.collective_op_kind = RendezvousKey::kCrossReplica; + config.op_id = static_cast(instr->GetModule()->unique_id()); + } + config.aux_data = std::make_unique(); + return config; +} + /*static*/ bool NcclAllReduceThunk::CanImplement(const HloInstruction* crs) { auto operands_are_supported = [crs]() { return absl::c_all_of(crs->operands(), [](HloInstruction* operand) { @@ -541,14 +570,12 @@ NcclAllReduceThunk::DevicesWithOpenNcclChannels() { } NcclAllReduceThunk::NcclAllReduceThunk( - ThunkInfo thunk_info, int64 replica_count, + ThunkInfo thunk_info, NcclAllReduceConfig&& config, std::vector buffers) : Thunk(Thunk::kNcclAllReduce, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), - replica_count_(replica_count), - buffers_(std::move(buffers)), - aux_data_(absl::make_unique()) { - CHECK_EQ(hlo_instruction_->operand_count(), buffers_.size()); + config_(std::move(config)), + buffers_(std::move(buffers)) { + CHECK_EQ(config_.operand_count, buffers_.size()); } // Figures out which devices (named by their replica-ids) are participating in @@ -558,7 +585,6 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); - auto* instr = Cast(hlo_instruction_); int64 local_device_ordinal = params.stream->parent()->device_ordinal(); GlobalDeviceId global_device_id; if (params.gpu_global_device_ids) { @@ -574,10 +600,10 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { // the same collective group as the caller. TF_ASSIGN_OR_RETURN( std::vector global_participating_replicas, - GetParticipatingReplicas(global_device_id, instr->replica_groups(), - replica_count_, *params.device_assn)); + GetParticipatingReplicas(global_device_id, config_.replica_groups, + config_.replica_count, *params.device_assn)); if (IsGlobalNcclConfig() && - global_participating_replicas.size() != replica_count_) { + global_participating_replicas.size() != config_.replica_count) { return InvalidArgument( "Partial replica groups are not allowed when using NCCL_COMM_ID " "environment configuration."); @@ -605,10 +631,10 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { } absl::c_sort(global_devices); - // Find or create the rendezvous for this collective operation. 
- RendezvousKey rendezvous_key = RendezvousKey::FromInstruction( - params.run_id, global_devices, local_devices.size(), hlo_instruction_); - + // Create the rendezvous for this collective operation. + RendezvousKey rendezvous_key(params.run_id, global_devices, + local_devices.size(), config_.collective_op_kind, + config_.op_id); if (VLOG_IS_ON(2)) { std::vector local_participants; for (const auto& entry : local_devices) { @@ -633,15 +659,12 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { params.buffer_allocations->GetDeviceAddress(buffer.source_buffer); pbuffer.destination_data = params.buffer_allocations->GetDeviceAddress(buffer.destination_buffer); - pbuffer.primitive_type = - hlo_instruction_->operand(i)->shape().element_type(); + pbuffer.primitive_type = config_.operand_element_type[i]; participant.buffers.push_back(pbuffer); } participant.local_devices = std::move(local_devices); participant.nccl_unique_id_callback = params.nccl_unique_id_callback; - auto reduction_kind = MatchReductionComputation(hlo_instruction_->to_apply()); - CHECK(reduction_kind.has_value()); - participant.reduction_kind = *reduction_kind; + participant.reduction_kind = config_.reduction_kind; auto rendezvous_factory = [](const RendezvousKey& k) { return absl::make_unique(k); @@ -658,13 +681,11 @@ Status NcclAllReduceThunk::ExecuteOnStream(const ExecuteParams& params) { // Keep the clique we used alive for as long as this Thunk lives. Creating // new NCCL cliques is expensive, and this is how we avoid thrashing them. { - tensorflow::mutex_lock lock(aux_data_->mu); - aux_data_->cliques.insert(std::move(clique)); + tensorflow::mutex_lock lock(config_.aux_data->mu); + config_.aux_data->cliques.insert(std::move(clique)); } return Status::OK(); } -NcclAllReduceThunk::~NcclAllReduceThunk() {} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h index cbd4fd3aa51..20e4adef7b1 100644 --- a/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/nccl_all_reduce_thunk.h @@ -18,11 +18,13 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/collective_ops_utils.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/thunk.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -30,6 +32,30 @@ limitations under the License. namespace xla { namespace gpu { +struct NcclAllReduceConfig { + int64 operand_count; + std::vector operand_element_type; + int64 replica_count; + std::vector replica_groups; + ReductionKind reduction_kind; + RendezvousKey::CollectiveOpKind collective_op_kind; + int64 op_id; + + NcclAllReduceConfig() = default; + NcclAllReduceConfig(NcclAllReduceConfig &&); + ~NcclAllReduceConfig(); + + // Extra data stored in NcclAllReduceThunk whose types we don't want exposed + // in the header file. 
(This is mainly because the implementation of + // NcclAllReduceThunk is different depending on whether CUDA is enabled in the + // build, and we don't want to expose *that* mess in the header.) + struct AuxData; + std::unique_ptr aux_data; +}; + +NcclAllReduceConfig GetNcclAllReduceConfig(const HloInstruction *instr, + int64 replica_count); + // Thunk that performs a NCCL-based All-Reduce among CUDA GPU-based replicas. class NcclAllReduceThunk : public Thunk { public: @@ -56,9 +82,8 @@ class NcclAllReduceThunk : public Thunk { BufferAllocation::Slice source_buffer; BufferAllocation::Slice destination_buffer; }; - NcclAllReduceThunk(ThunkInfo thunk_info, int64 replica_count, + NcclAllReduceThunk(ThunkInfo thunk_info, NcclAllReduceConfig &&config, std::vector buffers); - ~NcclAllReduceThunk() override; Status ExecuteOnStream(const ExecuteParams& params) override; @@ -67,16 +92,8 @@ class NcclAllReduceThunk : public Thunk { static bool CanImplement(const HloInstruction* crs); private: - // Extra data stored in NcclAllReduceThunk whose types we don't want exposed - // in the header file. (This is mainly because the implementation of - // NcclAllReduceThunk is different depending on whether CUDA is enabled in the - // build, and we don't want to expose *that* mess in the header.) - struct AuxData; - - const HloInstruction* hlo_instruction_; - const int64 replica_count_; + const NcclAllReduceConfig config_; const std::vector buffers_; - std::unique_ptr aux_data_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index eefa4661d37..77c54e48a70 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -198,6 +198,42 @@ absl::optional CanShareBufferHint(const HloInstruction* user, return absl::nullopt; } +// Try to load ptx from files defined in the FLAGS. If successful, return true. +bool MaybeLoadPtxFromFile(const HloModule* module, std::string* ptx) { + // If the xla_gpu_ptx_file options is set, be explicit when a file is used + // and warn when a file is not used to ease catching typo in filename. + std::string prefix = xla::FilenameFor(*module, "", *ptx); + std::string matched_filename; + for (const string& full_filename : + module->config().debug_options().xla_gpu_ptx_file()) { + // To ease comparing many PTX versions, accept different suffixes then + // the original filename. + auto filename = tensorflow::io::Basename(full_filename); + if (absl::StartsWith(filename, prefix)) { + matched_filename = full_filename; + VLOG(0) << "RunBackend() - Will load PTX from file: " << full_filename; + break; + } + } + if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && + matched_filename.empty()) { + VLOG(0) << "RunBackend() - For module with prefix '" << prefix + << "', we did not found a PTX file to load."; + } + + if (!matched_filename.empty()) { + std::ifstream ifs(matched_filename, std::ifstream::in); + *ptx = std::string(std::istreambuf_iterator(ifs), + std::istreambuf_iterator()); + CHECK(!ptx->empty()) << "Empty or non existing PTX file: " + << matched_filename; + return true; + } + return false; +} + +} // namespace + // Prints a warning if the ptx->sass JIT in the driver has known bugs. // // Using such a driver only a problem if we fail to use ptxas to compile our ptx @@ -238,42 +274,6 @@ void WarnIfBadDriverJITVersion() { }); } -// Try to load ptx from files defined in the FLAGS. If successful, return true. 
-bool MaybeLoadPtxFromFile(const HloModule* module, std::string* ptx) { - // If the xla_gpu_ptx_file options is set, be explicit when a file is used - // and warn when a file is not used to ease catching typo in filename. - std::string prefix = xla::FilenameFor(*module, "", *ptx); - std::string matched_filename; - for (const string& full_filename : - module->config().debug_options().xla_gpu_ptx_file()) { - // To ease comparing many PTX versions, accept different suffixes then - // the original filename. - auto filename = tensorflow::io::Basename(full_filename); - if (absl::StartsWith(filename, prefix)) { - matched_filename = full_filename; - VLOG(0) << "RunBackend() - Will load PTX from file: " << full_filename; - break; - } - } - if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && - matched_filename.empty()) { - VLOG(0) << "RunBackend() - For module with prefix '" << prefix - << "', we did not found a PTX file to load."; - } - - if (!matched_filename.empty()) { - std::ifstream ifs(matched_filename, std::ifstream::in); - *ptx = std::string(std::istreambuf_iterator(ifs), - std::istreambuf_iterator()); - CHECK(!ptx->empty()) << "Empty or non existing PTX file: " - << matched_filename; - return true; - } - return false; -} - -} // namespace - NVPTXCompiler::NVPTXCompiler() : GpuCompiler(stream_executor::cuda::kCudaPlatformId, nvptx::kTargetTriple, nvptx::kDataLayout) {} @@ -415,7 +415,9 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( "using $PATH.", hlo_module_config); } - } else { + } else if (maybe_cubin.status().code() != + tensorflow::error::Code::UNIMPLEMENTED) { + // If unimplemented is returned, we fallback to the driver. LOG(FATAL) << "ptxas returned an error during compilation of ptx " "to sass: '" << maybe_cubin.status() << "' " diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index e69be947522..3e19b35af19 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -30,6 +30,8 @@ limitations under the License. namespace xla { namespace gpu { +void WarnIfBadDriverJITVersion(); + // NVPTXCompiler generates efficient GPU executables for NVPTX target. class NVPTXCompiler : public GpuCompiler { public: diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc index 83066a4addf..6eef1b9f0b9 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.cc @@ -14,26 +14,34 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" + #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/outfeed_manager.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { namespace gpu { -OutfeedThunk::OutfeedThunk(ThunkInfo thunk_info, +OutfeedConfig GetOutfeedConfig(const HloInstruction* instr) { + OutfeedConfig config; + config.input_shape = instr->operand(0)->shape(); + return config; +} + +OutfeedThunk::OutfeedThunk(ThunkInfo thunk_info, OutfeedConfig&& config, ShapeTree outfeed_slices) : Thunk(Kind::kOutfeed, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), + config_(std::move(config)), outfeed_slices_(std::move(outfeed_slices)) {} Status OutfeedThunk::ExecuteOnStream(const ExecuteParams& params) { auto& stream = *params.stream; auto& buffer_allocations = *params.buffer_allocations; - VLOG(2) << "Outfeeding from GPU: " << hlo_instruction_->ToString(); + VLOG(2) << "Outfeeding from GPU"; auto op_profiler = params.profiler->MakeScopedInstructionProfiler(profile_index()); @@ -42,13 +50,12 @@ Status OutfeedThunk::ExecuteOnStream(const ExecuteParams& params) { outfeed_manager->BlockingGetNextDestination(); // Nothing to be done for empty tuples. - if (ShapeUtil::IsEmptyTuple(hlo_instruction_->operand(0)->shape())) { + if (ShapeUtil::IsEmptyTuple(config_.input_shape)) { return Status::OK(); } - CHECK(ShapeUtil::Compatible(hlo_instruction_->operand(0)->shape(), - outfeed_buffers->shape())) + CHECK(ShapeUtil::Compatible(config_.input_shape, outfeed_buffers->shape())) << "XLA program outfeed request of shape " - << hlo_instruction_->operand(0)->shape().ToString() + << config_.input_shape.ToString() << " did not match the runtime's outfeed buffer of shape " << outfeed_buffers->shape().ToString(); diff --git a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h index 9174e605783..60c64858ee7 100644 --- a/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/outfeed_thunk.h @@ -25,6 +25,12 @@ limitations under the License. namespace xla { namespace gpu { +struct OutfeedConfig { + Shape input_shape; +}; + +OutfeedConfig GetOutfeedConfig(const HloInstruction* instr); + // A thunk that outfeeds data. Data must be already resident on the host. This // thunk performs a host to device copy from the buffer allocated for the // outfeed op to the host location. @@ -32,7 +38,7 @@ class OutfeedThunk : public Thunk { public: // Constructs a OutfeedThunk that copies data to the host-side // outfeed queue from the buffers in the given shape tree. 
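[Editor's note] With the constructor change just below, callers build an OutfeedConfig first and hand it to the thunk. A hedged sketch of such a call site; the real one lives in the GPU IR emitter and is not part of this diff, and the ShapeTree template argument is inferred from the surrounding code.

    #include <memory>
    #include <utility>

    #include "absl/memory/memory.h"
    #include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h"

    namespace xla {
    namespace gpu {

    // Builds the thunk from a config instead of holding the HloInstruction.
    std::unique_ptr<OutfeedThunk> MakeOutfeedThunkSketch(
        Thunk::ThunkInfo thunk_info, const HloInstruction* outfeed,
        ShapeTree<BufferAllocation::Slice> outfeed_slices) {
      OutfeedConfig config = GetOutfeedConfig(outfeed);  // snapshots operand(0) shape
      return absl::make_unique<OutfeedThunk>(std::move(thunk_info),
                                             std::move(config),
                                             std::move(outfeed_slices));
    }

    }  // namespace gpu
    }  // namespace xla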
- OutfeedThunk(ThunkInfo thunk_info, + OutfeedThunk(ThunkInfo thunk_info, OutfeedConfig&& config, ShapeTree outfeed_slices); OutfeedThunk(const OutfeedThunk&) = delete; @@ -41,7 +47,7 @@ class OutfeedThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; + const OutfeedConfig config_; const ShapeTree outfeed_slices_; }; diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc index f9937ba77de..45c4f25d8e8 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/gpu/target_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -58,7 +59,8 @@ ParallelLoopEmitter::ParallelLoopEmitter( std::vector ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, - llvm::Type* index_type) { + llvm::Type* index_type, + llvm::Value* base_index) { // Emit the following code in LLVM IR: // linear_index = blockIdx.x * blockDim.x + threadIdx.x; // if (linear_index < num_elements) { @@ -75,7 +77,7 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, std::vector array_indices; llvm::Value* block_id = EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b_); - llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(), + llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_counts().x, static_cast(block_id)); block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id"); @@ -85,16 +87,17 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, // %ntid.x is currently specified as 1024. llvm::Value* thread_id = EmitCallToTargetIntrinsic(TargetIntrinsicID::kThreadIdx, {}, {}, b_); - llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(), + llvm_ir::AddRangeMetadata(0, launch_dimensions_.thread_counts_per_block().x, static_cast(thread_id)); thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id"); llvm::Value* linear_index_base = b_->CreateAdd( - b_->CreateMul(block_id, - llvm::ConstantInt::get( - index_type, launch_dimensions_.threads_per_block()), - "", - /*HasNUW=*/true, /*HasNSW=*/true), + b_->CreateMul( + block_id, + llvm::ConstantInt::get( + index_type, launch_dimensions_.thread_counts_per_block().x), + "", + /*HasNUW=*/true, /*HasNSW=*/true), thread_id, "linear_index", /*HasNUW=*/true, /*HasNSW=*/true); // Add an @llvm.assume(linear_index < threads_per_block * num_blocks). 
@@ -109,9 +112,9 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, llvm::Intrinsic::assume, {b_->CreateICmpULT( linear_index_base, - llvm::ConstantInt::get(index_type, - launch_dimensions_.threads_per_block() * - launch_dimensions_.block_count()), + llvm::ConstantInt::get( + index_type, launch_dimensions_.thread_counts_per_block().x * + launch_dimensions_.block_counts().x), "linear_index_in_range")}, {}, b_); @@ -121,6 +124,12 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, "linear_index_base", /*HasNUW=*/true, /*HasNSW=*/true); } + if (base_index != nullptr) { + linear_index_base = + b_->CreateAdd(linear_index_base, base_index, "linear_index_plus_base", + /*HasNUW=*/true, /*HasNSW=*/true); + } + array_indices.emplace_back(linear_index_base, shape_, b_); for (int i = 1; i < unroll_factor_; ++i) { llvm::Value* linear_index = @@ -146,5 +155,43 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, return array_indices; } +Status ParallelLoopEmitter::EmitLoop(absl::string_view loop_name, + llvm::Type* index_type) { + if (index_type == nullptr) { + index_type = b_->getInt64Ty(); + } + int64 total_threads = launch_dimensions_.launch_bound(); + int64 num_elements = ShapeUtil::ElementsIn(shape_); + // If all the elements are handled by the current threads, no need + // to add a loop inside the kernel. + if (total_threads * unroll_factor_ >= num_elements) { + VLOG(1) << "ParallelLoopEmitter::EmitLoop fallback"; + return LoopEmitter::EmitLoop(loop_name, index_type); + } + + KernelSupportLibrary ksl(b_, llvm_ir::UnrollMode::kDefaultUnroll); + auto constant = [&](int64 val) { + return llvm::ConstantInt::get(index_type, val); + }; + + TF_RETURN_IF_ERROR(ksl.ForWithStatus( + "loop", constant(0), constant(num_elements), + constant(total_threads * unroll_factor_), [&](llvm::Value* base_indvar) { + for (const llvm_ir::IrArray::Index& array_index : + EmitIndexAndSetExitBasicBlock(loop_name, index_type, + base_indvar)) { + TF_RETURN_IF_ERROR(body_emitter_(array_index)); + } + return Status::OK(); + })); + + // Set the insertion point of b_ to the loop exit, so that + // code emitted for later instructions will be correctly placed. + if (exit_bb_ != nullptr) { + b_->SetInsertPoint(exit_bb_); + } + return Status::OK(); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h index 0a6b5430b23..5e142ec3832 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.h @@ -57,7 +57,11 @@ class ParallelLoopEmitter : public llvm_ir::LoopEmitter { ~ParallelLoopEmitter() override = default; std::vector EmitIndexAndSetExitBasicBlock( - absl::string_view loop_name, llvm::Type* index_type) override; + absl::string_view loop_name, llvm::Type* index_type, + llvm::Value* base_index) override; + + Status EmitLoop(absl::string_view loop_name = "", + llvm::Type* index_type = nullptr); private: // The thread and block dimension to parallelize the loop on. 
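[Editor's note] When the launch covers fewer than (threads * unroll_factor) elements, the new ParallelLoopEmitter::EmitLoop above wraps the body in a strided loop instead of launching more blocks. A CPU-side model of the resulting iteration pattern, illustrative only, not the emitted IR.

    #include <cstdint>
    #include <functional>

    // The outer step matches EmitLoop's stride of total_threads * unroll;
    // each thread covers `unroll_factor` consecutive elements starting at
    // base + thread_id * unroll_factor, guarded against running off the end.
    void GridStrideLoopModel(int64_t global_thread_id, int64_t total_threads,
                             int64_t unroll_factor, int64_t num_elements,
                             const std::function<void(int64_t)>& body) {
      const int64_t step = total_threads * unroll_factor;
      for (int64_t base = 0; base < num_elements; base += step) {
        for (int64_t u = 0; u < unroll_factor; ++u) {
          const int64_t linear_index =
              base + global_thread_id * unroll_factor + u;
          if (linear_index < num_elements) {
            body(linear_index);
          }
        }
      }
    }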
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc index d7468a31377..7293b1485fc 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -209,16 +209,18 @@ StatusOr> CreateKernel( Status ExecuteKernelOnStream(const se::KernelBase& kernel, absl::Span args, - int64 threads_per_block, int64 block_count, - se::Stream* stream) { + const LaunchDimensions& dims, se::Stream* stream) { static constexpr int kKernelArgsLimit = 1024; auto kernel_args = absl::make_unique>(); for (const se::DeviceMemoryBase& buf : args) { kernel_args->add_device_memory_argument(buf); } - return stream->parent()->Launch(stream, se::ThreadDim(threads_per_block), - se::BlockDim(block_count), kernel, - *kernel_args); + LaunchDimensions::Dim3D thread_counts = dims.thread_counts_per_block(); + LaunchDimensions::Dim3D block_counts = dims.block_counts(); + return stream->parent()->Launch( + stream, se::ThreadDim(thread_counts.x, thread_counts.y, thread_counts.z), + se::BlockDim(block_counts.x, block_counts.y, block_counts.z), kernel, + *kernel_args); } se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config) { @@ -317,5 +319,35 @@ void InitializeBuffer(se::Stream* stream, PrimitiveType buffer_type, } } +StatusOr GetDNNConvKindFromCudnnConvKind( + CudnnConvKind kind) { + switch (kind) { + case CudnnConvKind::kBackwardFilter: + return se::dnn::BACKWARD_FILTER; + case CudnnConvKind::kBackwardInput: + return se::dnn::BACKWARD_DATA; + case CudnnConvKind::kForward: + return se::dnn::FORWARD; + default: + break; + } + return InternalError("Unexpected convolution kind"); +} + +StatusOr GetDNNDataTypeFromPrimitiveType( + PrimitiveType type) { + switch (type) { + case F16: + return se::dnn::ToDataType::value; + case F32: + return se::dnn::ToDataType::value; + case F64: + return se::dnn::ToDataType::value; + default: + break; + } + return InternalError("Unsupported convolution datatype"); +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h index 0a5e0e93a51..2b58496e05c 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h @@ -19,6 +19,8 @@ limitations under the License. #include "absl/strings/string_view.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/layout.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -71,8 +73,7 @@ StatusOr> CreateKernel( // Runs loaded kernel on the stream with the provided arguments. Status ExecuteKernelOnStream(const se::KernelBase& kernel, absl::Span args, - int64 threads_per_block, int64 block_count, - se::Stream* stream); + const LaunchDimensions& dims, se::Stream* stream); // Create GpuAsmOpts out of HloModuleConfig. 
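[Editor's note] ExecuteKernelOnStream now receives the full 3D LaunchDimensions and forwards them to se::ThreadDim and se::BlockDim unchanged. A small sketch of building the grouped-reduction launch shape used earlier in this diff; the function name and sizes are illustrative.

    #include "tensorflow/compiler/xla/service/gpu/launch_dimensions.h"

    namespace xla {
    namespace gpu {

    // blocks_x tiles the reduction input, num_groups rides on y (one
    // reduction group per blockIdx.y), and z stays unused.
    LaunchDimensions GroupedReductionLaunch(int64 blocks_x, int64 num_groups,
                                            int64 threads_per_block) {
      return LaunchDimensions({/*x=*/blocks_x, /*y=*/num_groups, /*z=*/1},
                              {/*x=*/threads_per_block, /*y=*/1, /*z=*/1});
    }

    }  // namespace gpu
    }  // namespace xla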
se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config); @@ -86,6 +87,10 @@ se::GpuAsmOpts PtxOptsFromConfig(const HloModuleConfig& hlo_module_config); void InitializeBuffer(se::Stream* stream, PrimitiveType buffer_type, int64* rng_state, se::DeviceMemoryBase buffer); +StatusOr GetDNNConvKindFromCudnnConvKind( + CudnnConvKind kind); +StatusOr GetDNNDataTypeFromPrimitiveType(PrimitiveType type); + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index 809b277317f..681e025ba1f 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -4,6 +4,8 @@ # TODO(jlebar): None of these tests actually use the GPU, so they should not # need to run on machines with GPUs present. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") load( @@ -219,6 +221,28 @@ tf_cc_test( ], ) +tf_cc_test( + name = "parallel_reduction_test", + srcs = [ + "parallel_reduction_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "gpu_copy_test", srcs = ["gpu_copy_test.cc"], @@ -375,6 +399,8 @@ tf_cc_test( ":gpu_codegen_test", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gpu_fusible", + "//tensorflow/compiler/xla/service/gpu:instruction_fusion", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", "//tensorflow/core:test_main", @@ -528,8 +554,15 @@ filegroup( # Binary with only the thunks dialect registered, for testing purposes. tf_cc_binary( name = "xla-thunks-opt", + srcs = ["xla_thunks_opt.cc"], deps = [ - "//tensorflow/compiler/mlir:tf_mlir_opt_main", - "//tensorflow/compiler/xla/service/gpu:xla_thunks_dialect_registration", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/xla/service/gpu:xla_thunks_ops", + "//tensorflow/core:lib", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Shape", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc index 674b436a8e3..811705d2b17 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_fusion_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include +#include "tensorflow/compiler/xla/service/gpu/gpu_fusible.h" +#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" @@ -54,6 +56,37 @@ TEST_F(GpuFusionTest, FusedReshape) { )"); } +// Check that we limit the number of operands to fusions we create. +TEST_F(GpuFusionTest, FusedBiggerThenThresholdButDoNotChangeTheFusionl) { + constexpr int64 kNumParams = kMaxOperandsAndOutputsPerFusion + 1; + + // Compute + // p0 + p1 + p2 + ... + pn, + // Use so many parameters that they do not fit into one fusion. + auto module = CreateNewVerifiedModule(); + HloComputation::Builder b(TestName()); + Shape input_shape = ShapeUtil::MakeShape(F32, {10, 100}); + Shape slice_shape = ShapeUtil::MakeShape(F32, {10, 2}); + Shape concat_shape = ShapeUtil::MakeShape(F32, {10, 2 * kNumParams}); + HloInstruction* input = + b.AddInstruction(HloInstruction::CreateParameter(0, input_shape, "p")); + + std::vector slice_params; + for (int64 i = 0; i < kNumParams; ++i) { + slice_params.push_back(b.AddInstruction(HloInstruction::CreateSlice( + slice_shape, input, {0, 0}, {10, 2}, {1, 1}))); + } + b.AddInstruction( + HloInstruction::CreateConcatenate(concat_shape, slice_params, 1)); + module->AddEntryComputation(b.Build()); + EXPECT_TRUE(GpuInstructionFusion(false).Run(module.get()).ValueOrDie()); + EXPECT_TRUE(module->entry_computation()->root_instruction()->opcode() == + HloOpcode::kFusion); + for (HloInstruction* instr : module->entry_computation()->instructions()) { + EXPECT_TRUE(instr->opcode() != HloOpcode::kSlice); + } +} + } // namespace } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc index d1bece038e0..6ed378adfeb 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -833,7 +833,7 @@ TEST_F(GpuKernelTilingTest, RowReductionCorrectShmemUsage) { )"; auto hlo_module = ParseAndReturnVerifiedModule(kHloString).ValueOrDie(); auto expected_ir = R"( -; CHECK: shared_cache_{{[0-9]*}} = private addrspace({{[0-9]*}}) global [1 x [32 x float]] +; CHECK: shared_cache_{{[0-9]*}} = private unnamed_addr addrspace({{[0-9]*}}) global [1 x [32 x float]] )"; CompileAndVerifyIr(std::move(hlo_module), expected_ir, /*match_optimized_ir=*/true); diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc index 2f139563b4a..200829efddb 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_unrolling_test.cc @@ -148,8 +148,8 @@ TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedSine) { HloModule test_module ENTRY SineFunc { - p0 = f32[160000]{0} parameter(0) - ROOT s = f32[160000]{0} sine(p0) + p0 = f32[1600000]{0} parameter(0) + ROOT s = f32[1600000]{0} sine(p0) })"; auto hlo_module = ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie(); @@ -182,8 +182,8 @@ TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedCosine) { HloModule test_module ENTRY SineFunc { - p0 = f32[160000]{0} parameter(0) - ROOT s = f32[160000]{0} cosine(p0) + p0 = f32[1600000]{0} parameter(0) + ROOT s = f32[1600000]{0} cosine(p0) })"; auto hlo_module = 
ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie(); @@ -216,8 +216,8 @@ TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedPower) { HloModule test_module ENTRY SineFunc { - p0 = f32[160000]{0} parameter(0) - ROOT s = f32[160000]{0} power(p0, p0) + p0 = f32[1600000]{0} parameter(0) + ROOT s = f32[1600000]{0} power(p0, p0) })"; auto hlo_module = ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie(); @@ -241,8 +241,8 @@ TEST_F(GpuUnrollingTest, DisabledUnrollUnfusedAtan2) { HloModule test_module ENTRY SineFunc { - p0 = f32[160000]{0} parameter(0) - ROOT s = f32[160000]{0} atan2(p0, p0) + p0 = f32[16000000]{0} parameter(0) + ROOT s = f32[16000000]{0} atan2(p0, p0) })"; auto hlo_module = ParseAndReturnVerifiedModule(kUnfusedAddModule, config).ValueOrDie(); diff --git a/tensorflow/compiler/xla/service/gpu/tests/parallel_reduction_test.cc b/tensorflow/compiler/xla/service/gpu/tests/parallel_reduction_test.cc new file mode 100644 index 00000000000..06e547dfe34 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/parallel_reduction_test.cc @@ -0,0 +1,190 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" + +namespace xla { +namespace gpu { + +namespace { + +class ParallelReductionTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test contains a MOF fusion and the XLA optimizer passes + // don't like this. 
+ debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(ParallelReductionTest, TwoParallelReductions) { + const char* hlo_text = R"( +HloModule TwoParallelReductions + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param0 = f32[1024] parameter(0) + %param1 = f32[1024] parameter(1) + %constant0 = f32[] constant(0) + %reduce1 = f32[] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32 + %reduce2 = f32[] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32 + ROOT %tuple = (f32[], f32[]) tuple(%reduce1, %reduce2) +} + +ENTRY %cluster { + %param0 = f32[1024] parameter(0) + %param1 = f32[1024] parameter(1) + ROOT %fusion = (f32[], f32[]) + fusion(%param0, %param1), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr hlo_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndVerifyIr(std::move(hlo_module), + R"( +CHECK: reduce-group-0 +CHECK: reduce-group-1 +CHECK-NOT: reduce-group-2 +)", + /*match_optimized_ir=*/false); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ParallelReductionTest, ManyParallelReductions) { + std::unique_ptr module = CreateNewVerifiedModule(); + // Simply use a number not too large to avoid long compilation time + // and not too small for meaningful test. + const size_t num_reduces = 32; + + HloComputation* reduce_computation; + { + auto embedded_builder = HloComputation::Builder("add"); + HloInstruction* lhs = + embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {}), "lhs")); + HloInstruction* rhs = + embedded_builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {}), "rhs")); + embedded_builder.AddInstruction( + HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs)); + reduce_computation = + module->AddEmbeddedComputation(embedded_builder.Build()); + } + + Shape input_shape = ShapeUtil::MakeShape(F32, {1024}); + Shape output_shape = ShapeUtil::MakeShape(F32, {}); + HloComputation* fusion_computation; + { + auto fusion_builder = HloComputation::Builder("fusion_computation"); + std::vector outputs; + HloInstruction* constant = fusion_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + for (size_t i = 0; i < num_reduces; ++i) { + HloInstruction* param = fusion_builder.AddInstruction( + HloInstruction::CreateParameter(i, input_shape, "param")); + HloInstruction* output = + fusion_builder.AddInstruction(HloInstruction::CreateReduce( + output_shape, param, constant, {0}, reduce_computation)); + outputs.push_back(output); + } + fusion_builder.AddInstruction(HloInstruction::CreateTuple(outputs)); + fusion_computation = module->AddEmbeddedComputation(fusion_builder.Build()); + } + + HloComputation::Builder b(TestName()); + std::vector entry_params; + std::vector output_shapes; + for (size_t i = 0; i < num_reduces; ++i) { + HloInstruction* param = b.AddInstruction( + HloInstruction::CreateParameter(i, input_shape, "param")); + entry_params.push_back(param); + output_shapes.push_back(output_shape); + } + b.AddInstruction(HloInstruction::CreateFusion( + ShapeUtil::MakeTupleShape(output_shapes), + HloInstruction::FusionKind::kInput, entry_params, fusion_computation)); + module->AddEntryComputation(b.Build()); + + EXPECT_TRUE(RunAndCompare(std::move(module), ErrorSpec{1e-5, 1e-5})); +} + +TEST_F(ParallelReductionTest, ThreeReductionGroups) { + 
const char* hlo_text = R"( +HloModule ThreeReductionGroups + +%add_f32 { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(%x, %y) +} + +%fused_computation { + %param0 = f32[1024,128] parameter(0) + %param1 = f32[1024,128] parameter(1) + %param2 = f32[1024,128] parameter(2) + %constant0 = f32[] constant(0) + // %mul0, %reduce0, and %reduce1 should go into a group. + %broadcast0 = f32[1024,128] broadcast(%constant0), dimensions={} + %mul0 = f32[1024,128] multiply(param0, broadcast0) + %reduce0 = f32[128] reduce(%mul0, %constant0), dimensions={0}, to_apply=%add_f32 + %reduce1 = f32[128] reduce(%param0, %constant0), dimensions={0}, to_apply=%add_f32 + // %reduce2 and %reduce3 should go into another group. + %reduce2 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32 + %reduce3 = f32[128] reduce(%param1, %constant0), dimensions={0}, to_apply=%add_f32 + // %reduce4 and %mul2 should go into the other group, although broadcast0 is + // reused. + %mul1 = f32[1024,128] multiply(param2, broadcast0) + %reduce4 = f32[128] reduce(%mul1, %constant0), dimensions={0}, to_apply=%add_f32 + %mul2 = f32[1024,128] multiply(param2, param2) + ROOT %tuple = + (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128]) + tuple(%mul2, %reduce0, %reduce4, %reduce3, %reduce2, %reduce1, %mul0) +} + +ENTRY %cluster { + %param0 = f32[1024,128] parameter(0) + %param1 = f32[1024,128] parameter(1) + %param2 = f32[1024,128] parameter(2) + ROOT %fusion = + (f32[1024, 128], f32[128], f32[128], f32[128], f32[128], f32[128], f32[1024, 128]) + fusion(%param0, %param1, %param2), kind=kInput, calls=%fused_computation +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr hlo_module, + ParseAndReturnVerifiedModule(hlo_text)); + CompileAndVerifyIr(std::move(hlo_module), + R"( +CHECK: reduce-group-0 +CHECK: reduce-group-1 +CHECK: reduce-group-2 +CHECK-NOT: reduce-group-3 +)", + /*match_optimized_ir=*/false); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc index 215c2e627ae..5f97452ff71 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_vectorization_test.cc @@ -336,8 +336,17 @@ ENTRY %cluster { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, ParseAndReturnVerifiedModule(hlo_text)); - CompileAndOptionallyVerifyPtx(std::move(optimized_module), - R"( + const se::DeviceDescription& device_description = + backend().default_stream_executor()->GetDeviceDescription(); + int cc_major = 0, cc_minor = 0; + device_description.cuda_compute_capability(&cc_major, &cc_minor); + + string expected; + if (cc_major < 6) { + // We do not vectorize for GPU before Pascal. 
+ expected = "CHECK-NOT: ld.global.nc.v2.f32"; + } else { + expected = R"( CHECK: ld.global.nc.v2.f32 CHECK: st.global.v2.f32 CHECK: st.global.v2.f32 @@ -350,7 +359,9 @@ CHECK: st.global.v2.f32 CHECK: ld.global.nc.v2.f32 CHECK: st.global.v2.f32 CHECK: st.global.v2.f32 -)"); +)"; + } + CompileAndOptionallyVerifyPtx(std::move(optimized_module), expected); EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); } diff --git a/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo b/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo index c9e7daeb3bc..f625abe6612 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo +++ b/tensorflow/compiler/xla/service/gpu/tests/scatter.hlo @@ -1,6 +1,6 @@ // RUN: hlo_to_llvm_ir %s | FileCheck %s -// CHECK-LABEL: define void @scatter_TensorFlowScatterV1(i8* noalias align 64 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(36) %alloc1, i8* noalias align 16 dereferenceable(24) %alloc2, i8* noalias align 16 dereferenceable(8) %alloc3) { +// CHECK-LABEL: define void @scatter_TensorFlowScatterV1(i8* noalias align 16 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(24) %alloc1, i8* noalias align 16 dereferenceable(8) %alloc2) { // CHECK: entry: // CHECK: %[[VAL_32:.*]] = alloca i32, align 4 // CHECK: %[[VAL_0:.*]] = getelementptr inbounds i8, i8* %[[VAL_1:.*]], i64 0 @@ -43,8 +43,8 @@ // CHECK: store atomic i32 %[[VAL_36]], i32* %[[VAL_31]] unordered, align 4 // CHECK: br label %[[VAL_23]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"reqntidx", i32 6} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatterV1, !"reqntidx", i32 6} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{i32 0, i32 6} // CHECK: !4 = !{} @@ -72,7 +72,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_ScatterIntoScalar(i8* noalias align 64 dereferenceable(4) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2, i8* noalias align 16 %alloc3) { +// CHECK-LABEL: define void @scatter_ScatterIntoScalar(i8* noalias align 16 dereferenceable(4) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 %alloc2) { // CHECK: entry: // CHECK: %[[VAL_60:.*]] = alloca i32, align 4 // CHECK: %[[VAL_37:.*]] = getelementptr inbounds i8, i8* %[[VAL_38:.*]], i64 0 @@ -104,8 +104,8 @@ ENTRY main { // CHECK: store atomic i32 %[[VAL_62]], i32* %[[VAL_39]] unordered, align 4 // CHECK: br label %[[VAL_57]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"reqntidx", i32 1} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_ScatterIntoScalar, !"reqntidx", i32 1} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{} @@ -131,7 +131,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_TensorFlowScatter_Mul(i8* noalias align 64 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(36) %alloc1, i8* noalias align 16 dereferenceable(24) %alloc2, i8* noalias align 16 dereferenceable(8) %alloc3) { +// CHECK-LABEL: define void @scatter_TensorFlowScatter_Mul(i8* noalias align 
16 dereferenceable(36) %alloc0, i8* noalias align 16 dereferenceable(24) %alloc1, i8* noalias align 16 dereferenceable(8) %alloc2) { // CHECK: %[[VAL_63:.*]] = alloca i32, align 4 // CHECK: %[[VAL_64:.*]] = alloca i32, align 4 // CHECK: %[[VAL_98:.*]] = alloca i32, align 4 @@ -188,8 +188,8 @@ ENTRY main { // CHECK: %[[VAL_109:.*]] = extractvalue { i32, i1 } %[[VAL_107]], 1 // CHECK: br i1 %[[VAL_109]], label %[[VAL_96]], label %[[VAL_104]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"reqntidx", i32 6} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_TensorFlowScatter_Mul, !"reqntidx", i32 6} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{i32 0, i32 6} // CHECK: !4 = !{} @@ -216,7 +216,7 @@ ENTRY main { // ----- -// CHECK-LABEL: define void @scatter_ScalarUpdate(i8* noalias align 64 dereferenceable(16) %alloc0, i8* noalias align 16 dereferenceable(16) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2, i8* noalias align 16 dereferenceable(4) %alloc3) { +// CHECK-LABEL: define void @scatter_ScalarUpdate(i8* noalias align 16 dereferenceable(16) %alloc0, i8* noalias align 16 dereferenceable(4) %alloc1, i8* noalias align 16 dereferenceable(4) %alloc2) { // CHECK: entry: // CHECK: %[[VAL_146:.*]] = alloca i32, align 4 // CHECK: %[[VAL_118:.*]] = getelementptr inbounds i8, i8* %[[VAL_119:.*]], i64 0 @@ -253,8 +253,8 @@ ENTRY main { // CHECK: store atomic i32 %[[VAL_148]], i32* %[[VAL_145]] unordered, align 4 // CHECK: br label %[[VAL_138]] // CHECK: !nvvm.annotations = !{!0, !1} -// CHECK: !0 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScalarUpdate, !"kernel", i32 1} -// CHECK: !1 = !{void (i8*, i8*, i8*, i8*)* @scatter_ScalarUpdate, !"reqntidx", i32 1} +// CHECK: !0 = !{void (i8*, i8*, i8*)* @scatter_ScalarUpdate, !"kernel", i32 1} +// CHECK: !1 = !{void (i8*, i8*, i8*)* @scatter_ScalarUpdate, !"reqntidx", i32 1} // CHECK: !2 = !{i32 0, i32 1} // CHECK: !3 = !{} diff --git a/tensorflow/compiler/xla/service/gpu/tests/xla_thunks_opt.cc b/tensorflow/compiler/xla/service/gpu/tests/xla_thunks_opt.cc new file mode 100644 index 00000000000..97c3b3a5bde --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/xla_thunks_opt.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 Google Inc. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/xla/service/gpu/ir/xla_thunks_ops.h" +#include "tensorflow/core/platform/init_main.h" + +int main(int argc, char **argv) { + tensorflow::InitMlir y(&argc, &argv); + + mlir::registerAllPasses(); + + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); + registry.insert(); + registry.insert(); + return failed( + mlir::MlirOptMain(argc, argv, "XLA-Thunk pass driver\n", registry)); +} diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 7a9fedec629..64b685db379 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -69,10 +69,6 @@ class Thunk { }; struct ThunkInfo { - // Optional. It's only used by subclasses which haven't been migrated away - // from HloInstructions. Once the migration is done, Thunks should be fully - // serializable. - const HloInstruction* hlo_instruction = nullptr; absl::optional profile_index; std::string profile_annotation; }; diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc index 690d0c9de56..4c6c5bb846d 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc @@ -19,9 +19,11 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_runner.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h" #include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" @@ -37,6 +39,65 @@ limitations under the License. namespace xla { namespace gpu { +namespace { +void CheckBatchNormInputOutputPrimitivetypeAreValid(const HloInstruction* hlo) { + // All input and output statistics variables must be F32. Also, the last + // operand for CudnnBatchNormForwardInference, CudnnBatchNormForwardTraining, + // and CudnnBatchNormBackward is the feature_index which must be S64. 
+ // The allowed types for non-statistics variables are as follows: + // CudnnBatchNormForwardInference: + // operand[0]: {half, float} + // out[0]: {half, float} + // CudnnBatchNormForwardTraining: + // operand[0]: {half, float} + // out[0]: {half, float} + // CudnnBatchNormBackward: + // operand[0]: {half, float} + // operand[4]: {half, float} + // out[0]: {half, float} + // Note non-statistics inputs and outputs mentioned above should be of the + // same type. + + // Check Inputs. + int64 num_operands = hlo->operand_count(); + PrimitiveType operand_primitive_type = + hlo->operand(0)->shape().element_type(); + CHECK(operand_primitive_type == F16 || operand_primitive_type == F32) + << "Not yet implemented"; + + for (int i = 1; i < num_operands - 2; i++) { + if (hlo->custom_call_target() == kCudnnBatchNormBackwardCallTarget && + i == 4) { + // The first operand to batchnorm grad is the input and the 4th operand is + // the grad_output, both of which can be Eigen::half. + CHECK_EQ(hlo->operand(i)->shape().element_type(), operand_primitive_type) + << "Invalid datatype"; + continue; + } + CHECK_EQ(hlo->operand(i)->shape().element_type(), F32) + << "Not yet implemented"; + } + + // The last operand is the feature index which must be int64. + CHECK_EQ(hlo->operand(num_operands - 1)->shape().element_type(), S64) + << "Not yet implemented"; + + // Check Outputs. + if (hlo->shape().IsTuple()) { + CHECK_EQ(hlo->shape().tuple_shapes(0).element_type(), + operand_primitive_type) + << "Invalid datatype"; + + for (int j = 1; j < hlo->shape().tuple_shapes_size(); j++) { + CHECK_EQ(hlo->shape().tuple_shapes(j).element_type(), F32) + << "Not yet implemented"; + } + } else { + CHECK_EQ(hlo->shape().element_type(), operand_primitive_type) + << "Invalid datatype"; + } +} +} // namespace std::unique_ptr ThunkEmitter::BuildFftThunk(const HloInstruction* inst) { const HloInstruction* operand = inst->operand(0); return absl::make_unique( @@ -72,15 +133,14 @@ std::unique_ptr ThunkEmitter::BuildTriangularSolveThunk( std::unique_ptr ThunkEmitter::BuildGemmThunk( const HloInstruction* inst) { - auto config_or = inst->backend_config(); - GemmBackendConfig gemm_config = std::move(config_or.ValueOrDie()); + GpuGemmConfig config = GetGpuGemmConfig(inst); const HloInstruction* lhs = inst->operand(0); const HloInstruction* rhs = inst->operand(1); // The bias is passed inside the output buffer. If those buffers are shared // we can just use it, otherwise copy the bias values into the output buffer // first. - if (gemm_config.beta() != 0.0) { + if (config.backend_config.beta() != 0.0) { const HloInstruction* bias = inst->operand(2); CHECK_EQ(bias->shape(), inst->shape()); if (GetAllocationSlice(*bias) != GetAllocationSlice(*inst)) { @@ -91,22 +151,22 @@ std::unique_ptr ThunkEmitter::BuildGemmThunk( /*destination_buffer=*/GetAllocationSlice(*inst), /*mem_size=*/ShapeUtil::ByteSizeOf(inst->shape()))); thunks.push_back(absl::make_unique( - context_->GetThunkInfo(inst), + context_->GetThunkInfo(inst), std::move(config), GetAllocationSlice(*lhs), // The buffer assigned to LHS. GetAllocationSlice(*rhs), // The buffer assigned to RHS. GetAllocationSlice(*inst), // The output buffer. 
- /*implements_whole_instruction=*/false, std::move(gemm_config))); + /*implements_whole_instruction=*/false)); return absl::make_unique(context_->GetThunkInfo(inst), std::move(thunks)); } } return absl::make_unique( - context_->GetThunkInfo(inst), + context_->GetThunkInfo(inst), std::move(config), GetAllocationSlice(*lhs), // The buffer assigned to LHS. GetAllocationSlice(*rhs), // The buffer assigned to RHS. GetAllocationSlice(*inst), // The output buffer. - /*implements_whole_instruction=*/true, std::move(gemm_config)); + /*implements_whole_instruction=*/true); } std::unique_ptr ThunkEmitter::BuildInfeedThunk( @@ -133,8 +193,9 @@ std::unique_ptr ThunkEmitter::BuildOutfeedThunk( *slice = status_or_slice.ValueOrDie(); } }); + OutfeedConfig config = GetOutfeedConfig(inst); return absl::make_unique(context_->GetThunkInfo(inst), - std::move(slices)); + std::move(config), std::move(slices)); } Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { @@ -154,16 +215,20 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { CHECK(feature_index->IsConstant()); int64 feature_index_value = feature_index->literal().Get({}); + CHECK_EQ(custom_call->shape().tuple_shapes_size(), 3); + CHECK(LayoutUtil::LayoutsInShapesEqual(custom_call->shape().tuple_shapes(0), + custom_call->operand(0)->shape())); + CheckBatchNormInputOutputPrimitivetypeAreValid(custom_call); + CudnnBatchNormConfig config = GetCudnnBatchNormConfig( + custom_call, epsilon_value, feature_index_value); AddThunkToThunkSequence( absl::make_unique( - context_->GetThunkInfo(custom_call), + context_->GetThunkInfo(custom_call), std::move(config), /*operand=*/GetAllocationSlice(*custom_call->operand(0)), /*scale=*/GetAllocationSlice(*custom_call->operand(1)), /*offset=*/GetAllocationSlice(*custom_call->operand(2)), /*mean=*/GetAllocationSlice(*custom_call->operand(3)), /*variance=*/GetAllocationSlice(*custom_call->operand(4)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, /*output=*/GetAllocationSlice(*custom_call))); return Status::OK(); } @@ -183,14 +248,14 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { auto output_data = GetAllocationSlice(*custom_call, {0}); auto output_mean = GetAllocationSlice(*custom_call, {1}); auto output_inv_stddev = GetAllocationSlice(*custom_call, {2}); + CudnnBatchNormConfig config = GetCudnnBatchNormConfig( + custom_call, epsilon_value, feature_index_value); AddThunkToThunkSequence( absl::make_unique( - context_->GetThunkInfo(custom_call), + context_->GetThunkInfo(custom_call), std::move(config), /*operand=*/GetAllocationSlice(*custom_call->operand(0)), /*scale=*/GetAllocationSlice(*custom_call->operand(1)), /*offset=*/GetAllocationSlice(*custom_call->operand(2)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, /*output_data=*/output_data, /*output_mean=*/output_mean, /*output_inv_stddev=*/output_inv_stddev, @@ -212,15 +277,22 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { auto output_grad_data = GetAllocationSlice(*custom_call, {0}); auto output_grad_scale = GetAllocationSlice(*custom_call, {1}); auto output_grad_offset = GetAllocationSlice(*custom_call, {2}); + CHECK_EQ(custom_call->shape().tuple_shapes_size(), 3); + CHECK(LayoutUtil::LayoutsInShapesEqual(custom_call->shape().tuple_shapes(0), + custom_call->operand(0)->shape())); + CHECK(LayoutUtil::LayoutsInShapesEqual(custom_call->shape().tuple_shapes(0), + custom_call->operand(4)->shape())); + 
CheckBatchNormInputOutputPrimitivetypeAreValid(custom_call); + + CudnnBatchNormConfig config = GetCudnnBatchNormConfig( + custom_call, epsilon_value, feature_index_value); AddThunkToThunkSequence(absl::make_unique( - context_->GetThunkInfo(custom_call), + context_->GetThunkInfo(custom_call), std::move(config), /*operand=*/GetAllocationSlice(*custom_call->operand(0)), /*scale=*/GetAllocationSlice(*custom_call->operand(1)), /*mean=*/GetAllocationSlice(*custom_call->operand(2)), /*inv_stddev=*/GetAllocationSlice(*custom_call->operand(3)), /*grad_output=*/GetAllocationSlice(*custom_call->operand(4)), - /*epsilon=*/epsilon_value, - /*feature_index=*/feature_index_value, /*output_grad_data=*/output_grad_data, /*output_grad_scale=*/output_grad_scale, /*output_grad_offset=*/output_grad_offset, @@ -238,9 +310,13 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { auto conv_result_slice = GetAllocationSlice(*custom_call, {0}); auto scratch_slice = GetAllocationSlice(*custom_call, {1}); + TF_ASSIGN_OR_RETURN( + GpuConvConfig config, + GetGpuConvConfig(Cast(custom_call))); AddThunkToThunkSequence(absl::make_unique( - context_->GetThunkInfo(custom_call), std::move(operand_slices), - conv_result_slice, scratch_slice, tuple_result_slice)); + context_->GetThunkInfo(custom_call), std::move(config), + std::move(operand_slices), conv_result_slice, scratch_slice, + tuple_result_slice)); return Status::OK(); } @@ -310,11 +386,26 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { return slices; }; std::vector> operand_slices; - for (const auto* operand : custom_call->operands()) { + for (int64 i = 0; i < custom_call->operand_count(); i++) { + const auto* operand = custom_call->operand(i); operand_slices.push_back(get_slices_for_instr(operand)); + const auto& s1 = operand_slices.back().shape(); + const auto& s2 = operand->shape(); + CHECK(ShapeUtil::Equal(s1, s2)) << absl::StreamFormat( + "Shape mismatch between operand shape and " + "slice shape for operand %d: %s vs %s", + i, s1.ToString(), s2.ToString()); } ShapeTree result_slices = get_slices_for_instr(custom_call); + CHECK(ShapeUtil::Equal(custom_call->shape(), result_slices.shape())) + << absl::StreamFormat( + "Shape mismatch between instr->shape() and " + "result_slices.shape(): " + "%s vs %s.", + custom_call->shape().ToString(), + result_slices.shape().ToString()); + AddThunkToThunkSequence(absl::make_unique( context_->GetThunkInfo(custom_call), call_target, std::move(operand_slices), std::move(result_slices), @@ -385,7 +476,6 @@ Thunk::ThunkInfo ThunkEmitter::EmissionContext::GetThunkInfo( const HloInstruction* hlo) const { CHECK(hlo); Thunk::ThunkInfo info; - info.hlo_instruction = hlo; info.profile_annotation = absl::StrFormat( "Thunk:#hlo_op=%s,hlo_module=%s#", hlo->name(), hlo->GetModule()->name()); return info; diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 792479df4ac..6397ad3bee0 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -27,9 +27,10 @@ WhileThunk::WhileThunk( ThunkInfo thunk_info, const BufferAllocation::Slice& condition_result_buffer_index, std::unique_ptr condition_thunk_sequence, - std::unique_ptr body_thunk_sequence) + std::unique_ptr body_thunk_sequence, + absl::optional condition_profile_index, + absl::optional body_profile_index) : Thunk(Kind::kWhile, thunk_info), - hlo_instruction_(thunk_info.hlo_instruction), 
condition_result_buffer_index_(condition_result_buffer_index), // Pass nullptr as the HloInstruction* to the condition_thunk_sequence_ // and body_thunk_sequence_ constructors because these SequentialThunks @@ -38,7 +39,9 @@ WhileThunk::WhileThunk( condition_thunk_sequence_(absl::make_unique( ThunkInfo(), std::move(*condition_thunk_sequence))), body_thunk_sequence_(absl::make_unique( - ThunkInfo(), std::move(*body_thunk_sequence))) {} + ThunkInfo(), std::move(*body_thunk_sequence))), + condition_profile_index_(condition_profile_index), + body_profile_index_(body_profile_index) {} Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { @@ -62,7 +65,7 @@ Status WhileThunk::ExecuteOnStream(const ExecuteParams& params) { profiler.StartHloComputation(); VLOG(3) << "Executing condition computation"; TF_RETURN_IF_ERROR(condition_thunk_sequence_->ExecuteOnStream(params)); - profiler.FinishHloComputation(hlo_instruction_->while_condition()); + profiler.FinishHloComputation(condition_profile_index_); // Copy the result of condition computation and break the loop if 'false'. bool condition_result; @@ -86,7 +89,7 @@ Status WhileThunk::ExecuteOnStream(const ExecuteParams& params) { // Invoke thunk sequence for while 'body' computation, and pass on // 'profiler' to measure the timing of the thunks in 'body_thunk_sequence_'. TF_RETURN_IF_ERROR(body_thunk_sequence_->ExecuteOnStream(params)); - profiler.FinishHloComputation(hlo_instruction_->while_body()); + profiler.FinishHloComputation(body_profile_index_); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 707bac15bb2..707edbdc192 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -42,7 +42,9 @@ class WhileThunk : public Thunk { WhileThunk(ThunkInfo thunk_info, const BufferAllocation::Slice& condition_result_buffer_index, std::unique_ptr condition_thunk_sequence, - std::unique_ptr body_thunk_sequence); + std::unique_ptr body_thunk_sequence, + absl::optional condition_profile_index, + absl::optional body_profile_index); WhileThunk(const WhileThunk&) = delete; WhileThunk& operator=(const WhileThunk&) = delete; @@ -51,10 +53,11 @@ class WhileThunk : public Thunk { Status ExecuteOnStream(const ExecuteParams& params) override; private: - const HloInstruction* hlo_instruction_; const BufferAllocation::Slice condition_result_buffer_index_; std::unique_ptr condition_thunk_sequence_; std::unique_ptr body_thunk_sequence_; + const absl::optional condition_profile_index_; + const absl::optional body_profile_index_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index c3a7b3a5c14..ac94b2e1d24 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -35,7 +35,7 @@ import "tensorflow/compiler/xla/xla_data.proto"; option cc_enable_arenas = true; // Serialization of HloInstruction. -// Next ID: 74 +// Next ID: 75 message HloInstructionProto { reserved 10; reserved "parameter_name"; @@ -232,6 +232,11 @@ message HloInstructionProto { // kCustomCall. bool custom_call_has_side_effect = 65; + // A list of CustomCallOutputOperandAliasing pairs that specifies aliasing + // buffers between output and operands for kCustomCall. 
+ repeated xla.CustomCallOutputOperandAliasing + custom_call_output_operand_aliasing = 74; + // The delta value for kRngGetAndUpdateState. int64 delta = 66; diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc index 384ae272dc1..cf09ddeec27 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis.cc @@ -308,6 +308,39 @@ class BufferValueMap { } } + void ComputeInPlaceOperationAliasedBuffers( + const HloValue& value, std::vector* aliased_buffers) { + VLOG(3) << "Compute aliases for in-place operations (e.g. " + "kDynamicUpdateSlice and kScatter)"; + for (const HloPosition& position : value.positions()) { + HloInstruction* instruction = position.instruction; + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instruction)) { + if (position.index == operand_and_output_index.second) { + const HloUse& operand = operand_and_output_index.first; + const HloValue& operand_value = dataflow_.GetUniqueValueAt( + instruction->operand(operand.operand_number), + operand.operand_index); + VLOG(3) << " operand value " << operand_value.ToShortString() + << " aliases."; + aliased_buffers->push_back(GetBufferForValue(operand_value)); + } + } + } + + for (const HloUse& use : value.uses()) { + for (const auto& operand_and_output_index : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(use.instruction)) { + if (use == operand_and_output_index.first) { + const HloValue& use_value = dataflow_.GetUniqueValueAt( + use.instruction, operand_and_output_index.second); + VLOG(3) << " use value " << use_value.ToShortString() << " aliases."; + aliased_buffers->push_back(GetBufferForValue(use_value)); + } + } + } + } + // Compute and return a vector of buffers that the given value must be // contained in due to HLO aliasing rules. std::vector ComputeAliasedBuffers(const HloValue& value) { @@ -318,6 +351,7 @@ class BufferValueMap { ComputeInputOutputAliasedBuffers(value, &aliased_buffers); ComputeWhileAliasedBuffers(value, &aliased_buffers); ComputeConditionalAliasedBuffers(value, &aliased_buffers); + ComputeInPlaceOperationAliasedBuffers(value, &aliased_buffers); // Uniquify aliased buffers. 
absl::c_sort(aliased_buffers); aliased_buffers.erase( diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index 2666cb0872d..5e94f1d173e 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -1062,6 +1062,118 @@ TEST_F(HloAliasAnalysisTest, MergeBuffersReverse) { analysis.BufferLivesOut(analysis.buffers()[0]); } +TEST_F(HloAliasAnalysisTest, DynamicUpdateSlice) { + Shape shape = ShapeUtil::MakeShape(F32, {8}); + Shape update_shape = ShapeUtil::MakeShape(F32, {4}); + Shape index_shape = ShapeUtil::MakeShape(S32, {}); + auto builder = HloComputation::Builder(TestName()); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, update_shape, "param1")); + auto param2 = builder.AddInstruction( + HloInstruction::CreateParameter(2, index_shape, "param2")); + auto copy0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, param0)); + auto dynamic_update_slice = builder.AddInstruction( + HloInstruction::CreateDynamicUpdateSlice(shape, copy0, param1, {param2})); + + module_->AddEntryComputation(builder.Build()); + SCOPED_TRACE(module_->ToString()); + + HloAliasAnalysis& analysis = RunAnalysis(); + + EXPECT_EQ(analysis.GetUniqueBufferAt(copy0), + analysis.GetUniqueBufferAt(dynamic_update_slice)); +} + +TEST_F(HloAliasAnalysisTest, DynamicUpdateSliceMultiOutputFusion) { + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + param1 = f32[1280,1,128] parameter(1) + param2 = f32[1280,1,128] parameter(2) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + add.1 = f32[1280,1,128] add(param0, param0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param1, broadcast.6, constant.3, constant.3, constant.3) + dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(param2, broadcast.6, constant.3, constant.3, constant.3) + ROOT tuple.1 = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) tuple(add.1, dynamic-update-slice.5, dynamic-update-slice.6) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + negate1 = f32[1280,1,128] negate(param) + negate2 = f32[1280,1,128] negate(param) + ROOT fusion = (f32[1280,1,128], f32[1280,1,128], f32[1280,1,128]) fusion(negate0, negate1, negate2), kind=kLoop, calls=fused_computation +} +)"; + TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(hlo_string)); + + SCOPED_TRACE(module_->ToString()); + + HloAliasAnalysis& analysis = RunAnalysis(); + LOG(INFO) << analysis.ToString(); + + // Expect negate1 and negate2 to alias with fusion{1} and fusion{2} + // respectively (due to DUS), but not negate0 and fusion{0}. 
+ const HloInstruction* fusion = + module_->entry_computation()->GetInstructionWithName("fusion"); + const HloInstruction* negate0 = + module_->entry_computation()->GetInstructionWithName("negate0"); + const HloInstruction* negate1 = + module_->entry_computation()->GetInstructionWithName("negate1"); + const HloInstruction* negate2 = + module_->entry_computation()->GetInstructionWithName("negate2"); + EXPECT_EQ(analysis.GetUniqueBufferAt(negate1), + analysis.GetUniqueBufferAt(fusion, {1})); + EXPECT_EQ(analysis.GetUniqueBufferAt(negate2), + analysis.GetUniqueBufferAt(fusion, {2})); + EXPECT_NE(analysis.GetUniqueBufferAt(negate0), + analysis.GetUniqueBufferAt(fusion, {0})); +} + +TEST_F(HloAliasAnalysisTest, ChainedDynamicUpdateSliceFusion) { + // CPU and GPU backends may generate fusions with dynamic update slices + // feeding each other. They expect the fusion to not be in-place if that is + // the case. + absl::string_view hlo_string = R"( +HloModule Module + +fused_computation { + param0 = f32[1280,1,128] parameter(0) + constant.1 = f32[] constant(0) + broadcast.6 = f32[128,1,128] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + dynamic-update-slice.5 = f32[1280,1,128] dynamic-update-slice(param0, broadcast.6, constant.3, constant.3, constant.3) + ROOT dynamic-update-slice.6 = f32[1280,1,128] dynamic-update-slice(dynamic-update-slice.5, broadcast.6, constant.3, constant.3, constant.3) +} + +ENTRY main { + param = f32[1280,1,128] parameter(0) + negate0 = f32[1280,1,128] negate(param) + ROOT fusion = f32[1280,1,128] fusion(negate0), kind=kLoop, calls=fused_computation +} +)"; + TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(hlo_string)); + + SCOPED_TRACE(module_->ToString()); + + HloAliasAnalysis& analysis = RunAnalysis(); + LOG(INFO) << analysis.ToString(); + + const HloInstruction* fusion = + module_->entry_computation()->GetInstructionWithName("fusion"); + const HloInstruction* negate0 = + module_->entry_computation()->GetInstructionWithName("negate0"); + EXPECT_NE(analysis.GetUniqueBufferAt(negate0), + analysis.GetUniqueBufferAt(fusion)); +} + TEST_F(HloAliasAnalysisTest, BitcastInterference) { // A bitcast value simultaneously live with its operand should not cause // interference. diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 14daf680ac9..6323d0903a4 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -93,10 +93,13 @@ HloComputation::HloComputation( } HloInstruction* HloComputation::AddInstruction( - std::unique_ptr instruction) { + std::unique_ptr instruction, const std::string& new_name) { CHECK(instruction->opcode() != HloOpcode::kParameter) << "Parameter instructions cannot be added to a computation after " << "it has been built"; + if (!new_name.empty()) { + instruction->SetAndSanitizeName(new_name); + } return AddInstructionInternal(std::move(instruction)); } @@ -315,6 +318,8 @@ Status HloComputation::RemoveInstructionImpl(HloInstruction* instruction, (*inst_it->second)->set_parent(nullptr); to_be_deleted_.emplace_back(inst_it->second->release()); to_be_deleted_.back()->DetachFromOperandsAndUsers(); + // Clear all operands to avoid Null operands. 
+ to_be_deleted_.back()->RemoveAllOperands(); to_be_deleted_.back()->MarkAsDead(); instructions_.erase(inst_it->second); instruction_iterators_.erase(inst_it); @@ -380,6 +385,9 @@ void HloComputation::ComputeInstructionPostOrder( dfs_stack.push_back(root); while (!dfs_stack.empty()) { const auto current = dfs_stack.back(); + CHECK_EQ(current->parent(), this) + << "Instruction " << current->name() + << " is not in the current computation (" << name() << ")."; auto it = visited->find(current); if (it != visited->end()) { if (it->second == kVisited) { @@ -836,8 +844,9 @@ ProgramShape HloComputation::ComputeProgramShape(bool include_ids) const { return program_shape; } -bool HloComputation::Equal(const HloComputation& other, - bool is_layout_sensitive) const { +bool HloComputation::EqualInternal(const HloComputation& other, + bool is_layout_sensitive, + bool ignore_channel_id_values) const { if (this == &other) { return true; } @@ -855,15 +864,21 @@ bool HloComputation::Equal(const HloComputation& other, continue; } visited.emplace(pair); - // TODO(b/123082518): Avoid recursively invoking == because it may + // TODO(b/123082518): Avoid recursively invoking Equal because it may // cause a stack overflow with deeply nested subcomputations. - bool identical_ignoring_operands = pair.first->Identical( - *pair.second, - [](const HloInstruction*, const HloInstruction*) { return true; }, - [](const HloComputation* a, const HloComputation* b) { - return *a == *b; - }, - is_layout_sensitive); + auto operands_eq = [](const HloInstruction*, const HloInstruction*) { + return true; + }; + auto comp_eq = [&](const HloComputation* a, const HloComputation* b) { + return a->EqualInternal(*b, is_layout_sensitive, + ignore_channel_id_values); + }; + bool identical_ignoring_operands = + ignore_channel_id_values + ? pair.first->IdenticalIgnoringChannelIdValues( + *pair.second, operands_eq, comp_eq, is_layout_sensitive) + : pair.first->Identical(*pair.second, operands_eq, comp_eq, + is_layout_sensitive); if (!identical_ignoring_operands) { return false; } diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index d640007886c..d618a527070 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -122,7 +122,8 @@ class HloComputation { // Add an instruction to the computation. The computation takes ownership of // the instruction. - HloInstruction* AddInstruction(std::unique_ptr instruction); + HloInstruction* AddInstruction(std::unique_ptr instruction, + const std::string& new_name = ""); // Remove the param_no'th parameter from the computation. // Note this is only applicatable to the computation for the fusion @@ -310,7 +311,19 @@ class HloComputation { ProgramShape ComputeProgramShape(bool include_ids = true) const; // Return whether `*this` and `other` are functionally equivalent. - bool Equal(const HloComputation& other, bool is_layout_sensitive) const; + bool Equal(const HloComputation& other, bool is_layout_sensitive) const { + return EqualInternal(other, is_layout_sensitive, + /*ignore_channel_id_values=*/false); + } + + // Same as Equal() but ignores channel ID value mismatches on instructions, as + // long as the two instructions both have channel IDs or neither has a channel + // ID. 
+ bool EqualIgnoringChannelIdValues(const HloComputation& other, + bool is_layout_sensitive) const { + return EqualInternal(other, is_layout_sensitive, + /*ignore_channel_id_values=*/true); + } // Return whether `*this` and `other` are functionally equivalent. bool operator==(const HloComputation& other) const { @@ -489,6 +502,10 @@ class HloComputation { HloInstruction* AddInstructionInternal( std::unique_ptr instruction); + // Internal helper for comparison with different options. + bool EqualInternal(const HloComputation& other, bool is_layout_sensitive, + bool ignore_channel_id_values) const; + // Fuses HLOs in instructions_to_fuse into fusion_instruction. // // Pre-condition: fusion_instruction's opcode is kFusion. diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 4ba67888409..4aeeb6d27ac 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -92,16 +92,17 @@ StatusOr MakeSliceHlo(HloInstruction* operand, StatusOr MakeConvolveHlo( HloInstruction* lhs, HloInstruction* rhs, int64 feature_group_count, - const Window& window, const ConvolutionDimensionNumbers& dimension_numbers, + int64 batch_group_count, const Window& window, + const ConvolutionDimensionNumbers& dimension_numbers, const PrecisionConfig& precision_config) { HloComputation* computation = lhs->parent(); CHECK_EQ(computation, rhs->parent()); TF_ASSIGN_OR_RETURN(Shape convolve_shape, ShapeInference::InferConvolveShape( - lhs->shape(), rhs->shape(), feature_group_count, 1, - window, dimension_numbers)); + lhs->shape(), rhs->shape(), feature_group_count, + batch_group_count, window, dimension_numbers)); return computation->AddInstruction(HloInstruction::CreateConvolve( - convolve_shape, lhs, rhs, feature_group_count, 1, window, + convolve_shape, lhs, rhs, feature_group_count, batch_group_count, window, dimension_numbers, precision_config)); } diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index 2b17ae3d967..53eeeffb858 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -61,7 +61,8 @@ StatusOr MakeSliceHlo(HloInstruction* operand, // containing `lhs` and `rhs` (`lhs` and `rhs` must be in the same computation). StatusOr MakeConvolveHlo( HloInstruction* lhs, HloInstruction* rhs, int64 feature_group_count, - const Window& window, const ConvolutionDimensionNumbers& dimension_numbers, + int64 batch_group_count, const Window& window, + const ConvolutionDimensionNumbers& dimension_numbers, const PrecisionConfig& precision_config); // Creates a transpose HLO instruction and adds it to the computation containing diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index a46d20d5808..bc1063f9d48 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include "absl/algorithm/container.h" @@ -32,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_value.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -42,7 +44,45 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" namespace xla { +namespace { +// CalculatePostOrderSchedule traverses a module and assign a ordinal to each +// instruction based the postorder dependency. +int64 CalculatePostOrderScheduleHelper( + const HloComputation* comp, int64 start_ordinal, + absl::flat_hash_map* ordinal_map) { + int64 ordinal = start_ordinal; + for (HloInstruction* instruction : comp->MakeInstructionPostOrder()) { + if (instruction->opcode() == HloOpcode::kCall || + instruction->opcode() == HloOpcode::kConditional) { + for (const HloComputation* called_computation : + instruction->called_computations()) { + ordinal = CalculatePostOrderScheduleHelper(called_computation, ordinal, + ordinal_map); + } + } + if (instruction->opcode() == HloOpcode::kWhile) { + ordinal = CalculatePostOrderScheduleHelper(instruction->while_condition(), + ordinal, ordinal_map); + ordinal = CalculatePostOrderScheduleHelper(instruction->while_body(), + ordinal, ordinal_map); + } + // It's possible that in some unit tests the computation graph is not + // flatten (meaning we could have multiple callers for one computation). In + // that case the oridinal_map will see the instruction multiple times. We + // consider that case to be ok as it only shows up in unit tests. + ordinal_map->insert({instruction, ordinal++}); + } + return ordinal; +} +absl::flat_hash_map CalculatePostOrderSchedule( + const HloModule& module) { + absl::flat_hash_map map; + CalculatePostOrderScheduleHelper(module.entry_computation(), 0, &map); + return map; +} + +} // namespace using absl::StrAppend; using absl::StrCat; @@ -392,6 +432,23 @@ bool HloDataflowAnalysis::UpdateSendValueSet(HloInstruction* send) { return changed; } +bool HloDataflowAnalysis::UpdateCustomCallValueSet( + HloInstruction* custom_call) { + CHECK_EQ(custom_call->opcode(), HloOpcode::kCustomCall); + bool changed = false; + for (const auto& aliasing : Cast(custom_call) + ->output_to_operand_aliasing()) { + const HloValueSet& operand_value_set = GetValueSet( + custom_call->operand(aliasing.second.first), aliasing.second.second); + HloValueSet& value_set = GetValueSet(custom_call, aliasing.first); + if (value_set != operand_value_set) { + value_set = operand_value_set; + changed = true; + } + } + return changed; +} + bool HloDataflowAnalysis::UpdateCopyStartValueSet(HloInstruction* copy_start) { CHECK_EQ(copy_start->opcode(), HloOpcode::kCopyStart); bool changed = false; @@ -717,6 +774,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( return UpdateAddDependencyValueSet(instruction); case HloOpcode::kBitcast: return UpdateBitcastValueSet(instruction); + case HloOpcode::kCustomCall: + return UpdateCustomCallValueSet(instruction); case HloOpcode::kSetDimensionSize: return UpdateSetDimensionSizeValueSet(instruction); case HloOpcode::kDomain: @@ -757,27 +816,35 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( } void HloDataflowAnalysis::Propagate() { - std::queue worklist; + using Work = std::pair; + // Avoid duplicating work by preferring work items early in the post order + // schedule. 
Intuitively, we start from entry parameters and propagate buffers + // updates throughout the module only once. + std::priority_queue, std::greater> worklist; absl::flat_hash_set workset; - auto add_to_worklist = [&worklist, &workset](HloInstruction* instruction) { + auto priority_map = CalculatePostOrderSchedule(module_); + auto add_to_worklist = [&priority_map, &worklist, + &workset](HloInstruction* instruction) { if (workset.insert(instruction).second) { - worklist.push(instruction); + worklist.emplace(priority_map[instruction], instruction); } }; - for (HloComputation* computation : module_.computations()) { - for (HloInstruction* instruction : computation->instructions()) { + auto comps = module_.MakeComputationPostOrder(); + for (HloComputation* computation : comps) { + for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { add_to_worklist(instruction); } } VLOG(1) << "SSA_FORM_: " << ssa_form_; while (!worklist.empty()) { - HloInstruction* instruction = worklist.front(); + HloInstruction* instruction = worklist.top().second; auto add_to_worklist = [&](HloInstruction* todo) { if (workset.insert(todo).second) { VLOG(1) << " Adding todo : " << todo->name(); - worklist.push(todo); + worklist.emplace(priority_map[todo], todo); } }; worklist.pop(); @@ -970,6 +1037,22 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() { define_value_at(/*index=*/{1}); define_value_at(/*index=*/{2}); break; + case HloOpcode::kCustomCall: { + absl::flat_hash_set aliasing_indices; + for (const auto& aliasing : + Cast(instruction) + ->output_to_operand_aliasing()) { + aliasing_indices.insert(aliasing.first); + } + ShapeUtil::ForEachSubshape( + instruction->shape(), + [&](const Shape& /*subshape*/, const ShapeIndex& index) { + if (!aliasing_indices.contains(index)) { + define_value_at(index); + } + }); + break; + } default: define_all_values(); break; @@ -1130,69 +1213,49 @@ bool HloDataflowAnalysis::DoesNotUseOperandBuffer( return true; } -// Given a fusion whose root is a dynamic-update-slice op, determines whether -// the fusion's output buffer can be shared with the buffer of fusion_param, -// which must be a fused parameter of the fusion. -// -// Preconditions: -// -// - fusion's root is a dynamic-update-slice op. -// - fusion_param is a parameter within the fusion. -// -// fusion_param may point to a subelement of the actual parameter instruction if -// the param is a tuple; i.e. fusion_param->index() need not be the empty list. -// -// Returns true if: -// -// * fusion_param is used by the root of dynamic-update-slice as the "base" of -// the update, i.e. the thing being updated, AND -// * all other uses of fusion_param are dynamic-slices that slice the same -// indices as are overwritten in the dynamic-update-slice. -// -// In the case that there are no other uses of fusion_param (last bullet point -// is vacuously true) it's easy to see why an in-place DUS is safe; this is just -// the "natural" implementation of DUS. If there are other users, in-place DUS -// is safe on the assumption that the thread which writes element i of the -// output will be the only one to read element i of fusion_param (via the -// dynamic-slice ops). 
-static bool CanDoInPlaceDynamicUpdateSlice(HloInstruction* fusion, - const HloValue& fusion_param_value) { - auto* root = - Cast(fusion->fused_expression_root()); - auto* fusion_param = fusion_param_value.instruction(); - CHECK_EQ(fusion_param->opcode(), HloOpcode::kParameter); - CHECK_EQ(fusion_param->parent(), fusion->fused_instructions_computation()); +/*static*/ bool HloDataflowAnalysis::IsInPlaceOperation(HloOpcode opcode) { + return opcode == HloOpcode::kDynamicUpdateSlice || + opcode == HloOpcode::kScatter; +} - // fusion_param must be used by the root as the "base" of the - // dynamic-update-slice. The natural way to check this would be - // - // `if (root->operand(0) != fusion_param)` - // - // but we also have to handle the case where the fusion parameter is - // tuple-shaped and we're considering just one element of that tuple, i.e. - // fusion_param.index() != {}. - if (absl::c_count_if(fusion_param_value.uses(), [&](const HloUse& use) { - return use.instruction == root; - }) != 1) { - return false; +/*static*/ std::vector> +HloDataflowAnalysis::GetInPlaceInputOutputPairs(HloInstruction* instruction) { + if (IsInPlaceOperation(instruction->opcode())) { + return {{HloUse{instruction, 0, {}}, {}}}; + } else if (instruction->opcode() != HloOpcode::kFusion) { + return {}; } - - // All other uses of fusion_param must be dynamic-slices that slice the same - // indices as are overwritten by the dynamic-update-slice. - for (const HloUse& use : fusion_param_value.uses()) { - auto* user = use.instruction; - if (user == root) { - continue; + std::vector> input_output_pairs; + for (auto& indexed_shape : ShapeUtil::GetLeafShapes(instruction->shape())) { + const HloInstruction* hlo_generating_output = + instruction->fused_expression_root(); + for (int64 i = 0; i < indexed_shape.index.size(); ++i) { + if (hlo_generating_output->opcode() == HloOpcode::kTuple) { + hlo_generating_output = + hlo_generating_output->operand(indexed_shape.index[i]); + } else { + CHECK_EQ(i, indexed_shape.index.size() - 1); + } } - // Check that `user` is a dynamic-slice op and has the same slice indices as - // `root`. - auto* ds = DynCast(user); - if (!ds || ds->index_operands() != root->index_operands()) { - return false; + if (IsInPlaceOperation(hlo_generating_output->opcode())) { + ShapeIndex operand_index; + const HloInstruction* fusion_parameter = + hlo_generating_output->operand(0); + while (fusion_parameter->opcode() == HloOpcode::kGetTupleElement) { + operand_index.push_front(fusion_parameter->tuple_index()); + fusion_parameter = fusion_parameter->operand(0); + } + + if (fusion_parameter->opcode() == HloOpcode::kParameter) { + input_output_pairs.emplace_back( + HloUse{instruction, fusion_parameter->parameter_number(), + operand_index}, + indexed_shape.index); + } } } - return true; + return input_output_pairs; } bool HloDataflowAnalysis::CanShareOperandBufferWithUser( @@ -1213,24 +1276,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser( return false; } - if (user->opcode() == HloOpcode::kFusion) { - // Get the parameter associated with 'operand'; - HloInstruction* fusion_param = - user->fused_parameter(user->operand_index(operand)); - - const HloValue& fusion_param_value = - GetValueDefinedAt(fusion_param, operand_index); - - // TODO(b/80315712): This code is in a bit of a weird intermediate state - // at the moment. The in-place DUS check really needs to be common to all - // backends, so it runs first. 
Then we run the backend-specific check if - // provided, or go through the target-independent check if not. - // Unfortunately, the notionally "target-independent" path actually contains - // some target-specific code, so we can't run all of it *in addition* to the - // target-specific function, like the interface documentation says. - if (user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - return CanDoInPlaceDynamicUpdateSlice(user, fusion_param_value); + // Must-alias relationship returns true for in-place operations (DUS and DUS + // fusions), regardless of the backend. + for (const auto& operand_and_output_index : + GetInPlaceInputOutputPairs(user)) { + if (operand_and_output_index.second != user_index) { + continue; + } + for (const HloUse& use : GetUniqueValueAt(operand, operand_index).uses()) { + if (use == operand_and_output_index.first) { + return true; + } } } diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index bec592aeb20..c3aad04023f 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -49,6 +49,9 @@ class HloDataflowAnalysis { // Infrastructure for passing may-alias hints: HLO passes can populate the // may-alias table. If an empty optional is returned, default rules are used. // + // Must-alias rules (as defined by GetInPlaceInputOutputPairs) cannot be + // overriden using backend-specific overrides. + // // The first parameter of the function should be the instruction, the // second parameter should be an operand of the instruction. The third // parameter should be the output index of the instruction. @@ -160,6 +163,15 @@ class HloDataflowAnalysis { const HloModule& module() const { return module_; } + // Returns true if the operation is an in-place operation and its operand 0 + // must alias with the output. + static bool IsInPlaceOperation(HloOpcode opcode); + + // Returns a vector consisting of the HloUse (operand number and shape index) + // and output shape index of the in-place operations within this HLO. 
+ static std::vector> GetInPlaceInputOutputPairs( + HloInstruction* instruction); + protected: HloDataflowAnalysis(const HloModule& module, bool ssa_form, bool bitcast_defines_value = false, @@ -204,6 +216,7 @@ class HloDataflowAnalysis { bool UpdateCallValueSet(HloInstruction* call); bool UpdateConditionalValueSet(HloInstruction* conditional); bool UpdateCopyValueSet(HloInstruction* copy); + bool UpdateCustomCallValueSet(HloInstruction* custom_call); bool UpdateDomainValueSet(HloInstruction* domain); bool UpdateGetTupleElementValueSet(HloInstruction* gte); bool UpdateParameterValueSet(HloInstruction* parameter); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 551ffb52031..1fa6fe95c40 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -2324,36 +2324,6 @@ TEST_F(CanShareOperandBufferWithUserTest, dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {})); } -TEST_F(CanShareOperandBufferWithUserTest, DUSWithSliceWithDifferentIndices) { - const char* kModule = R"( - HloModule test - - fused_computation { - p0 = f32[10,20,30] parameter(0) - p1 = s32[] parameter(1) - p2 = s32[] parameter(2) - p3 = s32[] parameter(3) - slice = f32[1,1,30] dynamic-slice(p0, p1, p2, p3), dynamic_slice_sizes={1,1,30} - ROOT dus = f32[10,20,30] dynamic-update-slice(p0, slice, p1, p3, p2) - } - - ENTRY test { - p0 = f32[10,20,30] parameter(0) - p1 = s32[] parameter(1) - p2 = s32[] parameter(2) - p3 = s32[] parameter(3) - ROOT fusion = f32[10,20,30] fusion(p0, p1, p2, p3), kind=kLoop, calls=fused_computation - } - )"; - TF_ASSERT_OK_AND_ASSIGN(module_, ParseAndReturnVerifiedModule(kModule)); - auto* fusion = module_->entry_computation()->root_instruction(); - auto* param = module_->entry_computation()->parameter_instruction(0); - - RunAnalysis(); - EXPECT_FALSE( - dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, fusion, {})); -} - TEST_F(CanShareOperandBufferWithUserTest, DUSWithSliceWithSameIndices) { const char* kModule = R"( HloModule test diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h index d5f0c62adc1..4fb7edd0104 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -1157,7 +1157,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { const int64 feature_group_index = out_index[output_z_dim] / output_feature_group_size; - const int64 batch_group_index = out_index[output_z_dim]; + const int64 depthwise_multiplier = + batch_group_count > 1 ? 
output_z_size / input_batch_size : 1; + const int64 batch_group_index = + out_index[output_z_dim] / depthwise_multiplier; ElementwiseT result_val = static_cast(0); DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(), @@ -1218,7 +1221,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { feature_group_index * input_feature_group_size + rhs_iz; int64 lhs_linear_index = lhs_linear_spatial_index; - lhs_linear_index += out_index[output_batch_dim] * lhs_dim_multipliers[input_batch_dim]; @@ -1233,7 +1235,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { lhs_dim_multipliers[input_batch_dim]; lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim]; - int64 rhs_linear_index = rhs_linear_spatial_index; rhs_linear_index += out_index[output_z_dim] * @@ -2299,8 +2300,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { std::vector input_index(operand_shape.dimensions_size()); std::vector update_index(updates_shape.dimensions_size()); - std::vector input_scatter_index_clamped( - operand_shape.dimensions_size()); UpdateScatterIndexToInputIndex update_scatter_index_to_input_index( &scatter->scatter_dimension_numbers(), /*input_shape=*/operand_shape, @@ -2789,7 +2788,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { // bound, call `f` with the base index. static void IterateThroughWindow( const Shape& window_shape, const Window& window, const Shape& base_shape, - const absl::Span& window_count_index, + const absl::Span window_count_index, const std::function&)>& f) { const int64 rank = base_shape.rank(); DimensionVector window_index(rank); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index bb01fdd0e15..41488dcdaaa 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -568,6 +568,19 @@ StatusOr> HloInstruction::CreateFromProto( std::max(static_cast(proto.batch_group_count()), int64{1})); custom_call_instr->set_custom_call_has_side_effect( proto.custom_call_has_side_effect()); + std::vector>> + output_to_operand_aliasing; + for (const auto& aliasing : proto.custom_call_output_operand_aliasing()) { + output_to_operand_aliasing.emplace_back( + ShapeIndex(aliasing.output_shape_index().begin(), + aliasing.output_shape_index().end()), + std::pair{ + aliasing.operand_index(), + ShapeIndex(aliasing.operand_shape_index().begin(), + aliasing.operand_shape_index().end())}); + } + custom_call_instr->set_output_to_operand_aliasing( + std::move(output_to_operand_aliasing)); break; } case HloOpcode::kPad: @@ -1942,6 +1955,56 @@ Status HloInstruction::CopyAllControlDepsFrom(const HloInstruction* inst) { return Status::OK(); } +bool HloInstruction::IdenticalInternal( + const HloInstruction& other, + const std::function& + eq_operands, + const std::function& + eq_computations, + bool layout_sensitive, bool ignore_channel_id_values) const { + // An instruction is always identical to itself. + if (this == &other) { + return true; + } + + // Identical instruction must have the same opcode, shape, and identical + // operands. + if (opcode() != other.opcode()) { + return false; + } + if (!(layout_sensitive ? ShapeUtil::Equal(shape(), other.shape()) + : ShapeUtil::Compatible(shape(), other.shape()))) { + return false; + } + if (operands().size() != other.operands().size()) { + return false; + } + + // Two AllReduces are Identical if they have the same channel_id. 
+ // Their operands don't have to be Identical. + if (!IsCrossModuleAllReduce()) { + // Use an explicit loop rather than ContainerEquals, because copying + // around std::functions may be too expensive in some cases. + for (size_t i = 0; i < operands().size(); ++i) { + if (!eq_operands(operand(i), other.operand(i))) { + return false; + } + } + } + + if (backend_config_ != other.backend_config_) { + return false; + } + + if (ignore_channel_id_values) { + if (auto channel_inst = DynCast(this)) { + return channel_inst->IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations); + } + } + return IdenticalSlowPath(other, eq_computations); +} + void HloInstruction::AppendOperand(HloInstruction* operand) { if (operand->parent() != nullptr) { DCHECK(!operand->parent()->IsMarkedAsDead(operand)) @@ -3370,6 +3433,11 @@ class HloInstruction::FusionReusesParamElements { // that. value_it = cache->find(&hlo); value_it->second = new_val; + // Fold() minimizes the UseKind value. If it is already minimum, we can + // break the loop early. + if (new_val == UseKind::kReuse) { + break; + } } } return value_it->second; @@ -3991,6 +4059,10 @@ const Shape& HloInstruction::outfeed_shape() const { return Cast(this)->outfeed_shape(); } +Shape* HloInstruction::mutable_outfeed_shape() { + return Cast(this)->mutable_outfeed_shape(); +} + const string& HloInstruction::outfeed_config() const { return Cast(this)->outfeed_config(); } diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 7db128b4d34..9675a2f0f0d 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -1122,41 +1122,23 @@ class HloInstruction { const std::function& eq_computations = std::equal_to(), bool layout_sensitive = true) const { - // An instruction is always identical to itself. - if (this == &other) { - return true; - } + return IdenticalInternal(other, eq_operands, eq_computations, + layout_sensitive, + /*ignore_channel_id_values=*/false); + } - // Identical instruction must have the same opcode, shape, and identical - // operands. - if (opcode() != other.opcode()) { - return false; - } - if (!(layout_sensitive ? ShapeUtil::Equal(shape(), other.shape()) - : ShapeUtil::Compatible(shape(), other.shape()))) { - return false; - } - if (operands().size() != other.operands().size()) { - return false; - } - - // Two AllReduces are Identical if they have the same channel_id. - // Their operands don't have to be Identical. - if (!IsCrossModuleAllReduce()) { - // Use an explicit loop rather than ContainerEquals, because copying - // around std::functions may be too expensive in some cases. - for (size_t i = 0; i < operands().size(); ++i) { - if (!eq_operands(operand(i), other.operand(i))) { - return false; - } - } - } - - if (backend_config_ != other.backend_config_) { - return false; - } - - return IdenticalSlowPath(other, eq_computations); + // Same as Identical() but ignores channel ID value mismatches, as long as + // both have channel IDs or neither has a channel ID. + bool IdenticalIgnoringChannelIdValues( + const HloInstruction& other, + const std::function& + eq_operands = std::equal_to(), + const std::function& + eq_computations = std::equal_to(), + bool layout_sensitive = true) const { + return IdenticalInternal(other, eq_operands, eq_computations, + layout_sensitive, + /*ignore_channel_id_values=*/true); } // Generates a hash value of an HLO instruction. 
Hash considers @@ -1787,6 +1769,9 @@ class HloInstruction { // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const; + // Returns the mutable shape for the Outfeed instruction. + Shape* mutable_outfeed_shape(); + // Delegates to HloCollectiveInstruction::replica_groups. const std::vector& replica_groups() const; @@ -1926,6 +1911,8 @@ class HloInstruction { // by factory methods. HloInstruction(HloOpcode opcode, const Shape& shape); + void RemoveAllOperands() { operands_.clear(); } + void RemoveOperandAt(int index) { operands_.erase(operands_.begin() + index); } @@ -1962,6 +1949,14 @@ class HloInstruction { private: friend class HloComputation; + bool IdenticalInternal( + const HloInstruction& other, + const std::function& + eq_operands, + const std::function& + eq_computations, + bool layout_sensitive, bool ignore_channel_id_values) const; + // Implementation for non-common logic of CloneWithNewOperands. virtual std::unique_ptr CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index df225e27aad..45b2d885d8e 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -447,7 +447,10 @@ std::vector HloChannelInstruction::ExtraAttributesToStringImpl( bool HloChannelInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& - /*eq_computations*/) const { + eq_computations) const { + if (!IdenticalSlowPathIgnoringChannelIdValues(other, eq_computations)) { + return false; + } const auto& casted_other = static_cast(other); return channel_id() == casted_other.channel_id(); } @@ -475,7 +478,7 @@ std::vector HloSendRecvInstruction::ExtraAttributesToStringImpl( return attrs; } -bool HloSendRecvInstruction::IdenticalSlowPath( +bool HloSendRecvInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { @@ -596,13 +599,14 @@ std::vector HloCollectiveInstruction::ExtraAttributesToStringImpl( return result; } -bool HloCollectiveInstruction::IdenticalSlowPath( +bool HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && + return HloChannelInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && constrain_layout() == casted_other.constrain_layout() && absl::c_equal(replica_groups(), casted_other.replica_groups(), [](const ReplicaGroup& a, const ReplicaGroup& b) { @@ -645,12 +649,13 @@ HloInstructionProto HloAllGatherInstruction::ToProto() const { return proto; } -bool HloAllGatherInstruction::IdenticalSlowPath( +bool HloAllGatherInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + return HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && all_gather_dimension_ == casted_other.all_gather_dimension() && use_global_device_ids() == casted_other.use_global_device_ids(); } @@ -691,12 +696,13 @@ std::vector HloAllReduceInstruction::ExtraAttributesToStringImpl( return result; } -bool 
HloAllReduceInstruction::IdenticalSlowPath( +bool HloAllReduceInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + return HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && constrain_layout() == casted_other.constrain_layout() && use_global_device_ids() == casted_other.use_global_device_ids() && eq_computations(to_apply(), casted_other.to_apply()); @@ -747,12 +753,13 @@ std::vector HloAllToAllInstruction::ExtraAttributesToStringImpl( return result; } -bool HloAllToAllInstruction::IdenticalSlowPath( +bool HloAllToAllInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); - return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + return HloCollectiveInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && split_dimension_ == casted_other.split_dimension(); } @@ -788,7 +795,7 @@ HloCollectivePermuteInstruction::ExtraAttributesToStringImpl( return result; } -bool HloCollectivePermuteInstruction::IdenticalSlowPath( +bool HloCollectivePermuteInstruction::IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const { @@ -797,7 +804,8 @@ bool HloCollectivePermuteInstruction::IdenticalSlowPath( } const auto& casted_other = static_cast(other); - return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && + return HloChannelInstruction::IdenticalSlowPathIgnoringChannelIdValues( + other, eq_computations) && absl::c_equal(source_target_pairs(), casted_other.source_target_pairs(), [](const std::pair& a, @@ -2387,6 +2395,16 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const { } } proto.set_custom_call_has_side_effect(custom_call_has_side_effect_); + for (const auto& pair : output_to_operand_aliasing_) { + auto aliasing = proto.add_custom_call_output_operand_aliasing(); + aliasing->set_operand_index(pair.second.first); + for (int64 index : pair.first) { + aliasing->add_output_shape_index(index); + } + for (int64 index : pair.second.second) { + aliasing->add_operand_shape_index(index); + } + } return proto; } @@ -2424,6 +2442,16 @@ std::vector HloCustomCallInstruction::ExtraAttributesToStringImpl( if (custom_call_has_side_effect_) { extra.push_back("custom_call_has_side_effect=true"); } + if (!output_to_operand_aliasing_.empty()) { + std::vector pair_strings; + for (const auto& pair : output_to_operand_aliasing_) { + pair_strings.push_back(StrCat(pair.first.ToString(), ": (", + pair.second.first, ", ", + pair.second.second.ToString(), ")")); + } + extra.push_back(StrCat("output_to_operand_aliasing={", + StrJoin(pair_strings, ", "), "}")); + } return extra; } @@ -2467,6 +2495,10 @@ bool HloCustomCallInstruction::IdenticalSlowPath( casted_other.custom_call_has_side_effect()) { return false; } + if (output_to_operand_aliasing_ != + casted_other.output_to_operand_aliasing()) { + return false; + } // Note: backend_config comparison is done in Identical, which is the // intended/exposed way to compare computations, and so not repeated here. 
return custom_call_target_ == casted_other.custom_call_target_; @@ -2491,6 +2523,7 @@ HloCustomCallInstruction::CloneWithNewOperandsImpl( cloned->set_feature_group_count(feature_group_count_); cloned->set_batch_group_count(batch_group_count_); cloned->set_custom_call_has_side_effect(custom_call_has_side_effect_); + cloned->set_output_to_operand_aliasing(output_to_operand_aliasing_); return std::move(cloned); } diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 17368e8b714..88e874347bd 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -244,6 +244,15 @@ class HloChannelInstruction : public HloInstruction { absl::optional channel_id() const { return channel_id_; } void set_channel_id(const absl::optional& channel_id); + // Whether this instruction is identical to `other` except for the values of + // channel IDs, as long as both have channel IDs or neither has a channel ID. + virtual bool IdenticalSlowPathIgnoringChannelIdValues( + const HloInstruction& other, + const std::function& + eq_computations) const { + return channel_id_.has_value() == other.channel_id().has_value(); + } + protected: explicit HloChannelInstruction(HloOpcode opcode, const Shape& shape, const absl::optional& channel_id); @@ -252,10 +261,13 @@ class HloChannelInstruction : public HloInstruction { std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; + + // Do not override IdenticalSlowPath(). Override + // IdenticalSlowPathIgnoringChannelIdValues() instead. bool IdenticalSlowPath( const HloInstruction& other, const std::function& - eq_computations) const override; + eq_computations) const final; absl::optional channel_id_; }; @@ -275,7 +287,7 @@ class HloSendRecvInstruction : public HloChannelInstruction { private: std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -363,7 +375,7 @@ class HloCollectiveInstruction : public HloChannelInstruction { std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const override; - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -390,7 +402,7 @@ class HloAllGatherInstruction : public HloCollectiveInstruction { HloInstructionProto ToProto() const override; private: - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -434,7 +446,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { HloInstructionProto ToProto() const override; private: - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -471,7 +483,7 @@ class HloAllToAllInstruction : public HloCollectiveInstruction { HloInstructionProto ToProto() const override; private: - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -501,7 +513,7 @@ class HloCollectivePermuteInstruction : public HloChannelInstruction { private: std::vector ExtraAttributesToStringImpl( const HloPrintOptions& options) const 
override; - bool IdenticalSlowPath( + bool IdenticalSlowPathIgnoringChannelIdValues( const HloInstruction& other, const std::function& eq_computations) const override; @@ -1182,6 +1194,8 @@ class HloOutfeedInstruction : public HloInstruction { absl::string_view outfeed_config); // Returns the shape for the Outfeed instruction. const Shape& outfeed_shape() const { return outfeed_shape_; } + // Returns the mutable shape for the Outfeed instruction. + Shape* mutable_outfeed_shape() { return &outfeed_shape_; } // Returns the config for the Outfeed instruction. const string& outfeed_config() const { return outfeed_config_; } void set_outfeed_config(const string& config) { outfeed_config_ = config; } @@ -1416,6 +1430,20 @@ class HloCustomCallInstruction : public HloInstruction { CHECK(layout_constrained()); return operand_shapes_with_layout_; } + // Gets a list of output/operand buffer pairs that alias each other, where the + // output buffer is represented as a ShapeIndex, and the operand buffer is + // represented as the operand index and the ShapeIndex. By default this list + // is empty. + const std::vector>>& + output_to_operand_aliasing() const { + return output_to_operand_aliasing_; + } + // Sets the list of output/operand buffer pairs that alias each other. + void set_output_to_operand_aliasing( + std::vector>> + aliasing) { + output_to_operand_aliasing_ = std::move(aliasing); + } private: std::vector ExtraAttributesToStringImpl( @@ -1444,6 +1472,10 @@ class HloCustomCallInstruction : public HloInstruction { std::vector operand_shapes_with_layout_; // Whether this custom call has a side-effect. bool custom_call_has_side_effect_; + // A list of output/operand buffer pairs that alias each other. See comment of + // output_to_operand_aliasing(). + std::vector>> + output_to_operand_aliasing_; }; class HloPadInstruction : public HloInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc index 749193a83ef..3c44b390969 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.cc +++ b/tensorflow/compiler/xla/service/hlo_lexer.cc @@ -387,6 +387,12 @@ TokKind HloLexer::LexNumberOrPattern() { return TokKind::kNegInf; } + static LazyRE2 neg_nan = {"-nan"}; + if (RE2::Consume(&consumable, *neg_nan)) { + current_ptr_ = consumable.begin(); + return TokKind::kNegNan; + } + return TokKind::kError; } @@ -502,6 +508,8 @@ string TokKindToString(TokKind kind) { return "kw_nan"; case TokKind::kw_inf: return "kw_inf"; + case TokKind::kNegNan: + return "kNegNan"; case TokKind::kNegInf: return "kNegInf"; case TokKind::kPrimitiveType: diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h index b8c7debaab4..4068ad76581 100644 --- a/tensorflow/compiler/xla/service/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -65,6 +65,7 @@ enum class TokKind { kw_nan, kw_inf, + kNegNan, // -nan kNegInf, // -inf // Typed tokens. 
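
// NOTE: the lexer change above adds a kNegNan token so HLO text such as
// constant({-nan, -nan}) can round-trip through the parser. A simplified,
// self-contained sketch of the idea, using plain prefix matching instead of
// the RE2-based lexer and a hypothetical ParseSpecialFloat helper:
#include <cmath>
#include <limits>
#include <string>

bool ParseSpecialFloat(std::string token, double* result) {
  bool negative = false;
  if (!token.empty() && token.front() == '-') {
    negative = true;
    token.erase(0, 1);
  }
  if (token == "inf") {
    *result = std::numeric_limits<double>::infinity();
  } else if (token == "nan") {
    *result = std::numeric_limits<double>::quiet_NaN();
  } else {
    return false;  // not a special float token
  }
  // copysign sets the sign bit even for NaN, matching how "-nan" should print
  // back out when the literal is re-serialized.
  if (negative) *result = std::copysign(*result, -1.0);
  return true;
}
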
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc index eaed707607d..8158d198799 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.cc +++ b/tensorflow/compiler/xla/service/hlo_module_config.cc @@ -51,12 +51,14 @@ string HloModuleConfig::compilation_cache_key() const { string key = absl::StrCat("profiling=", hlo_profiling_enabled()); StrAppend(&key, "::("); std::vector params; - for (const ShapeLayout& param_layout : - entry_computation_layout_->parameter_layouts()) { - params.push_back(param_layout.shape().DebugString()); + if (entry_computation_layout_.has_value()) { + for (const ShapeLayout& param_layout : + entry_computation_layout_->parameter_layouts()) { + params.push_back(param_layout.shape().DebugString()); + } + StrAppend(&key, absl::StrJoin(params, ", "), ") => ", + entry_computation_layout_->result_shape().SerializeAsString()); } - StrAppend(&key, absl::StrJoin(params, ", "), ") => ", - entry_computation_layout_->result_shape().SerializeAsString()); if (seed() != 0) { // TODO(b/32083678): force recompilation to reset global state. static std::atomic counter{0}; diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index e2bbda3a607..d04a7695f3c 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -212,6 +212,7 @@ class HloParserImpl : public HloParser { kEnum, kRandomAlgorithm, kAliasing, + kInstructionAliasing, }; struct AttrConfig { @@ -346,6 +347,12 @@ class HloParserImpl : public HloParser { // fails. bool ParseAliasing(AliasingData* data); + // Parses the per-instruction aliasing information from string `s`, returns + // `false` if it fails. + bool ParseInstructionOutputOperandAliasing( + std::vector>>* + aliasing_output_operand_pairs); + bool ParseShapeIndex(ShapeIndex* out); // Returns true if the current token is the beginning of a shape. 
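
// NOTE: the next hunk parses the new custom-call attribute, e.g.
//   output_to_operand_aliasing={{0}: (1, {}), {1}: (0, {0})}
// into (output ShapeIndex) -> (operand number, operand ShapeIndex) pairs.
// A self-contained sketch of the resulting data, with std::vector<int64_t>
// standing in for ShapeIndex, might look like:
#include <cstdint>
#include <utility>
#include <vector>

using ShapeIndexLike = std::vector<int64_t>;
using OutputOperandAliasing =
    std::vector<std::pair<ShapeIndexLike, std::pair<int64_t, ShapeIndexLike>>>;

OutputOperandAliasing ExampleAliasing() {
  // Output tuple element {0} shares its buffer with operand 1 at index {},
  // and output element {1} shares its buffer with operand 0 at index {0}.
  return {
      {{0}, {1, {}}},
      {{1}, {0, {0}}},
  };
}
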
@@ -598,6 +605,58 @@ bool HloParserImpl::ParseAliasing(AliasingData* data) { return true; } +bool HloParserImpl::ParseInstructionOutputOperandAliasing( + std::vector>>* + aliasing_output_operand_pairs) { + if (!ParseToken( + TokKind::kLbrace, + "Expects '{' at the start of instruction aliasing description")) { + return false; + } + + while (lexer_.GetKind() != TokKind::kRbrace) { + ShapeIndex out; + if (!ParseShapeIndex(&out)) { + return false; + } + std::string errmsg = + "Expected format: : (, " + ")"; + if (!ParseToken(TokKind::kColon, errmsg)) { + return false; + } + + if (!ParseToken(TokKind::kLparen, errmsg)) { + return false; + } + int64 operand_index; + ParseInt64(&operand_index); + if (!ParseToken(TokKind::kComma, errmsg)) { + return false; + } + ShapeIndex operand_shape_index; + if (!ParseShapeIndex(&operand_shape_index)) { + return false; + } + + aliasing_output_operand_pairs->emplace_back( + out, std::pair{operand_index, operand_shape_index}); + if (!ParseToken(TokKind::kRparen, errmsg)) { + return false; + } + + if (!EatIfPresent(TokKind::kComma)) { + break; + } + } + if (!ParseToken( + TokKind::kRbrace, + "Expects '}' at the end of instruction aliasing description")) { + return false; + } + return true; +} + // ::= 'HloModule' name computations bool HloParserImpl::ParseHloModule(HloModule* module) { if (lexer_.GetKind() != TokKind::kw_HloModule) { @@ -1777,6 +1836,8 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, optional> operand_layout_constraints; optional custom_call_has_side_effect; optional to_apply; + optional>>> + output_to_operand_aliasing; attrs["custom_call_target"] = {/*required=*/true, AttrTy::kString, &custom_call_target}; attrs["window"] = {/*required=*/false, AttrTy::kWindow, &window}; @@ -1792,6 +1853,9 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, &custom_call_has_side_effect}; attrs["to_apply"] = {/*required=*/false, AttrTy::kHloComputation, &to_apply}; + attrs["output_to_operand_aliasing"] = {/*required=*/false, + AttrTy::kInstructionAliasing, + &output_to_operand_aliasing}; if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { return false; } @@ -1861,6 +1925,10 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, custom_call_instr->set_custom_call_has_side_effect( *custom_call_has_side_effect); } + if (output_to_operand_aliasing.has_value()) { + custom_call_instr->set_output_to_operand_aliasing( + std::move(*output_to_operand_aliasing)); + } break; } case HloOpcode::kDot: { @@ -2649,6 +2717,7 @@ bool HloParserImpl::ParseDenseLiteral(Literal* literal, const Shape& shape) { case TokKind::kInt: case TokKind::kDecimal: case TokKind::kw_nan: + case TokKind::kNegNan: case TokKind::kw_inf: case TokKind::kNegInf: { add_one_elem_seen(); @@ -3223,6 +3292,19 @@ bool HloParserImpl::ParseAttributeHelper( ->emplace(aliasing_data); return true; } + case AttrTy::kInstructionAliasing: { + std::vector>> + aliasing_output_operand_pairs; + if (!ParseInstructionOutputOperandAliasing( + &aliasing_output_operand_pairs)) { + return false; + } + static_cast>>>*>( + attr_out_ptr) + ->emplace(std::move(aliasing_output_operand_pairs)); + return true; + } } }(); if (!success) { @@ -4293,6 +4375,9 @@ bool HloParserImpl::ParseDouble(double* result) { case TokKind::kw_nan: *result = std::numeric_limits::quiet_NaN(); break; + case TokKind::kNegNan: + *result = -std::numeric_limits::quiet_NaN(); + break; case TokKind::kw_inf: *result = std::numeric_limits::infinity(); break; diff --git 
a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 620e67c3a2f..3cb9a1c564b 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -991,6 +991,19 @@ ENTRY %CustomCallWithHasSideEffect (p0: (f32[2,2], f32[42,2,3]), p1: f32[123,4]) ROOT %custom-call = (f32[1,2,3]{0,2,1}, f32[1,2,3]{1,2,0}) custom-call((f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", custom_call_has_side_effect=true } +)" +}, +// CustomCallWithAliasing +{ +"CustomCallWithAliasing", +R"(HloModule CustomCallWithAliasing + +ENTRY %CustomCallWithAliasing (p0: (f32[2,2], f32[42,2,3]), p1: f32[123,4]) -> (f32[123,4], f32[2,2], f32[1,2,3]) { + %p0 = (f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) parameter(0) + %p1 = f32[123,4]{0,1} parameter(1) + ROOT %custom-call = (f32[123,4]{0,1}, f32[2,2]{0,1}, f32[1,2,3]{0,1,2}) custom-call((f32[2,2]{0,1}, f32[42,2,3]{0,1,2}) %p0, f32[123,4]{0,1} %p1), custom_call_target="baz", output_to_operand_aliasing={{0}: (1, {}), {1}: (0, {0})} +} + )" }, // Parse c64 literal @@ -2107,6 +2120,19 @@ ENTRY %ShortConstant.v4 () -> f32[67,89] { EXPECT_EQ(result.ValueOrDie()->ToString(HloPrintOptions()), original); } +TEST_F(HloParserTest, NegativeNan) { + const string original = R"(HloModule NegativeNan_module + +ENTRY %NegativeNan () -> bf16[2] { + ROOT %constant = bf16[2]{0} constant({-nan, -nan}) +} + +)"; + auto result = ParseAndReturnUnverifiedModule(original); + EXPECT_EQ(Status::OK(), result.status()); + EXPECT_EQ(result.ValueOrDie()->ToString(HloPrintOptions()), original); +} + TEST_F(HloParserTest, AttributesAnyOrder) { const string original = R"(HloModule any_order_module diff --git a/tensorflow/compiler/xla/service/hlo_pass_fix.h b/tensorflow/compiler/xla/service/hlo_pass_fix.h index a22a394c6a4..1533c53ba45 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_fix.h +++ b/tensorflow/compiler/xla/service/hlo_pass_fix.h @@ -43,11 +43,13 @@ class HloPassFix : public Pass { while (changed_this_iteration) { TF_ASSIGN_OR_RETURN(changed_this_iteration, Pass::Run(module)); changed |= changed_this_iteration; - VLOG(3) << "changed_this_iteration: " << changed_this_iteration; + VLOG(3) << Pass::name() << " iteration " << iteration_count + << " changed_this_iteration: " << changed_this_iteration; ++iteration_count; if (iteration_count == kLimit) { - VLOG(1) << "Unexpectedly high number of iterations in HLO passes, " - "exiting fixed point loop."; + VLOG(1) << "Unexpectedly high number of iterations in HLO passes '" + << Pass::name() << "' for module '" << module->name() + << "'. Exiting fixed point loop."; // Return false in case this is fixed point is nested. 
return false; } diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index b07ab10827a..74c385f16bd 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -58,6 +58,7 @@ StatusOr HloPassPipeline::RunPassesInternal( TF_RETURN_IF_ERROR(RunInvariantCheckers(hlo, last_pass_name)); bool changed = false; for (HloPassInterface* pass : passes) { + XLA_SCOPED_LOGGING_TIMER(absl::StrCat("HLO pass: ", pass->name())); absl::string_view pass_name = pass->name(); VLOG(1) << " HLO pass " << pass_name; VLOG(2) << " Module hash " << hlo->Hash(); @@ -69,6 +70,9 @@ StatusOr HloPassPipeline::RunPassesInternal( } TF_ASSIGN_OR_RETURN(bool pass_changed, RunHelper(pass, hlo)); changed |= pass_changed; + if (pass_changed) { + VLOG(3) << " Pass caused changes" << pass->name(); + } TF_RETURN_IF_ERROR(RunInvariantCheckers(hlo, pass_name)); last_pass_name = string(pass_name); if (!pass->IsPassPipeline()) { diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index 7f974a618a8..59b1ac31e9b 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -135,12 +135,17 @@ struct Item { // The buffers used by this instruction. BufferIdList buffers_used; + bool is_skip_node = false; + private: friend class InstructionList; // Items are arranged in a doubly linked list. - Item* next; - Item* prev; + Item* next = nullptr; + Item* prev = nullptr; + + Item* prev_skip_node = nullptr; + Item* next_skip_node = nullptr; // List is ordered by position, which can however be duplicated as // new instructions are inserted. See InsertBeforeInstructions @@ -152,11 +157,23 @@ using ItemList = absl::InlinedVector; // Class which maintains an ordered list of instructions with fast insertion // before arbitrary elements. +// +// This is a skip list structure that has two lanes: express lane and slow lane. +// All nodes are presented on the slow lane but a node can be promoted into +// express lane for fast iteration. +// +// In the following case, node 2 and node + 1 are connected via an express lane. +// +--------------------------+----------->: Express lane +// | | +// node1<-> node 2 <-> .. <-> node n <-> node n+1 <->...: Slow lane +// class InstructionList { public: explicit InstructionList(const HloInstructionSequence& order) { int64 position = 0; Item* last = nullptr; + last_skip_node_ = nullptr; + first_skip_node_ = nullptr; for (HloInstruction* inst : order.instructions()) { // Add a new item to the linked list. Item* item = new Item; @@ -198,6 +215,9 @@ class InstructionList { Item* first() const { return first_; } Item* next(Item* item) const { return item->next; } + Item* first_skip_node() const { return first_skip_node_; } + Item* next_skip_node(Item* item) const { return item->next_skip_node; } + // Creates an Item for the given instruction, but doesn't add it to the list. // (Use InsertBeforeInstructions to add the Item to the list.) 
Item* CreateItem(HloInstruction* inst) { @@ -266,6 +286,27 @@ class InstructionList { return InsertBefore(to_insert, min_position_item); } + // Scan the list and promote nodes to express lane if should_promote(Item) + // returns true; + void PromoteNodesToSkip(std::function should_promote) { + int64 count = 0; + for (auto* item = first(); item != nullptr; item = next(item)) { + if (should_promote(item)) { + count += 1; + if (first_skip_node_ == nullptr) { + first_skip_node_ = item; + } + item->is_skip_node = true; + item->prev_skip_node = last_skip_node_; + if (last_skip_node_ != nullptr) { + last_skip_node_->next_skip_node = item; + } + last_skip_node_ = item; + } + } + VLOG(1) << " Rematerialization has " << count << " items in express lane"; + } + void InsertAfterInstructions(Item* to_insert, absl::Span after_instructions) { VLOG(3) << "InsertAfterInstructions: " << to_insert->instruction->name() @@ -301,6 +342,44 @@ class InstructionList { void InsertBefore(Item* item, Item* before) { VLOG(3) << "InsertBefore: " << item->instruction->name() << " before " << before->instruction->name(); + // Always place new nodes on express lane for the ease of implementation. + item->is_skip_node = true; + // Find the next express node starting from 'before'. Set up the node's + // express pointers. + Item* cursor = before; + while (cursor != nullptr && !cursor->is_skip_node) { + cursor = cursor->next; + } + CHECK(cursor == nullptr || cursor->is_skip_node); + if (cursor == nullptr) { + // + // last_skip_node_<---+ : express lane + // | + // ...<->`item`<-> .. <-> `cursor`(null) : slow lane + // + // Reached the end. Set the prev_express to last_skip_node, and reset + // last_skip. + item->prev_skip_node = last_skip_node_; + item->next_skip_node = nullptr; + last_skip_node_ = item; + } else { + // + // <-+------------+----------------+---------> : express lane + // | | | + // prev_express..<->`item`<-> .. <-> `cursor` <-> ...: slow lane + // + // Reached the next skip node, sets up express pointers accordingly. + CHECK(cursor->is_skip_node); + item->prev_skip_node = cursor->prev_skip_node; + if (item->prev_skip_node != nullptr) { + item->prev_skip_node->next_skip_node = item; + } + item->next_skip_node = cursor; + cursor->prev_skip_node = item; + } + if (first_skip_node_ == cursor) { + first_skip_node_ = item; + } // Insert new item into linked list. item->prev = before->prev; item->next = before; @@ -319,6 +398,12 @@ class InstructionList { Item* first_; + // First skip node of this list. + Item* first_skip_node_; + + // Last skip node of this list. + Item* last_skip_node_; + // Item for each instruction. absl::flat_hash_map item_map_; }; @@ -460,6 +545,15 @@ class MemoryUsageTracker { // values. int64 memory_usage() const { return memory_usage_; } + // + int64 AllocatedSize(Item* item) const { + int64 size = 0; + for (auto buffer_id : item->buffers_defined) { + size += AllocatedSize(buffer_id); + } + return size; + } + // Check invariants of the data structure. This is expensive to call. 
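
// NOTE: the hunks above thread an "express lane" through the instruction
// list: every node stays on the slow lane, and PromoteNodesToSkip links only
// the interesting nodes (e.g. those defining at least min_remat_size bytes)
// so candidate scans can walk just the promoted nodes. A minimal stand-alone
// sketch of that two-lane structure (simplified types, no insertion logic):
#include <functional>

struct Node {
  int payload = 0;
  Node* next = nullptr;       // slow lane: every node
  Node* next_skip = nullptr;  // express lane: promoted nodes only
  bool is_skip_node = false;
};

class TwoLaneList {
 public:
  void Append(Node* n) {
    if (last_ != nullptr) last_->next = n;
    if (first_ == nullptr) first_ = n;
    last_ = n;
  }
  // Promote nodes for which should_promote() is true onto the express lane.
  void PromoteNodes(const std::function<bool(Node*)>& should_promote) {
    Node* last_skip = nullptr;
    for (Node* n = first_; n != nullptr; n = n->next) {
      if (!should_promote(n)) continue;
      n->is_skip_node = true;
      if (last_skip != nullptr) last_skip->next_skip = n;
      if (first_skip_ == nullptr) first_skip_ = n;
      last_skip = n;
    }
  }
  Node* first() const { return first_; }
  Node* first_skip() const { return first_skip_; }

 private:
  Node* first_ = nullptr;
  Node* last_ = nullptr;
  Node* first_skip_ = nullptr;
};

// Usage: scan only the express lane when looking for candidates.
//   for (Node* n = list.first_skip(); n != nullptr; n = n->next_skip) { ... }
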
bool Check() const; @@ -652,7 +746,6 @@ MemoryUsageTracker::MemoryUsageTracker( .CreateFlattenedSet(); absl::flat_hash_map logical_buffer_to_buffer_id; - for (auto* item = instruction_list_.first(); item != nullptr; item = instruction_list_.next(item)) { const HloInstruction* const instruction = item->instruction; @@ -1186,8 +1279,9 @@ MemoryUsageTracker::PickRematerializationCandidates( VLOG(5) << "Picking candidate block with size in [" << min_block_size << ", " << max_block_size << "]"; - for (auto* start_item = instruction_list.first(); start_item != nullptr; - start_item = instruction_list.next(start_item)) { + for (auto* start_item = instruction_list.first_skip_node(); + start_item != nullptr; + start_item = instruction_list.next_skip_node(start_item)) { std::vector block = GetInitialBlock(instruction_list, *this, start_item, min_block_size); if (block.size() < min_block_size) { @@ -1427,12 +1521,13 @@ StatusOr CompressInstruction(MemoryUsageTracker* memory_tracker, << ") to" << compact_shape.ToString(true); HloComputation* computation = best->parent(); - HloInstruction* compressed = computation->AddInstruction( - HloInstruction::CreateUnary(compact_shape, HloOpcode::kCopy, best)); + HloInstruction::CreateUnary(compact_shape, HloOpcode::kCopy, best), + /*new_name=*/best->name() + ".remat_compressed"); HloInstruction* uncompressed = computation->AddInstruction( - HloInstruction::CreateUnary(best->shape(), HloOpcode::kCopy, compressed)); + HloInstruction::CreateUnary(best->shape(), HloOpcode::kCopy, compressed), + /*new_name=*/best->name() + ".remat_uncompressed"); Item* compressed_item = instruction_list->CreateItem(compressed); compressed_item->placed = true; @@ -1566,7 +1661,7 @@ StatusOr HloRematerialization::CalledComputationsMemoryUsage( StatusOr HloRematerialization::RematerializeComputation( HloComputation* computation, HloSchedule* schedule, - int64 memory_limit_bytes) { + int64 memory_limit_bytes, int64 min_remat_size) { VLOG(1) << "Rematerializing computation " << computation->name() << " with limit " << HumanReadableNumBytes(memory_limit_bytes); VLOG(1) << "peak memory usage is " @@ -1577,6 +1672,10 @@ StatusOr HloRematerialization::RematerializeComputation( MemoryUsageTracker memory_tracker( computation, size_function_, compact_shape_function_, *points_to_analysis_, instruction_list, mode_); + + instruction_list.PromoteNodesToSkip([&](Item* item) { + return memory_tracker.AllocatedSize(item) >= min_remat_size; + }); bool changed = false; // If the rematerialization makes the source instruction dead, then the @@ -1622,43 +1721,46 @@ StatusOr HloRematerialization::RematerializeComputation( // single instruction rematerialization is considered first. int min_block_size = 1; int max_block_size = 1; + // Only trigger rematerialization when the memory usage changes. 
+ if (memory_tracker.AllocatedSize(item) + callee_usage > 0) { + while (memory_tracker.memory_usage() + callee_usage > + memory_limit_bytes) { + VLOG(2) << "Over memory limit at instruction " << instruction->name() + << ", using " + << HumanReadableNumBytes(memory_tracker.memory_usage() + + callee_usage) + << ", limit is " << HumanReadableNumBytes(memory_limit_bytes); - while (memory_tracker.memory_usage() + callee_usage > memory_limit_bytes) { - VLOG(2) << "Over memory limit at instruction " << instruction->name() - << ", using " - << HumanReadableNumBytes(memory_tracker.memory_usage() + - callee_usage) - << ", limit is " << HumanReadableNumBytes(memory_limit_bytes); + TF_ASSIGN_OR_RETURN( + InstructionsAdded instructions_added, + RematerializeBestBlock(min_block_size, max_block_size, + &memory_tracker, &instruction_list, + memory_limit_bytes, &rematerializable_map, + &remat_move_instructions)); + net_instructions_added += instructions_added.net_instructions_added; + remat_count += instructions_added.remat_count; - TF_ASSIGN_OR_RETURN(InstructionsAdded instructions_added, - RematerializeBestBlock( - min_block_size, max_block_size, &memory_tracker, - &instruction_list, memory_limit_bytes, - &rematerializable_map, &remat_move_instructions)); - net_instructions_added += instructions_added.net_instructions_added; - remat_count += instructions_added.remat_count; - - VLOG(1) << "memory_usage after rematerialization = " - << HumanReadableNumBytes(memory_tracker.memory_usage()); - if (instructions_added.remat_count == 0) { - // Unable to find a block to rematerialize. - // Consider doubling the block size. - min_block_size = max_block_size + 1; - max_block_size = 2 * max_block_size; - } else { - // Found a valid block. Reset to start looking for single instructions - // again. - max_rematerialized_block_size_ = - std::max(max_rematerialized_block_size_, max_block_size); - changed = true; - min_block_size = 1; - max_block_size = 1; - } - if (max_block_size > block_size_limit_) { - break; + VLOG(1) << "memory_usage after rematerialization = " + << HumanReadableNumBytes(memory_tracker.memory_usage()); + if (instructions_added.remat_count == 0) { + // Unable to find a block to rematerialize. + // Consider doubling the block size. + min_block_size = max_block_size + 1; + max_block_size = 2 * max_block_size; + } else { + // Found a valid block. Reset to start looking for single instructions + // again. + max_rematerialized_block_size_ = + std::max(max_rematerialized_block_size_, max_block_size); + changed = true; + min_block_size = 1; + max_block_size = 1; + } + if (max_block_size > block_size_limit_) { + break; + } } } - const CallSite* callsite = call_graph_node.GetCallSite(instruction); if (callsite != nullptr && callsite->context() == CallContext::kSequential && @@ -1683,10 +1785,12 @@ StatusOr HloRematerialization::RematerializeComputation( TF_ASSIGN_OR_RETURN( bool subcomputation_changed, RematerializeComputation(called_computation, schedule, - subcomputation_memory_limit_bytes)); + subcomputation_memory_limit_bytes, + min_remat_size)); changed |= subcomputation_changed; } } + TF_ASSIGN_OR_RETURN(callee_usage, CalledComputationsMemoryUsage(instruction)); } @@ -1786,14 +1890,12 @@ StatusOr HloRematerialization::Run(HloModule* module) { module_output_size; VLOG(1) << "Peak memory usage of module (before): " << HumanReadableNumBytes(before_peak_memory); - // Subcomputations called by the entry computation will also be // rematerialized. 
TF_ASSIGN_OR_RETURN( bool changed, RematerializeComputation(module->entry_computation(), &module->schedule(), - adjusted_memory_limit_bytes)); - + adjusted_memory_limit_bytes, min_remat_size_)); // Rematerialization can introduce dead code. This occurs if all uses of an // instruction are replaced with rematerializations of the instruction. @@ -1838,7 +1940,6 @@ StatusOr HloRematerialization::Run(HloModule* module) { HumanReadableNumBytes(memory_limit_bytes_), memory_limit_bytes_, HumanReadableNumBytes(current_peak_memory), current_peak_memory); } - return changed; } diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h index 72221fa8a32..878bb2a8eef 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.h +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h @@ -85,7 +85,8 @@ class HloRematerialization : public HloModulePass { RematerializationSizes* sizes, RematerializationPass pass_location, int block_size_limit, CompactShapeFunction compact_shape_function = nullptr, - RematerializationMode mode = RematerializationMode::kRecomputeAndCompress) + RematerializationMode mode = RematerializationMode::kRecomputeAndCompress, + int64 min_remat_size = 0) : size_function_(size_function), memory_limit_bytes_(memory_limit_bytes), sizes_(sizes), @@ -94,7 +95,8 @@ class HloRematerialization : public HloModulePass { compact_shape_function_(compact_shape_function == nullptr ? DefaultCompactShapeFunction : std::move(compact_shape_function)), - mode_(mode) {} + mode_(mode), + min_remat_size_(min_remat_size) {} ~HloRematerialization() override = default; absl::string_view name() const override { return "rematerialization"; } @@ -114,7 +116,8 @@ class HloRematerialization : public HloModulePass { // and inserted into 'order'. virtual StatusOr RematerializeComputation(HloComputation* computation, HloSchedule* schedule, - int64 memory_limit_bytes); + int64 memory_limit_bytes, + int64 min_remat_size); // Computes and returns the peak memory used by the given computation. 
The // peak memory is the maximum total size of all live HLO instruction values at @@ -185,6 +188,8 @@ class HloRematerialization : public HloModulePass { int max_rematerialized_block_size_ = 0; RematerializationMode mode_; + + int64 min_remat_size_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc index 5176e2f99e5..35f39e9a342 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc @@ -41,7 +41,8 @@ using ::testing::_; class HloRematerializationTest : public RematerializationTestBase { protected: StatusOr RunHloRematerialization(int64 memory_limit_bytes, - HloModule* module) { + HloModule* module, + int64 min_remat_size = 0) { TF_EXPECT_OK(verifier().Run(module).status()); HloMemoryScheduler scheduler( [](const BufferValue& buffer) { return ByteSizeOf(buffer.shape()); }, @@ -51,7 +52,9 @@ class HloRematerializationTest : public RematerializationTestBase { ByteSizeOf, memory_limit_bytes, /*sizes=*/nullptr, HloRematerialization::RematerializationPass::kPreFusion, - /*block_size_limit=*/1); + /*block_size_limit=*/1, nullptr, + HloRematerialization::RematerializationMode::kRecomputeAndCompress, + min_remat_size); return remat.Run(module); } }; @@ -96,6 +99,26 @@ TEST_F(HloRematerializationTest, SingleComputation) { remat_bcast); } +// Test rematerialization of a single computation that contains nodes that +// doesn't contain node worth using remat. +TEST_F(HloRematerializationTest, SingleComputationNoWorthRemat) { + auto module = CreateNewVerifiedModule(); + HloComputation* computation = + module->AddEntryComputation(MakeRematerializableComputation()); + + // Find and save the original broadcast instruction which should be + // rematerialized. + const HloInstruction* slice = computation->root_instruction(); + ASSERT_THAT(slice, op::Slice(op::Concatenate(op::Broadcast(_), _))); + + // Set the minimum remat size to 14KiB, meaning no nodes should be remat. + TF_ASSERT_OK_AND_ASSIGN(bool changed, + RunHloRematerialization( + /*memory_limit_bytes=*/14 * 1024, module.get(), + /*min_remat_size=*/14 * 1024)); + EXPECT_FALSE(changed); +} + // Test rematerialization of a single computation produced by // MakeRematerializableComputation but with a sufficiently high memory limit // such that no instructions are rematerialized. @@ -577,17 +600,67 @@ class CompressingRematerializationTest : public RematerializationTestBase { } StatusOr RunHloRematerialization(int64 memory_limit_bytes, - HloModule* module) { + HloModule* module, + int64 min_remat_size = 0) { TF_EXPECT_OK(verifier().Run(module).status()); HloRematerialization remat( ShapeSizePadMinorTo64, memory_limit_bytes, /*sizes=*/nullptr, HloRematerialization::RematerializationPass::kPreFusion, - /*block_size_limit=*/1, ChooseCompactLayoutForShape); + /*block_size_limit=*/1, ChooseCompactLayoutForShape, + HloRematerialization::RematerializationMode::kCompressOnly, + min_remat_size); return remat.Run(module); } }; +// Test rematerialization only remats big buffer that pass certain limits. 
+TEST_F(CompressingRematerializationTest, OnlyRematBigBuffer) { + const string& hlo_string = R"( +HloModule fusion, is_scheduled=true + +%add_float { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %x, f32[] %y) +} + +ENTRY %entry { + %param.0 = f32[] parameter(0) + %constant = f32[] constant(0) + %broadcast.0 = f32[64,2]{1,0} broadcast(f32[] %param.0), dimensions={} + %broadcast.1 = f32[10,2]{1,0} broadcast(f32[] %param.0), dimensions={} + %negate = f32[64,2]{1,0} negate(f32[64,2]{1,0} broadcast.0) + %reduce.0 = f32[] reduce(f32[64,2]{1,0} %negate, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %reduce.1 = f32[] reduce(f32[64,2]{1,0} %broadcast.0, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %reduce.2 = f32[] reduce(f32[10,2]{1,0} %broadcast.1, f32[] %constant), dimensions={1, 0}, to_apply=%add_float + %add = f32[] add(f32[] %reduce.0, f32[] %reduce.1) + ROOT %add.2 = f32[] add(f32[] %add, f32[] %reduce.2) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + // Only rematerialize buffers which have shaep f32[64, 2]. Buffers with shape + // f32[10, 2] are ignored. + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization( + /*memory_limit_bytes=*/30 * 1024, + module.get(), 10 * 1024)); + EXPECT_TRUE(changed); + HloInstruction* broadcast = + module->entry_computation()->GetInstructionWithName("broadcast.0"); + HloInstruction* broadcast_2 = + module->entry_computation()->GetInstructionWithName("broadcast.1"); + HloInstruction* reduce = + module->entry_computation()->GetInstructionWithName("reduce.1"); + HloInstruction* reduce_2 = + module->entry_computation()->GetInstructionWithName("reduce.2"); + EXPECT_THAT(reduce, + op::Reduce(op::Copy(op::Copy(broadcast)), op::Constant())); + EXPECT_THAT(reduce_2, op::Reduce(broadcast_2, op::Constant())); +} + // Test rematerialization of a single instruction. 
TEST_F(CompressingRematerializationTest, SingleRemat) { const string& hlo_string = R"( diff --git a/tensorflow/compiler/xla/service/hlo_replication_analysis.cc b/tensorflow/compiler/xla/service/hlo_replication_analysis.cc index dec119d8aba..2d1edbefd97 100644 --- a/tensorflow/compiler/xla/service/hlo_replication_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_replication_analysis.cc @@ -129,6 +129,13 @@ bool DetermineHloInstructionIsReplicated( return true; } + if (hlo->opcode() == HloOpcode::kCustomCall && + (hlo->custom_call_target() == "X64SplitLow" || + hlo->custom_call_target() == "X64SplitHigh" || + hlo->custom_call_target() == "X64Combine")) { + return all_operands_replicated(hlo); + } + if (hlo->IsElementwise() || // hlo->opcode() == HloOpcode::kConcatenate || // hlo->opcode() == HloOpcode::kConvolution || // diff --git a/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc index c2d86e808c2..cc0f4c86f4d 100644 --- a/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_replication_analysis_test.cc @@ -501,6 +501,36 @@ ENTRY entry { FindInstruction(module.get(), "conditional"), {1})); } +TEST_F(HloReplicationAnalysisTest, X64SplitCombine) { + const string module_str = R"( +HloModule SimpleTupleSelect + +ENTRY entry { + param = (f64[]) parameter(0) + gte = f64[] get-tuple-element(param), index=0 + param-low = f32[] custom-call(gte), custom_call_target="X64SplitLow" + param-high = f32[] custom-call(gte), custom_call_target="X64SplitHigh" + ROOT result-combine = f64[] custom-call(param-low, param-high), custom_call_target="X64Combine" +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(module_str)); + auto param = module->entry_computation()->parameter_instruction(0); + param->set_parameter_replicated_at_leaf_buffers(absl::Span{true}); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr analysis, + HloReplicationAnalysis::Run( + module.get(), /*cross_partition_spmd=*/false)); + EXPECT_TRUE(analysis->HloInstructionIsReplicatedAt( + FindInstruction(module.get(), "gte"), {})); + EXPECT_TRUE(analysis->HloInstructionIsReplicatedAt( + FindInstruction(module.get(), "param-low"), {})); + EXPECT_TRUE(analysis->HloInstructionIsReplicatedAt( + FindInstruction(module.get(), "param-high"), {})); + EXPECT_TRUE(analysis->HloInstructionIsReplicatedAt( + FindInstruction(module.get(), "result-combine"), {})); +} + TEST_F(HloReplicationAnalysisTest, SimpleTupleSelect) { const string module_str = R"( HloModule SimpleTupleSelect diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 3a5e7ca6f40..0d71c6d49ed 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -211,8 +211,7 @@ static std::vector ExecutionInputsFromScopedShapedBuffers( *buffer_tree.mutable_element(index) = execution_input_buffer; } }); - execution_inputs.emplace_back(std::move(buffer_tree), - input_buffer.on_host_shape()); + execution_inputs.emplace_back(std::move(buffer_tree)); } return execution_inputs; } diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 4244cdaceea..977f6ee8ea6 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -56,6 +56,13 @@ HloSharding HloSharding::PartialTile( HloSharding HloSharding::PartialTile( const Array& 
tile_assignment_last_dim_replicate) { + if (tile_assignment_last_dim_replicate.dimensions().back() == 1) { + auto new_tile_dims = tile_assignment_last_dim_replicate.dimensions(); + new_tile_dims.pop_back(); + auto fully_tiled = tile_assignment_last_dim_replicate; + fully_tiled.Reshape(new_tile_dims); + return HloSharding(fully_tiled); + } std::vector> sorted_groups( tile_assignment_last_dim_replicate.num_elements() / tile_assignment_last_dim_replicate.dimensions().back()); diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc index e1e506b2892..18f76c5253b 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/array.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" @@ -347,13 +348,21 @@ HloSharding GatherOutputSharding(const HloSharding& index_sharding, index_dim++; } } + + if (index_sharding.ReplicateOnLastTileDim()) { + output_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dimensions().back()); + } + Array new_tile_assignment = index_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(output_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(output_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return index_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding GatherIndexSharding(const HloSharding& output_sharding, @@ -379,13 +388,20 @@ HloSharding GatherIndexSharding(const HloSharding& output_sharding, index_tile_assignment_dims.begin() + dnums.index_vector_dim(), 1); } + if (output_sharding.ReplicateOnLastTileDim()) { + index_tile_assignment_dims.push_back( + output_sharding.tile_assignment().dimensions().back()); + } + Array new_tile_assignment = output_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(index_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(index_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return output_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo) { @@ -455,13 +471,19 @@ HloSharding ScatterIndexSharding(const HloSharding& data_sharding, if (index_tile_assignment_dims.size() < hlo->operand(1)->shape().rank()) { index_tile_assignment_dims.push_back(1); } + if (data_sharding.ReplicateOnLastTileDim()) { + index_tile_assignment_dims.push_back( + data_sharding.tile_assignment().dimensions().back()); + } Array new_tile_assignment = data_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(index_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(index_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return data_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ScatterDataSharding(const HloSharding& index_sharding, @@ -481,13 +503,19 @@ HloSharding ScatterDataSharding(const HloSharding& index_sharding, index_dim++; } } + if (index_sharding.ReplicateOnLastTileDim()) { + data_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dimensions().back()); + } Array new_tile_assignment = index_sharding.tile_assignment(); if (new_tile_assignment.num_elements() != Product(data_tile_assignment_dims)) { return HloSharding::Replicate(); } new_tile_assignment.Reshape(data_tile_assignment_dims); - return HloSharding::Tile(new_tile_assignment); + return index_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(new_tile_assignment) + : HloSharding::Tile(new_tile_assignment); } HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, @@ -614,9 +642,15 @@ absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( } passthrough_tile[offset_dim] = dim_partitions; } + if (operand_sharding.ReplicateOnLastTileDim()) { + passthrough_tile.push_back( + operand_sharding.tile_assignment().dimensions().back()); + } Array tile_assignment = operand_sharding.tile_assignment(); tile_assignment.Reshape(passthrough_tile); - return HloSharding::Tile(tile_assignment); + return operand_sharding.ReplicateOnLastTileDim() + ? HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } // Inverse of PassthroughOperandToGatherOutputOrScatterUpdate. @@ -650,12 +684,19 @@ absl::optional PassthroughGatherOutputOrScatterUpdateToOperand( } passthrough_tile[i] = dim_partitions; } + + if (update_or_gather_sharding.ReplicateOnLastTileDim()) { + passthrough_tile.push_back( + update_or_gather_sharding.tile_assignment().dimensions().back()); + } Array tile_assignment = update_or_gather_sharding.tile_assignment(); if (tile_assignment.num_elements() != Product(passthrough_tile)) { return absl::nullopt; } tile_assignment.Reshape(passthrough_tile); - return HloSharding::Tile(tile_assignment); + return update_or_gather_sharding.ReplicateOnLastTileDim() + ? 
HloSharding::PartialTile(tile_assignment) + : HloSharding::Tile(tile_assignment); } } // namespace @@ -776,29 +817,51 @@ IdentityValueAndHloOpcodeForScatterReduceComputation( "add/or/multiply/add/min/max"); } -std::vector DevicesForSharding( - const HloSharding& sharding, const std::vector& available_devices) { - std::vector devices; - if (sharding.IsReplicated()) { - for (int64 d : available_devices) { - if (!HloSharding::IsReservedDevice(d)) { - devices.push_back(d); - } +namespace { + +void DevicesForShardingInternal( + const HloSharding& sharding, + const absl::flat_hash_set& available_devices, + absl::flat_hash_set* used) { + if (sharding.IsTuple()) { + for (const auto& subsharding : sharding.tuple_elements()) { + DevicesForShardingInternal(subsharding, available_devices, used); } - return devices; + return; } - for (int64 i : available_devices) { - if (sharding.UsesDevice(i)) { - devices.push_back(i); + if (sharding.IsReplicated()) { + for (int64 device : available_devices) { + if (!HloSharding::IsReservedDevice(device)) { + used->insert(device); + } + } + return; + } + + DCHECK(std::all_of( + sharding.tile_assignment().begin(), sharding.tile_assignment().end(), + [&](int64 device) { return available_devices.contains(device); })); + sharding.tile_assignment().Each([&](absl::Span /*indices*/, + int64 device) { used->insert(device); }); +} + +} // namespace + +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices) { + absl::flat_hash_set available_set; + for (int64 device : available_devices) { + available_set.insert(device); + } + absl::flat_hash_set used_set; + DevicesForShardingInternal(sharding, available_set, &used_set); + std::vector devices; + for (int64 device : available_devices) { + if (used_set.contains(device)) { + devices.push_back(device); } } - DCHECK(std::all_of(sharding.tile_assignment().begin(), - sharding.tile_assignment().end(), [&](int64 device) { - return std::find(available_devices.begin(), - available_devices.end(), - device) != available_devices.end(); - })); return devices; } diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 0af2a45bfc7..4be0c5259cc 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -801,6 +801,28 @@ Status ShapeVerifier::HandleCustomCall(HloInstruction* instruction) { TF_RET_CHECK(LayoutUtil::HasLayout(operand_shape_with_layout)); } } + for (const auto& pair : custom_call->output_to_operand_aliasing()) { + TF_RET_CHECK(pair.second.first < custom_call->operand_count()) + << "Invalid aliasing operand index."; + TF_RET_CHECK(ShapeUtil::IndexIsValid( + custom_call->operand(pair.second.first)->shape(), pair.second.second)) + << "Invalid aliasing operand shape index."; + TF_RET_CHECK(ShapeUtil::IndexIsValid(custom_call->shape(), pair.first)) + << "Invalid aliasing output shape index."; + const Shape& output_subshape = + ShapeUtil::GetSubshape(custom_call->shape(), pair.first); + const Shape& operand_subshape = ShapeUtil::GetSubshape( + custom_call->operand(pair.second.first)->shape(), pair.second.second); + if (layout_sensitive_) { + TF_RET_CHECK(operand_subshape == output_subshape) + << "Different aliasing shapes: " << operand_subshape.ToString() + << " vs " << output_subshape.ToString(); + } else { + TF_RET_CHECK(ShapeUtil::Compatible(output_subshape, operand_subshape)) + << "Different aliasing shapes: " << operand_subshape.ToString() + << " vs " << 
output_subshape.ToString(); + } + } return Status::OK(); } @@ -1037,7 +1059,7 @@ namespace { // inputs. Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { switch (instruction->opcode()) { - // White list the following opcodes for mixed-precision check, because + // Allow-list the following opcodes for mixed-precision check, because // they involve data pass through or grouping via tuples, where the // precisions of buffers can be different. case HloOpcode::kCall: @@ -1160,6 +1182,7 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction, case HloOpcode::kCopyDone: case HloOpcode::kCopyStart: case HloOpcode::kCustomCall: + case HloOpcode::kDynamicUpdateSlice: case HloOpcode::kGetTupleElement: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 1f71c9586d5..0df30166a1c 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -494,6 +494,28 @@ TEST_F(HloVerifierTest, ScalarIndexDynamicUpdateSlice) { ASSERT_TRUE(status.ok()); } +TEST_F(HloVerifierTestAllowMixedPrecision, DynamicUpdateSliceMixedPrecision) { + const char* const kDynamicUpdateSliceMixedPrecision = R"( + HloModule kDynamicUpdateSliceMixedPrecision + + ENTRY %entry (parameter.0: f32[32,511,2048], parameter.1: bf16[32,511,512], parameter.2: s32[], parameter.3: s32[], parameter.4: s32[]) -> bf16[32,511,2048] { + %parameter.0 = f32[32,511,2048] parameter(0) + %parameter.1 = bf16[32,511,512] parameter(1) + %parameter.2 = s32[] parameter(2) + %parameter.3 = s32[] parameter(3) + %parameter.4 = s32[] parameter(4) + ROOT %dus = bf16[32,511,2048] dynamic-update-slice(f32[32,511,2048] %parameter.0, bf16[32,511,512] %parameter.1, s32[] %parameter.2, s32[] %parameter.3, s32[] %parameter.4) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, ParseAndReturnUnverifiedModule( + kDynamicUpdateSliceMixedPrecision)); + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected instruction to have shape equal to " + "f32[32,511,2048], actual shape is bf16[32,511,2048]")); +} + TEST_F(HloVerifierTestLayoutSensitive, AddWithLayoutChangeNotAllowed) { TF_ASSERT_OK_AND_ASSIGN( auto module, ParseAndReturnUnverifiedModule(kAddWithLayoutChangeHlo)); diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index b290b1bd68b..11472f55792 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -516,11 +516,12 @@ StatusOr InstructionFusion::Run(HloModule* module) { continue; } - VLOG(5) << "Considering fusion of: " << instruction->ToString(); std::vector& sorted_operand_numbers = next_entry.second; for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); + VLOG(5) << "Considering fusion of: " << instruction->ToString() + << " with operand " << operand->name(); if (!operand->IsFusible()) { VLOG(3) << "Operand (" << operand->ToString() << ") is not fusible"; @@ -601,6 +602,9 @@ StatusOr InstructionFusion::Run(HloModule* module) { VLOG(1) << FusionConfigToString(*fusion_config); module->set_config(module_config); } + + reachability_.reset(); + VLOG(1) << "Fusion count: " << fuse_count; return changed; @@ -710,4 +714,23 @@ HloInstruction::FusionKind InstructionFusion::ChooseKind( 
return HloInstruction::FusionKind::kLoop; } +bool InstructionFusion::ReusesOperandElements(const HloInstruction* consumer, + int64 operand_index) { + auto operand = consumer->operand(operand_index); + auto it = reused_fusion_operands_.find(consumer); + if (it != reused_fusion_operands_.end() && it->second.contains(operand)) { + return true; + } + bool reuses = consumer->ReusesOperandElements(operand_index); + // If a parameter was reused, we can cache this information. Fusion + // computations only ever grow, so it becomes more likely that a parameter is + // reused, but a reused parameter will never become *not* reused. + if (reuses) { + // We cache the operand corresponding to the fusion parameter, because the + // parameter pointers would be invalidated after the next fusion. + reused_fusion_operands_[consumer].insert(operand); + } + return reuses; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 90d9da48e33..d51bf700371 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -1,4 +1,3 @@ -#include "absl/container/flat_hash_map.h" /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,6 +19,8 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/fusion_queue.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -138,6 +139,11 @@ class InstructionFusion : public HloModulePass { return config_collection_mode_; } + // Returns whether 'consumer' may reuse elements of its `operand_index`th + // operand. + bool ReusesOperandElements(const HloInstruction* consumer, + int64 operand_index); + private: // The set of producers whose consumers we cannot fuse into. using HloInstructionSet = std::unordered_set; @@ -172,6 +178,11 @@ class InstructionFusion : public HloModulePass { // Configuration mode. FusionConfigCollection config_collection_mode_; + // Caches which operands are reused inside fusion computations. 
+ absl::flat_hash_map> + reused_fusion_operands_; + TF_DISALLOW_COPY_AND_ASSIGN(InstructionFusion); }; diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 3444d4cae42..c134b7ba6a6 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow/core/platform:build_config_root.bzl", "if_static", @@ -52,6 +53,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:layout_assignment", "//tensorflow/compiler/xla/service:map_inliner", + "//tensorflow/compiler/xla/service:qr_expander", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:triangular_solve_expander", "//tensorflow/compiler/xla/service:while_loop_simplifier", @@ -119,9 +121,9 @@ cc_library( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/platform:macros", "//tensorflow/core/platform:mutex", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/core/platform:types", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index a059482d832..3f3e74dbb62 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/interpreter/executable.h" #include "tensorflow/compiler/xla/service/layout_assignment.h" #include "tensorflow/compiler/xla/service/map_inliner.h" +#include "tensorflow/compiler/xla/service/qr_expander.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/triangular_solve_expander.h" #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" @@ -82,6 +83,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { pipeline.AddPass(); pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass( diff --git a/tensorflow/compiler/xla/service/interpreter/executable_base.cc b/tensorflow/compiler/xla/service/interpreter/executable_base.cc index 745750bffe1..00998994c0a 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable_base.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable_base.cc @@ -56,7 +56,7 @@ StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( } for (auto& argument : arguments) { const ShapeTree& buffers = argument.Buffers(); - argument_buffers.push_back(ShapedBuffer(buffers.shape(), buffers.shape(), + argument_buffers.push_back(ShapedBuffer(buffers.shape(), /*platform=*/nullptr, /*device_ordinal=*/device_ordinal)); auto in_it = buffers.begin(); diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 67bfb7da20a..9940b032558 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -1,6 +1,8 @@ # Description: # Libraries for helping construct LLVM IR for XLA backends. 
+load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_test", @@ -156,10 +158,10 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:elemental_ir_emitter", + "//tensorflow/compiler/xla/service:fusion_node_indexing_evaluation", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Core", diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index 0371ce71874..f8514a6cba3 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -17,13 +17,13 @@ limitations under the License. #include #include +#include -#include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/elemental_ir_emitter.h" +#include "tensorflow/compiler/xla/service/fusion_node_indexing_evaluation.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -44,37 +44,32 @@ using llvm_ir::IrArray; Status FusedIrEmitter::DefaultAction(const HloInstruction* hlo) { indexed_generators_[hlo] = [=](const IrArray::Index& index) -> StatusOr { - if (llvm::Value* generated_value = FindOrDefault( - generated_value_cache_[hlo], index.multidim(), nullptr)) { - llvm::BasicBlock* generated_value_bb = nullptr; - if (auto* generated_instruction = - llvm::dyn_cast(generated_value)) { - generated_value_bb = generated_instruction->getParent(); - } - // Ideally, we should be able to reuse the cached generated value if it - // dominates the current insertion block. However, the check for dominance - // can be expensive and unreliable when the function is being constructed. - // - // It's also worth experimenting what if we don't do caching at all. - // LLVM's CSE or GVN should be able to easily merge common subexpressions - // that would be regenerated without caching. But this might increase the - // JIT compilation time. 
- if (generated_value_bb == nullptr || - generated_value_bb == b_->GetInsertBlock()) { + auto cache = generated_value_cache_.find(hlo); + if (cache != generated_value_cache_.end()) { + auto key = std::make_pair(b_->GetInsertBlock(), index.multidim()); + if (llvm::Value* generated_value = + FindOrDefault(cache->second, key, nullptr)) { + VLOG(3) << "The cached generated value is reused."; + return generated_value; + } + auto null_key = std::make_pair(nullptr, index.multidim()); + if (llvm::Value* generated_value = + FindOrDefault(cache->second, null_key, nullptr)) { VLOG(3) << "The cached generated value is reused."; return generated_value; } - VLOG(3) << "The cached generated value can't be reused, because it is in " - "a different BB (" - << generated_value_bb->getName().str() - << ") from the current insertion block (" - << b_->GetInsertBlock()->getName().str() << ")."; } TF_ASSIGN_OR_RETURN(llvm::Value* const generated_value, elemental_emitter_->MakeElementGenerator( hlo, indexed_generators_)(index)); - generated_value_cache_[hlo][index.multidim()] = generated_value; + llvm::BasicBlock* generated_value_bb = nullptr; + if (auto* generated_instruction = + llvm::dyn_cast(generated_value)) { + generated_value_bb = generated_instruction->getParent(); + } + generated_value_cache_[hlo][std::make_pair( + generated_value_bb, index.multidim())] = generated_value; return generated_value; }; return Status::OK(); @@ -214,99 +209,15 @@ bool FusedIrEmitter::IsFusedIrEmitterInefficient( if (consumer->opcode() != HloOpcode::kFusion) { return false; } - // Collects for each instruction in the fusion node from which (indirect) - // users newly created index values are passed. Roughly speaking, we reuse - // index values if the shapes are equal when ignoring the element type (we may - // reuse also if the shape change is a bitcast, but we don't consider that - // here). By ignoring potential reuses our estimate whether the fusion emitter - // is inefficient is a bit more conservative than necessary. - absl::flat_hash_map> - indexing_users; - // Stores the number of different index accesses for each instruction in the - // fusion node. The fusion emitter caches access with the same index, so this - // value indicates how many times a specific instruction will be emitted. - absl::flat_hash_map index_usage_count; - index_usage_count[consumer] = 1; - - auto evaluate_fusion_computation = [&indexing_users, &index_usage_count]( - const HloInstruction* fusion) { - auto postorder = - fusion->fused_instructions_computation()->MakeInstructionPostOrder(); - std::reverse(postorder.begin(), postorder.end()); - for (const auto* instruction : postorder) { - if (instruction->opcode() == HloOpcode::kParameter) { - continue; - } - int64& total = index_usage_count[instruction]; - if (indexing_users[instruction].empty()) { - total = index_usage_count[fusion]; - } else { - total = 0; - for (const auto* user : indexing_users[instruction]) { - int64 weight = 1; - // Concatenate is special: the index differs for each operand, so - // in the worst case we have to deal with as many index values as - // the number of operands of Concatenate. By considering the worst - // case, we are more conservative than necessary regarding - // refusing to fuse. 
- if (user->opcode() == HloOpcode::kConcatenate) { - weight = user->operand_count(); - } - total += index_usage_count[user] * weight; - } - } - for (const auto* operand : instruction->operands()) { - // For simplicity we assume that all shape and layout changing - // operations except Transposes invalidate index reuse. Transposes are - // special: although they are shape changing, we can reuse the - // multi-dimensional index for the operand by permuting it. - if (instruction->opcode() == HloOpcode::kTranspose || - Shape::Equal().IgnoreElementType()(operand->shape(), - instruction->shape())) { - // If the index is reused, it means the operand gets index values - // from the same set of (indirect) users as 'instruction' itself. - indexing_users[operand].insert(indexing_users[instruction].begin(), - indexing_users[instruction].end()); - } else { - // If the index is not reused, it means 'instruction' computes a - // new index derived from the index it gets. - indexing_users[operand].insert(instruction); - } - } - } - }; - evaluate_fusion_computation(consumer); - - // Also account for the 'producer' if it would be fused. Find the operand it - // corresponds to. - for (int64 operand_num = 0; operand_num < consumer->operand_count(); - ++operand_num) { - if (consumer->operand(operand_num) == producer) { - auto instruction = consumer->fused_parameter(operand_num); - int64& total = index_usage_count[producer]; - total = 0; - for (const auto* user : indexing_users[instruction]) { - total += index_usage_count[user]; - } - break; - } + FusionNodeIndexingEvaluation eval_consumer(consumer); + if (producer->opcode() != HloOpcode::kFusion) { + return eval_consumer.CodeDuplicationTooHigh(producer); } - - // If 'producer' is a fusion node as well, also evaluate it. - if (producer->opcode() == HloOpcode::kFusion) { - evaluate_fusion_computation(producer); - } - - // Sum up the total number of emitted ops. - int64 total = 0; - for (const auto& entry : index_usage_count) { - total += entry.second; - } - - // Check that the code duplication has at most a factor of 15 (where 15 is an - // arbitrary constant that seems to work). - return total > 15 * index_usage_count.size(); + // If 'producer' is a fusion node as well, also evaluate it. Pass the + // evaluated duplication of the fusion node if it is merged into consumer. + FusionNodeIndexingEvaluation eval_producer( + producer, eval_consumer.EvaluateEmittedInstructions(producer)); + return eval_producer.MaxCodeDuplicationTooHigh(); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h index d13b0262180..e19e970cb24 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include #include "absl/container/flat_hash_map.h" #include "absl/types/optional.h" @@ -153,9 +154,10 @@ class FusedIrEmitter : public ConstDfsHloVisitorWithDefault { // Cache of generated values, lest we regenerate an element of a node with // multiple outgoing edges - absl::flat_hash_map< - const HloInstruction*, - absl::flat_hash_map, llvm::Value*>> + absl::flat_hash_map>, + llvm::Value*>> generated_value_cache_; }; diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index b6b3b2dd8b3..9d7f06f4f68 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -130,8 +130,14 @@ IrArray::Index LoopEmitter::EmitDynamicIndex(ForLoopNest* loop_nest, } std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( - absl::string_view loop_name, llvm::Type* index_type) { + absl::string_view loop_name, llvm::Type* index_type, + llvm::Value* base_index) { CHECK_NE(index_type, nullptr); + CHECK_EQ(base_index, nullptr) + << "XLA CPU implementation of" + << " LoopEmitter::EmitIndexAndSetExitBasicBlock doesn't support" + << " base_index, but it was requested."; + if (ShapeUtil::IsScalar(shape_)) { // No loop needed, so set exit_bb_ to nullptr. exit_bb_ = nullptr; @@ -164,7 +170,8 @@ Status LoopEmitter::EmitLoop(absl::string_view loop_name, } for (const IrArray::Index& array_index : - EmitIndexAndSetExitBasicBlock(loop_name, index_type)) { + EmitIndexAndSetExitBasicBlock(loop_name, index_type, + /*base_index*/ nullptr)) { TF_RETURN_IF_ERROR(body_emitter_(array_index)); } diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h index 008205a642a..a356741f74b 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h @@ -71,11 +71,13 @@ class LoopEmitter { // specifies the element, will return multiple indices if the loop is // unrolled. std::vector EmitIndexAndSetExitBasicBlock() { - return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"", b_->getInt64Ty()); + return EmitIndexAndSetExitBasicBlock(/*loop_name=*/"", b_->getInt64Ty(), + /*base_index*/ nullptr); } virtual std::vector EmitIndexAndSetExitBasicBlock( - absl::string_view loop_name, llvm::Type* index_type); + absl::string_view loop_name, llvm::Type* index_type, + llvm::Value* base_index); // Emits a complete loop nest for every element in the given shape. Status EmitLoop(absl::string_view loop_name = "", diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index c53f2c19695..5b133a521e3 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -29,6 +29,105 @@ const HeapSimulator::Chunk kDummyChunk{-1, -1}; // pow(kWhileExecutionCount, nesting_level) times. 
const int kWhileExecutionCount = 5; +bool LooksLikeAnActivation(const HloInstruction* inst) { + for (HloInstruction* user : inst->users()) { + switch (user->opcode()) { + case HloOpcode::kConvolution: + case HloOpcode::kDot: + if (user->operand(0) == inst) { + return true; + } + break; + case HloOpcode::kGather: + if (user->operand(1) == inst) { + return true; + } + break; + case HloOpcode::kFusion: + for (int i = 0; i < user->operand_count(); ++i) { + if (user->operand(i) == inst && + LooksLikeAnActivation(user->fused_parameter(i))) { + return true; + } + } + break; + case HloOpcode::kBitcast: + return LooksLikeAnActivation(user); + default: + return true; + } + } + return false; +} + +bool IsCrossProgramPrefetchCandidate( + const HloValue& value, const MemorySpaceAssignment::Options& options) { + return value.instruction()->parent() == + value.instruction()->GetModule()->entry_computation() && + value.instruction()->opcode() == HloOpcode::kParameter && + (!value.shape().has_layout() || + value.shape().layout().memory_space() != + options.alternate_memory_space) && + value.index().size() == 1 && value.shape().IsArray() && + !value.uses().empty() && + options.size_fn(value) <= options.max_size_in_bytes && + absl::c_all_of(value.uses(), [&](const HloUse& use) { + const HloInstruction* inst = + use.instruction->operand(use.operand_number); + + // Skip the LooksLikeAnActivation test since we're testing the + // parent GTE and its children below. + if (inst->opcode() == HloOpcode::kBitcast && + inst->operand(0)->opcode() == HloOpcode::kGetTupleElement && + inst->operand(0)->operand(0)->opcode() == + HloOpcode::kParameter) { + return true; + } + + return inst->opcode() == HloOpcode::kGetTupleElement && + !LooksLikeAnActivation(inst); + }); +} + +absl::optional +FindCrossProgramPrefetchCandidate( + const HloAliasAnalysis& alias_analysis, const HloLiveRange& hlo_live_range, + const MemorySpaceAssignment::Options& options) { + std::vector candidates; + for (const HloBuffer& buffer : alias_analysis.buffers()) { + CHECK_GE(buffer.values().size(), 1); + const HloValue* value = buffer.values().at(0); + if (IsCrossProgramPrefetchCandidate(*value, options)) { + MemorySpaceAssignment::BufferInterval interval; + interval.buffer = value; + interval.size = options.size_fn(*value); + interval.start = 0; + interval.end = hlo_live_range.schedule_end_time(); + interval.need_allocation = true; + interval.colocations = {++buffer.values().begin(), buffer.values().end()}; + candidates.emplace_back(interval); + } + } + + // The buffer_interval_compare ought to do a good job picking the most + // appropriate buffer to cross program prefetch, but empirically, it makes + // worse choices than just picking the largest buffer. + // TODO(b/152421603): Investigate. + auto size_compare = [](const auto& x, const auto& y) { + return x.size < y.size; + }; + auto& compare = options.default_cross_program_prefetch_heuristic && + options.buffer_interval_compare + ? 
*options.buffer_interval_compare + : size_compare; + + auto best_candidate = absl::c_max_element(candidates, compare); + if (best_candidate == candidates.end()) { + return absl::nullopt; + } + return *best_candidate; +} + } // namespace /*static*/ StatusOr> @@ -64,12 +163,16 @@ float MemorySpaceAssignmentCostAnalysis::GetAlternateMemoryBenefit( while_nest_multiplier = it->second; } else { while_nest_multiplier = tensorflow::MathUtil::IPow( - kWhileExecutionCount, CalculateWhileLoopNestLevel(&instruction)); + kWhileExecutionCount, + CalculateComputationNestLevel(&instruction, + /*while_only=*/true)); cache->while_nest_multiplier[&instruction] = while_nest_multiplier; } } else { while_nest_multiplier = tensorflow::MathUtil::IPow( - kWhileExecutionCount, CalculateWhileLoopNestLevel(&instruction)); + kWhileExecutionCount, + CalculateComputationNestLevel(&instruction, + /*while_only=*/true)); } return (elapsed_time_due_to_memory - elapsed_time_due_to_alternate_mem) * while_nest_multiplier; @@ -119,18 +222,14 @@ float MemorySpaceAssignmentCostAnalysis::GetMemoryBoundedness( } } - // Get performance slowdown in seconds of prefetching current BufferInterval - // causing to other BufferIntervals. - float alternate_mem_slowdown = - GetInstructionElapsedDueToMemorySlowdown(interval.size); - - // Divide by the size of the buffer to prioritize smaller buffers that will - // give the largest alternate memory benefit. - return (alternate_mem_benefit - alternate_mem_slowdown) / interval.size; + // Penalize larger buffers by dividing the benefit by the square root of the + // size. Empirically, we observed this resulted in better performance compared + // to dividing by the size. + return alternate_mem_benefit / std::sqrt(interval.size); } -int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( - const HloInstruction* instruction) const { +int MemorySpaceAssignmentCostAnalysis::CalculateComputationNestLevel( + const HloInstruction* instruction, bool while_only) const { int nest_level = 0; const HloComputation* computation = instruction->parent(); while (!computation->IsEntryComputation()) { @@ -138,7 +237,7 @@ int MemorySpaceAssignmentCostAnalysis::CalculateWhileLoopNestLevel( auto callsites = node.caller_callsites(); CHECK_EQ(callsites.size(), 1) << "The module is not flattened!"; auto callsite = callsites[0]; - if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + if (!while_only || callsite.instruction()->opcode() == HloOpcode::kWhile) { ++nest_level; } computation = callsite.instruction()->parent(); @@ -284,6 +383,8 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( float preferred_async_copy_to_overlap_ratio) : while_nest_level_( cost_analysis.hlo_live_range().instruction_schedule().size(), 0), + computation_nest_level_( + cost_analysis.hlo_live_range().instruction_schedule().size(), 0), cost_analysis_(cost_analysis), min_async_copy_to_overlap_ratio_(min_async_copy_to_overlap_ratio), max_async_copy_to_overlap_ratio_(max_async_copy_to_overlap_ratio), @@ -307,9 +408,12 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( instructions_elapsed_time.resize(logical_time + 1, 0.0); while_nest_level_.resize(logical_time + 1, 0); } - int nest_level = cost_analysis_.CalculateWhileLoopNestLevel( - instruction_and_logical_time.first); - while_nest_level_[logical_time] = nest_level; + int while_nest_level = cost_analysis_.CalculateComputationNestLevel( + instruction_and_logical_time.first, /*while_only=*/true); + while_nest_level_[logical_time] = 
while_nest_level; + int computation_nest_level = cost_analysis_.CalculateComputationNestLevel( + instruction_and_logical_time.first, /*while_only=*/false); + computation_nest_level_[logical_time] = computation_nest_level; if (instruction->opcode() == HloOpcode::kWhile || instruction->opcode() == HloOpcode::kConditional) { continue; @@ -317,8 +421,8 @@ CostAnalysisPrefetchIntervalPicker::CostAnalysisPrefetchIntervalPicker( float elapsed_time = cost_analysis_.GetInstructionElapsed( *instruction_and_logical_time.first); instructions_elapsed_time[logical_time] = - elapsed_time * - tensorflow::MathUtil::IPow(kWhileExecutionCount, nest_level); + elapsed_time * tensorflow::MathUtil::IPow(kWhileExecutionCount, + while_nest_level); } // As an optimization, create a cumulative sum vector of elapsed time. float cumsum = 0.0; @@ -388,14 +492,14 @@ int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchStartTime( /*output_in_alternate_mem=*/false); inst_elapsed_reduction = elapsed_time - elapsed_time_in_alternate_mem; } - int end_nest_level = while_nest_level_[end_time]; + int end_nest_level = computation_nest_level_[end_time]; // Find the latest time we're allowed to start prefetching. float min_interval = min_async_copy_to_overlap_ratio_ * async_copy_elapsed; int latest_prefetch_time; for (latest_prefetch_time = end_time - 1; latest_prefetch_time >= start_time && - (while_nest_level_[latest_prefetch_time] != end_nest_level || + (computation_nest_level_[latest_prefetch_time] != end_nest_level || min_interval > GetLogicalIntervalElapsed(latest_prefetch_time, end_time) + inst_elapsed_reduction); @@ -416,13 +520,13 @@ int64 CostAnalysisPrefetchIntervalPicker::PreferredPrefetchStartTime( preferred_async_copy_to_overlap_ratio_ * async_copy_elapsed; float best_interval = GetLogicalIntervalElapsed(earliest_prefetch_start_time, prefetch_end_time); - int end_nest_level = while_nest_level_[prefetch_end_time]; + int end_nest_level = computation_nest_level_[prefetch_end_time]; for (int64 prefetch_start_time = earliest_prefetch_start_time + 1; prefetch_start_time <= latest_prefetch_start_time; ++prefetch_start_time) { float interval = GetLogicalIntervalElapsed(prefetch_start_time, prefetch_end_time); - if (while_nest_level_[prefetch_start_time] == end_nest_level && + if (computation_nest_level_[prefetch_start_time] == end_nest_level && std::abs(preferred_interval - interval) < std::abs(preferred_interval - best_interval)) { best_interval = interval; @@ -436,10 +540,11 @@ int64 CostAnalysisPrefetchIntervalPicker::LatestPrefetchEndTime( int64 original_prefetch_end_time, int64 proposed_prefetch_end_time) const { // Iterate towards the beginning until we find a suitable end time that is the // same while nest level as the original prefetch end time. 
- int64 original_nest_level = while_nest_level_[original_prefetch_end_time]; + int64 original_nest_level = + computation_nest_level_[original_prefetch_end_time]; int64 new_prefetch_end_time; for (new_prefetch_end_time = proposed_prefetch_end_time; - while_nest_level_[new_prefetch_end_time] != original_nest_level; + computation_nest_level_[new_prefetch_end_time] != original_nest_level; --new_prefetch_end_time) { } return new_prefetch_end_time; @@ -460,7 +565,7 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, /*output_in_alternate_mem=*/false); inst_elapsed_reduction_ = elapsed_time - elapsed_time_in_alternate_mem; end_logical_time_ = end_time; - int end_nest_level = while_nest_level_[end_logical_time_]; + int end_nest_level = computation_nest_level_[end_logical_time_]; // Find the latest time we're allowed to start prefetching. float min_interval = min_async_copy_to_overlap_ratio_ * async_copy_elapsed_; @@ -472,7 +577,7 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, max_overlap_multiplier_ * async_copy_elapsed_; for (earliest_prefetch_time_ = start_time; earliest_prefetch_time_ <= end_logical_time_ && - (while_nest_level_[earliest_prefetch_time_] != end_nest_level || + (computation_nest_level_[earliest_prefetch_time_] != end_nest_level || max_interval < GetLogicalIntervalElapsed(earliest_prefetch_time_, end_logical_time_)); ++earliest_prefetch_time_) { @@ -510,8 +615,8 @@ int64 CostAnalysisPrefetchIntervalPicker::Next() { if (using_increasing_prefetch_time_iterator_) { int64 prefetch_time = increasing_prefetch_time_iterator_++; while (increasing_prefetch_time_iterator_ <= latest_prefetch_time_ && - while_nest_level_[increasing_prefetch_time_iterator_] != - while_nest_level_[end_logical_time_]) { + computation_nest_level_[increasing_prefetch_time_iterator_] != + computation_nest_level_[end_logical_time_]) { ++increasing_prefetch_time_iterator_; } if (decreasing_prefetch_time_iterator_ >= earliest_prefetch_time_) { @@ -521,8 +626,8 @@ int64 CostAnalysisPrefetchIntervalPicker::Next() { } else { int64 prefetch_time = decreasing_prefetch_time_iterator_--; while (decreasing_prefetch_time_iterator_ >= earliest_prefetch_time_ && - while_nest_level_[decreasing_prefetch_time_iterator_] != - while_nest_level_[end_logical_time_]) { + computation_nest_level_[decreasing_prefetch_time_iterator_] != + computation_nest_level_[end_logical_time_]) { --decreasing_prefetch_time_iterator_; } if (increasing_prefetch_time_iterator_ <= latest_prefetch_time_) { @@ -566,11 +671,11 @@ float CostAnalysisPrefetchIntervalPicker::GetLogicalIntervalElapsed( // Since elapsed_time_cumsum_ is already weighed by the while loop nesting // level, normalize the elapsed time by dividing with the nesting factor of // the interval (start and end times). 
- int interval_nest_level = GetMinWhileNestLevel(start_time, end_time); + int interval_while_nest_level = GetMinWhileNestLevel(start_time, end_time); return (elapsed_time_cumsum_[end_time - 1] - elapsed_time_cumsum_[start_time]) / tensorflow::MathUtil::IPow(kWhileExecutionCount, - interval_nest_level); + interval_while_nest_level); } std::string CostAnalysisPrefetchIntervalPicker::ToDebugString() const { @@ -713,12 +818,13 @@ void AlternateMemoryBestFitHeap::CreateAllocationValues( } void AlternateMemoryBestFitHeap::FindAliases( - std::vector* allocation_values) const { + std::vector* allocation_values, + bool skip_values_with_no_uses) const { absl::flat_hash_map values_by_defining_inst; for (AllocationValue& value : *allocation_values) { // Skip the value if it doesn't have any uses. - if (value.uses().empty()) { + if (value.uses().empty() && skip_values_with_no_uses) { continue; } CHECK_EQ(values_by_defining_inst.count(value.defining_instruction()), 0); @@ -985,6 +1091,17 @@ void AlternateMemoryBestFitHeap::DumpDebugStringsIfEnabled() const { } HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { + if (options_.enable_cross_program_prefetch) { + absl::optional + prefetch_candidate = FindCrossProgramPrefetchCandidate( + alias_analysis_, hlo_live_range_, options_); + if (prefetch_candidate) { + HloModule* module = + prefetch_candidate->buffer->instruction()->GetModule(); + AllocateCrossProgramPrefetchBuffer(module, prefetch_candidate); + } + } + std::vector sorted_buffer_intervals = GetSortedBufferIntervals(); @@ -1036,6 +1153,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { continue; } + if (interval.size > available_heap_size()) { + VLOG(3) << "Skip " << interval.buffer->ToShortString() + << " because the buffer is larger than the heap size."; + continue; + } + auto colocated_intervals = GetSortedColocatedIntervals(interval); if (AreIntervalsReservedInAlternateMemory(colocated_intervals)) { @@ -1084,6 +1207,7 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { bool repacked = false; for (int retry_number = 0; retry_number < options_.max_retries; retry_number++) { + AddRequiredAssignmentsForColocatedIntervals(colocated_intervals); bool final_retry = (retry_number == options_.max_retries - 1); options_.prefetch_interval_picker->SetRetryNumber(retry_number); Result result = @@ -1094,7 +1218,8 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { (!final_retry && result_failed_because_of_async_copy(result))) { UncommitPendingChunks(absl::MakeSpan(allocation_values)); VLOG(2) << "Couldn't allocate. Retry number " << retry_number; - } else if (result_is(result, Result::kFailOutOfMemory) && + } else if ((result_is(result, Result::kFailOutOfMemory) || + options_.repack_after_every_allocation) && num_repacks_ < options_.max_repacks && !repacked) { UncommitPendingChunks(absl::MakeSpan(allocation_values)); ++num_repacks_; @@ -1128,10 +1253,9 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { return result_; } -void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( +void AlternateMemoryBestFitHeap::AddRequiredAssignmentsForColocatedIntervals( absl::Span - colocated_intervals, - std::vector& allocation_values) { + colocated_intervals) { // TODO(berkin): For now, place the phi values due to conditionals in // default memory. 
for (const BufferInterval* colocated_interval : colocated_intervals) { @@ -1150,12 +1274,17 @@ void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( } } } +} +void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( + absl::Span + colocated_intervals, + std::vector& allocation_values) { // Create AllocationValues for all the colocated intervals. for (const auto& colocated_interval : colocated_intervals) { CreateAllocationValues(*colocated_interval, allocation_values); } - FindAliases(&allocation_values); + FindAliases(&allocation_values, /*skip_values_with_no_uses=*/true); } AlternateMemoryBestFitHeap::Result @@ -1166,7 +1295,7 @@ AlternateMemoryBestFitHeap::AllocateAllocationValues( // Data structure to contain the preferred offset for a given computation. // We ensure that the same offset will be allocated outside the while loop // as well as inside the while loop. - absl::flat_hash_map + absl::flat_hash_map preferred_offset_for_computation; Result result = Result::kSuccess; @@ -1174,7 +1303,7 @@ AlternateMemoryBestFitHeap::AllocateAllocationValues( int64 definition_time = instruction_schedule.at(allocation_value.defining_instruction()); - absl::optional preferred_offset; + AliasedOffset* preferred_offset = nullptr; auto preferred_offset_it = preferred_offset_for_computation.find(allocation_value.computation()); if (preferred_offset_it != preferred_offset_for_computation.end()) { @@ -1273,10 +1402,13 @@ AlternateMemoryBestFitHeap::AllocateAllocationValues( } } - // Bitcasts don't define buffers and don't directly consume buffers. Skip - // allocating buffers for bitcast uses. The uses that feed from bitcasts - // will be handled specially. - if (hlo_use.instruction->opcode() != HloOpcode::kBitcast) { + // Bitcasts don't define buffers and don't directly consume buffers. Skip + // allocating buffers for bitcast uses (unless they are the root + // instruction). The uses that feed from bitcasts will be handled + // specially. 
+ if (hlo_use.instruction->opcode() != HloOpcode::kBitcast || + hlo_use.instruction == + hlo_use.instruction->parent()->root_instruction()) { AllocationRequest request; // Rarely, (e.g., when conditional true and false parameters are the // same), definition time can be the time of the conditional and use @@ -1319,7 +1451,7 @@ AlternateMemoryBestFitHeap::AllocateAllocationValues( if (hlo_use.instruction->opcode() == HloOpcode::kWhile && aliased_allocation->memory_space() == MemorySpace::kAlternate) { preferred_offset_for_computation[hlo_use.instruction->while_body()] = - aliased_allocation->chunk().offset; + GetAliasedOffset(*aliased_allocation); } } } @@ -1360,6 +1492,28 @@ absl::optional AsynchronousCopyOrdering::ViolatesOrdering( return absl::nullopt; } +AlternateMemoryBestFitHeap::AliasedOffset* +AlternateMemoryBestFitHeap::GetAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation) { + auto aliased_offset_it = aliased_offset_map_.find(&allocation); + CHECK(aliased_offset_it != aliased_offset_map_.end()); + return aliased_offset_it->second; +} + +void AlternateMemoryBestFitHeap::CreateOrAddToAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation, + AlternateMemoryBestFitHeap::AliasedOffset* aliased_offset) { + CHECK(allocation.memory_space() == MemorySpace::kAlternate); + CHECK(!aliased_offset_map_.contains(&allocation)); + if (!aliased_offset) { + aliased_offsets_.push_back({allocation.chunk().offset}); + aliased_offset = &aliased_offsets_.back(); + } + CHECK_EQ(allocation.chunk().offset, aliased_offset->offset); + CHECK(aliased_offset->allocations.insert(&allocation).second); + aliased_offset_map_[&allocation] = aliased_offset; +} + /*static*/ MemorySpaceAssignment::Allocation* AlternateMemoryBestFitHeap::GetLiveAllocationAt( const MemorySpaceAssignment::AllocationSequence& allocations, int64 time) { @@ -1435,10 +1589,11 @@ void AlternateMemoryBestFitHeap::AllocateCrossProgramPrefetchBuffer( AddAsyncCopy(*allocations.back(), MemorySpace::kAlternate, chunk_candidate.chunk, prefetch_candidate->start, cross_program_prefetch_end_time, latest_prefetch_time, - &allocations, + &allocations, /*aliased_offset=*/nullptr, /*is_cross_program_prefetch=*/true); absl::c_for_each(uses, [&](auto& use) { allocations.back()->AddUse(use); }); - int64 cross_program_prefetch_offset = allocations.back()->chunk().offset; + AliasedOffset* cross_program_prefetch_offset = + GetAliasedOffset(*allocations.back()); if (free_buffer) { VLOG(2) << "Adding an end-of-program prefetch for freed " @@ -1446,8 +1601,10 @@ void AlternateMemoryBestFitHeap::AllocateCrossProgramPrefetchBuffer( AddAsyncCopy(*allocations.front(), MemorySpace::kAlternate, chunk_candidate.chunk, end_of_program_prefetch_start_time, end_of_program_prefetch_end_time, - end_of_program_prefetch_end_time, &allocations); - CHECK_EQ(cross_program_prefetch_offset, allocations.back()->chunk().offset); + end_of_program_prefetch_end_time, &allocations, + cross_program_prefetch_offset); + CHECK_EQ(cross_program_prefetch_offset->offset, + allocations.back()->chunk().offset); } for (auto& allocation : allocations) { @@ -1477,7 +1634,7 @@ void AlternateMemoryBestFitHeap::AllocateCrossProgramPrefetchBuffer( ClearPendingChunks(); } -absl::optional +absl::optional AlternateMemoryBestFitHeap::RequiredMemoryAssignmentAt(const HloValue* buffer, int64 time) const { auto required_assignment_it = required_assignments_.find(buffer); @@ -1495,7 +1652,7 @@ AlternateMemoryBestFitHeap::RequiredMemoryAssignmentAt(const HloValue* buffer, return 
required_assignment_at_time; } -absl::optional +absl::optional AlternateMemoryBestFitHeap::AliasedRequiredAssignmentForUse( const AllocationValue::Use& use) const { absl::optional required_assignment; @@ -1521,26 +1678,26 @@ AlternateMemoryBestFitHeap::AliasedRequiredAssignmentForUse( void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation) { - absl::optional chunk; + AliasedOffset* offset = nullptr; if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { - chunk = aliased_allocation->chunk(); + offset = GetAliasedOffset(*aliased_allocation); } AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), - chunk); + offset); } void AlternateMemoryBestFitHeap::AddRequiredAssignment( const HloValue* value, const HloInstruction* instruction, MemorySpaceAssignment::MemorySpace memory_space, int64 time, - absl::optional chunk) { + AliasedOffset* offset) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { CHECK(memory_space == existing_required_assignment->memory_space) << "inst = " << instruction->ToString() << " at " << time; - CHECK((!chunk && !existing_required_assignment->chunk) || - chunk->offset == existing_required_assignment->chunk->offset); + CHECK((!offset && !existing_required_assignment->offset) || + offset == existing_required_assignment->offset); VLOG(3) << "Not adding required assignment because there is one already: " << value->ToShortString() << " at " << time << " at " << (memory_space == MemorySpace::kDefault ? "def" : "alt"); @@ -1548,7 +1705,7 @@ void AlternateMemoryBestFitHeap::AddRequiredAssignment( VLOG(3) << "Adding required assignment: " << value->ToShortString() << " at " << time << " at " << (memory_space == MemorySpace::kDefault ? "def" : "alt"); - RequiredMemoryAssignment required_assignment{memory_space, time, chunk}; + RequiredMemoryAssignment required_assignment{memory_space, time, offset}; required_assignments_[value].push_back(required_assignment); pending_required_assignments_.push_back({value, required_assignment}); } @@ -1556,13 +1713,13 @@ void AlternateMemoryBestFitHeap::AddRequiredAssignment( void AlternateMemoryBestFitHeap::AddRequiredAssignment( const HloInstruction* instruction, ShapeIndex index, - MemorySpace memory_space, absl::optional chunk) { + MemorySpace memory_space, AliasedOffset* offset) { const HloValue* value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); int64 instruction_time = hlo_live_range_.instruction_schedule().at(instruction); AddRequiredAssignment(value, instruction, memory_space, instruction_time, - chunk); + offset); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -1711,8 +1868,8 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks( ? "def" : "alt") << " time = " << required_assignment.time << " off = " - << (required_assignment.chunk ? required_assignment.chunk->offset - : -1); + << (required_assignment.offset ? 
required_assignment.offset->offset + : -1); for (auto it = required_assignment_vector.begin(); it != required_assignment_vector.end(); ++it) { if (*it == value_and_required_assignment.second) { @@ -1726,7 +1883,8 @@ void AlternateMemoryBestFitHeap::UncommitPendingChunks( void AlternateMemoryBestFitHeap::FinalizeAllocations( absl::Span allocation_values) { - absl::flat_hash_map> + absl::flat_hash_map> colocation_map; for (AllocationValue& allocation_value : allocation_values) { for (auto& allocation : *allocation_value.allocation_sequence()) { @@ -1736,12 +1894,12 @@ void AlternateMemoryBestFitHeap::FinalizeAllocations( MemorySpaceAssignment::Allocation* inserted_allocation = allocations_->back().get(); if (inserted_allocation->memory_space() == MemorySpace::kAlternate) { - colocation_map[inserted_allocation->chunk().offset].push_back( + colocation_map[GetAliasedOffset(*inserted_allocation)].push_back( inserted_allocation); } } } - // Assume allocations that received the same offset need to be colocated. + // The allocations that have the same AliasedOffset need to be colocated. // Export these to repack_allocation_blocks_ so that we can repack them to // reduce fragmentation. for (auto& colocation : colocation_map) { @@ -1768,6 +1926,8 @@ void AlternateMemoryBestFitHeap::ClearPendingChunks() { pending_chunks_.clear(); pending_async_copies_.clear(); pending_required_assignments_.clear(); + aliased_offset_map_.clear(); + aliased_offsets_.clear(); } void AlternateMemoryBestFitHeap::AddToPendingChunks( @@ -1843,15 +2003,25 @@ AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::AllocateSegment( const auto& prev_allocation = allocation_sequence->back(); CHECK(prev_allocation->memory_space() == required_assignment_at_start->memory_space); - CHECK_EQ(prev_allocation->chunk().offset, - required_assignment_at_start->chunk->offset); + CHECK_EQ(GetAliasedOffset(*prev_allocation), + required_assignment_at_start->offset); prev_allocation->Extend(request.start_time); } else { + absl::optional aliased_chunk = absl::nullopt; + if (required_assignment_at_start->memory_space == + MemorySpace::kAlternate) { + aliased_chunk = + Chunk{required_assignment_at_start->offset->offset, request.size}; + } allocation_sequence->push_back( absl::make_unique( defining_position, required_assignment_at_start->memory_space, - required_assignment_at_start->chunk, request.start_time, - request.start_time)); + aliased_chunk, request.start_time, request.start_time)); + if (required_assignment_at_start->memory_space == + MemorySpace::kAlternate) { + CreateOrAddToAliasedOffset(*allocation_sequence->back(), + required_assignment_at_start->offset); + } } } @@ -1935,7 +2105,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( MemorySpace memory_space, absl::optional chunk, int64 start_time, int64 end_time, int64 copy_done_schedule_before_time, MemorySpaceAssignment::AllocationSequence* allocations, - bool is_cross_program_prefetch) { + AliasedOffset* aliased_offset, bool is_cross_program_prefetch) { VLOG(3) << "Copy to " << (memory_space == MemorySpaceAssignment::MemorySpace::kDefault ? 
"default" @@ -1957,6 +2127,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( prefetch_interval_tree_.Add(start_time, copy_done_schedule_before_time, kDummyChunk); async_copy_ordering_.AddCopy(pending_async_copies_.back()); + CreateOrAddToAliasedOffset(*allocations->back(), aliased_offset); } else { eviction_interval_tree_.Add(start_time, copy_done_schedule_before_time, kDummyChunk); @@ -2033,9 +2204,9 @@ AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( alternate_mem_interval.start = request.start_time; // Prefer the offset that was previously used for the previous allocation. - absl::optional preferred_offset; + AliasedOffset* preferred_offset = nullptr; if (prev_allocation != nullptr) { - preferred_offset = prev_allocation->chunk().offset; + preferred_offset = GetAliasedOffset(*prev_allocation); // If there is a previous allocation, set the start time one after the end // of the previous allocation's end. alternate_mem_interval.start = prev_allocation->end_time() + 1; @@ -2045,13 +2216,13 @@ AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( // Sanity check that if there is a preferred offset provided in the request, // it matches with the previous allocation. CHECK(!preferred_offset || request.preferred_offset == preferred_offset) - << "preferred_offset = " << *preferred_offset - << ", request.preferred_offset = " << *request.preferred_offset; + << "preferred_offset = " << preferred_offset->offset + << ", request.preferred_offset = " << request.preferred_offset->offset; preferred_offset = request.preferred_offset; } VLOG(3) << "We can eliminate copy to alternate memory. Preferred offset = " - << (preferred_offset ? *preferred_offset : -1); + << (preferred_offset ? preferred_offset->offset : -1); // In case there are additional uses after this use, we rely on the last use // time to try to reserve a chunk in the heap simulator. 
This is to prevent // the following scenario: @@ -2099,6 +2270,9 @@ AlternateMemoryBestFitHeap::AllocateInAlternateMemoryNoCopy( absl::make_unique( defining_position, MemorySpace::kAlternate, chunk_candidate->chunk, request.start_time, request.end_time)); + CreateOrAddToAliasedOffset( + *request.allocation_value->allocation_sequence()->back(), + preferred_offset); } request.allocation_value->allocation_sequence()->back()->AddUse( request.use->hlo_use); @@ -2162,7 +2336,8 @@ AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Evict( AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, /*chunk=*/absl::nullopt, eviction_start_time, prev_allocation->end_time(), eviction_end_time, - request.allocation_value->allocation_sequence()); + request.allocation_value->allocation_sequence(), + /*aliased_offset=*/nullptr); } else { if (eviction_violates_outstanding_copies) { VLOG(3) << "This violates the maximum async copies."; @@ -2180,7 +2355,8 @@ AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Evict( VLOG(3) << "Eviction successful."; AddAsyncCopy(*prev_allocation, MemorySpace::kDefault, /*chunk=*/absl::nullopt, time, time + 1, time + 1, - request.allocation_value->allocation_sequence()); + request.allocation_value->allocation_sequence(), + /*aliased_offset=*/nullptr); eviction_scheduled = true; break; } @@ -2332,7 +2508,8 @@ AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Prefetch( AddAsyncCopy(prev_allocation_in_default_mem, MemorySpace::kAlternate, chunk_candidate->chunk, alternate_mem_interval.start, request.end_time, prefetch_end_time, - request.allocation_value->allocation_sequence()); + request.allocation_value->allocation_sequence(), + request.preferred_offset); request.allocation_value->allocation_sequence()->back()->AddUse( request.use->hlo_use); @@ -2351,7 +2528,7 @@ AlternateMemoryBestFitHeap::Result AlternateMemoryBestFitHeap::Prefetch( absl::optional AlternateMemoryBestFitHeap::FindBestChunkCandidate( - const AllocationRequest& request, absl::optional preferred_offset, + const AllocationRequest& request, const AliasedOffset* preferred_offset, BufferInterval* alternate_mem_interval) const { int64 end_time = request.end_time; if (!preferred_offset) { @@ -2397,8 +2574,8 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( // only. 
alternate_mem_interval->end = end_time; ChunkCandidate chunk_candidate = - FindChunkCandidate(*alternate_mem_interval, *preferred_offset); - if (chunk_candidate.chunk.offset == *preferred_offset) { + FindChunkCandidate(*alternate_mem_interval, preferred_offset->offset); + if (chunk_candidate.chunk.offset == preferred_offset->offset) { return chunk_candidate; } return absl::nullopt; @@ -2457,107 +2634,6 @@ MemorySpaceAssignment::GetMemoryBoundednessBufferIntervalCompare( }; } -namespace { - -bool LooksLikeAnActivation(const HloInstruction* inst) { - for (HloInstruction* user : inst->users()) { - switch (user->opcode()) { - case HloOpcode::kConvolution: - case HloOpcode::kDot: - if (user->operand(0) == inst) { - return true; - } - break; - case HloOpcode::kGather: - if (user->operand(1) == inst) { - return true; - } - break; - case HloOpcode::kFusion: - for (int i = 0; i < user->operand_count(); ++i) { - if (user->operand(i) == inst && - LooksLikeAnActivation(user->fused_parameter(i))) { - return true; - } - } - break; - case HloOpcode::kBitcast: - return LooksLikeAnActivation(user); - default: - return true; - } - } - return false; -} - -bool IsCrossProgramPrefetchCandidate( - const HloValue& value, const MemorySpaceAssignment::Options& options) { - return value.instruction()->parent() == - value.instruction()->GetModule()->entry_computation() && - value.instruction()->opcode() == HloOpcode::kParameter && - (!value.shape().has_layout() || - value.shape().layout().memory_space() != - options.alternate_memory_space) && - value.index().size() == 1 && value.shape().IsArray() && - !value.uses().empty() && - options.size_fn(value) <= options.max_size_in_bytes && - absl::c_all_of(value.uses(), [&](const HloUse& use) { - const HloInstruction* inst = - use.instruction->operand(use.operand_number); - - // Skip the LooksLikeAnActivation test since we're testing the - // parent GTE and its children below. - if (inst->opcode() == HloOpcode::kBitcast && - inst->operand(0)->opcode() == HloOpcode::kGetTupleElement && - inst->operand(0)->operand(0)->opcode() == - HloOpcode::kParameter) { - return true; - } - - return inst->opcode() == HloOpcode::kGetTupleElement && - !LooksLikeAnActivation(inst); - }); -} - -absl::optional -FindCrossProgramPrefetchCandidate( - const HloAliasAnalysis& alias_analysis, const HloLiveRange& hlo_live_range, - const MemorySpaceAssignment::Options& options) { - std::vector candidates; - for (const HloBuffer& buffer : alias_analysis.buffers()) { - CHECK_GE(buffer.values().size(), 1); - const HloValue* value = buffer.values().at(0); - if (IsCrossProgramPrefetchCandidate(*value, options)) { - MemorySpaceAssignment::BufferInterval interval; - interval.buffer = value; - interval.size = options.size_fn(*value); - interval.start = 0; - interval.end = hlo_live_range.schedule_end_time(); - interval.need_allocation = true; - interval.colocations = {++buffer.values().begin(), buffer.values().end()}; - candidates.emplace_back(interval); - } - } - - // The buffer_interval_compare ought to do a good job picking the most - // appropriate buffer to cross program prefetch, but empirically, it makes - // worse choices than just picking the largest buffer. - // TODO(b/152421603): Investigate. - auto size_compare = [](const auto& x, const auto& y) { - return x.size < y.size; - }; - auto& compare = options.default_cross_program_prefetch_heuristic && - options.buffer_interval_compare - ? 
*options.buffer_interval_compare - : size_compare; - - auto best_candidate = absl::c_max_element(candidates, compare); - if (best_candidate == candidates.end()) { - return absl::nullopt; - } - return *best_candidate; -} -} // namespace /*static*/ StatusOr> MemorySpaceAssignment::Run(HloModule* module, @@ -2608,13 +2684,6 @@ Status MemorySpaceAssignment::FindAllocationSequence( auto algorithm = absl::make_unique( &allocations_, options_, alias_analysis, hlo_live_range); - if (options_.enable_cross_program_prefetch) { - absl::optional - prefetch_candiate = FindCrossProgramPrefetchCandidate( - alias_analysis, hlo_live_range, options_); - algorithm->AllocateCrossProgramPrefetchBuffer(module_, prefetch_candiate); - } - HeapSimulator::Options heap_simulator_options; heap_simulator_options.may_reuse_operand_buffers = false; TF_RETURN_IF_ERROR(HeapSimulator::Run(std::move(algorithm), *module_, @@ -2747,15 +2816,21 @@ HloInstruction* MemorySpaceAssignment::Allocation::AddGetTupleElements() { } std::string MemorySpaceAssignment::Allocation::ToString() const { - return absl::StrCat("Allocation in ", - memory_space_ == MemorySpace::kDefault ? "def" : "alt", - " defined at ", defining_position_.ToString()); + std::string memory_space_str = "def"; + if (memory_space_ == MemorySpace::kAlternate) { + memory_space_str = absl::StrCat("alt (off: ", chunk_->offset, ")"); + } + return absl::StrCat("Allocation in ", memory_space_str, " defined at ", + defining_position_.ToString()); } std::string MemorySpaceAssignment::CopyAllocation::ToString() const { - return absl::StrCat("Copy Allocation in ", - memory_space_ == MemorySpace::kDefault ? "def" : "alt", - " from ", prev_allocation_.ToString()); + std::string memory_space_str = "def"; + if (memory_space_ == MemorySpace::kAlternate) { + memory_space_str = absl::StrCat("alt (off: ", chunk_->offset, ")"); + } + return absl::StrCat("Copy Allocation in ", memory_space_str, " from ", + prev_allocation_.ToString()); } Status MemorySpaceAssignment::CopyAllocation::Process( @@ -3285,6 +3360,7 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { last_use_instruction, parameter_time, last_use_time, absl::StrCat(indent_string, " "))); } else { + last_use_time = std::min(last_use_time, end_time); TF_RETURN_IF_ERROR(add_allocation_and_verify( parameter_time, last_use_time, chunk, value)); } @@ -3303,12 +3379,13 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { TF_RETURN_IF_ERROR(split_conditional_buffer( last_use_instruction, time_bound.start, time_bound.end, " ")); } else if (!value->uses().empty()) { + last_use_time = std::min(last_use_time, time_bound.end); VLOG(3) << " buffer: " << buffer.ToString() << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end + << time_bound.start << ", " << last_use_time << ") off: " << chunk.offset << ", size: " << chunk.size; TF_RETURN_IF_ERROR(add_allocation_and_verify( - time_bound.start, time_bound.end, chunk, value)); + time_bound.start, last_use_time, chunk, value)); } } } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index 04737663424..cb459c68be1 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -149,9 +149,11 @@ class MemorySpaceAssignmentCostAnalysis { int64 GetScheduleEndTime() const; - // Returns the number of nested while loop levels this instruction resides in. 
- // 0 means it is not in a while loop. - int CalculateWhileLoopNestLevel(const HloInstruction* instruction) const; + // Returns the number of nested computation levels this instruction resides + // in. If while_only is true, it returns the while loop nest level and 0 + // means the instruction is not in a while loop. + int CalculateComputationNestLevel(const HloInstruction* instruction, + bool while_only) const; const HloLiveRange& hlo_live_range() const { return *hlo_live_range_; } @@ -360,6 +362,7 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { // (in cumulative sum) and while nesting level. std::vector elapsed_time_cumsum_; std::vector while_nest_level_; + std::vector computation_nest_level_; // Maintain the index of the most recent (before this instruction) nest level // change in order to efficiently determine the minimum nest level in an // interval. @@ -376,7 +379,7 @@ class CostAnalysisPrefetchIntervalPicker : public PrefetchIntervalPicker { int64 end_logical_time_; int64 earliest_prefetch_time_; int64 latest_prefetch_time_; - bool using_increasing_prefetch_time_iterator_; + bool using_increasing_prefetch_time_iterator_ = true; int64 increasing_prefetch_time_iterator_; int64 decreasing_prefetch_time_iterator_; }; @@ -459,6 +462,9 @@ class MemorySpaceAssignment { // max_repacks is greater than 0. MemorySpaceAssignmentRepacker* repacker = nullptr; + // This is only useful for testing, repack after every allocation. + bool repack_after_every_allocation = false; + // If true, tries allocating buffers across (e.g., before and inside a while // loop body) sequential calls (kWhile, kCall, and kConditional). bool allocate_across_sequential_calls = false; @@ -728,6 +734,16 @@ class MemorySpaceAssignment { // All the positions where this use aliases with. The aliased positions // must get the same allocation. std::vector aliases; + + bool operator==(const Use& other) const { + return hlo_use == other.hlo_use && time == other.time && + aliases == other.aliases; + } + + template + friend H AbslHashValue(H h, const Use& s) { + return H::combine(std::move(h), s.hlo_use, s.time, s.aliases); + } }; AllocationValue(const HloValue* value, const HloPosition& position, @@ -823,6 +839,8 @@ class MemorySpaceAssignment { AllocationSequence allocations_; + HloModule* module() { return module_; } + private: // Process calls Process methods of the allocations after the allocations have // been finalized. @@ -871,29 +889,6 @@ class MemorySpaceAssignment { absl::flat_hash_map> schedule_before_; }; -// This struct contains mandatory memory assignments at a given time. E.g., an -// input's required memory assignment time would correspond to the definition -// time of the parameter instruction, and an output's time would correspond to -// the time of last use. -struct RequiredMemoryAssignment { - MemorySpaceAssignment::MemorySpace memory_space; - int64 time; - absl::optional chunk; - - bool equals_ignoring_time(const RequiredMemoryAssignment& other) const { - return memory_space == other.memory_space && chunk == other.chunk; - } - - bool operator==(const RequiredMemoryAssignment& other) const { - return memory_space == other.memory_space && time == other.time && - chunk == other.chunk; - } - - bool operator!=(const RequiredMemoryAssignment& other) const { - return !(*this == other); - } -}; - // A struct representing an asynchronous copy with its logical start and end // time and its destination memory space. 
struct AsynchronousCopy { @@ -972,6 +967,38 @@ class AlternateMemoryBestFitHeap HeapSimulator::Result Finish() override; + protected: + // Given a buffer interval, returns the colocated intervals. Unlike the + // similar GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations, it + // returns the colocated intervals sorted by scheduled time. + std::vector GetSortedColocatedIntervals( + const BufferInterval& interval) const; + + // Given a BufferInterval, creates AllocationValue objects and corresponding + // AllocationSequences and appends them into allocation_sequence_list_. + void CreateAllocationValues( + const BufferInterval& buffer_interval, + std::vector& allocation_values) const; + + // Given colocated intervals, populates allocation_values with the + // corresponding AllocationValue objects. + void CreateAllocationValuesFromColocatedIntervals( + absl::Span + colocated_intervals, + std::vector& allocation_values); + + // Go through all the uses in the AllocationValues and find the aliasing + // positions. + void FindAliases(std::vector* allocation_values, + bool skip_values_with_no_uses) const; + + MemorySpaceAssignment::AllocationSequence* allocations() { + return allocations_; + } + const MemorySpaceAssignment::Options& options() { return options_; } + const HloAliasAnalysis& alias_analysis() { return alias_analysis_; } + const HloLiveRange& hlo_live_range() { return hlo_live_range_; } + private: // We inherit AllocationBlock struct to attach the Allocation information to // make importing repacked offsets easier. @@ -980,6 +1007,13 @@ class AlternateMemoryBestFitHeap MemorySpaceAssignment::Allocation* allocation; }; + // A data structure we use to associate Allocation objects that are aliased + // and must get the same offset. + struct AliasedOffset { + int64 offset; + absl::flat_hash_set allocations; + }; + // An allocation request for a use segment. A use segment is the time segment // between the definition and the first use, and the time segment between the // uses of a buffer. For example, the time between the definition and Use1, is @@ -1007,11 +1041,34 @@ class AlternateMemoryBestFitHeap int64 size; bool allow_no_copy_alternate_mem_allocation; absl::optional earliest_prefetch_time; - absl::optional preferred_offset; + AliasedOffset* preferred_offset; const MemorySpaceAssignment::AllocationValue::Use* use; MemorySpaceAssignment::AllocationValue* allocation_value; }; + // This struct contains mandatory memory assignments at a given time. E.g., an + // input's required memory assignment time would correspond to the definition + // time of the parameter instruction, and an output's time would correspond to + // the time of last use. + struct RequiredMemoryAssignment { + MemorySpaceAssignment::MemorySpace memory_space; + int64 time; + AliasedOffset* offset; + + bool equals_ignoring_time(const RequiredMemoryAssignment& other) const { + return memory_space == other.memory_space && offset == other.offset; + } + + bool operator==(const RequiredMemoryAssignment& other) const { + return memory_space == other.memory_space && time == other.time && + offset == other.offset; + } + + bool operator!=(const RequiredMemoryAssignment& other) const { + return !(*this == other); + } + }; + // Result of an allocation, prefetch, eviction etc. request. The result is // either kSuccess or a bitwise OR of one or more failures. The values are // unique powers of two. 
To check if a result contains a particular failure, @@ -1068,6 +1125,17 @@ class AlternateMemoryBestFitHeap result_is(result, Result::kFailViolatesAsyncCopyOrdering); } + // Returns the AliasedOffset object associated with the allocation. + AliasedOffset* GetAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation); + + // If aliased_offset is non-null, this method adds the allocation to + // aliased_offset. Otherwise, it creates a new AliasedOffset object and adds + // the allocation to this new AliasedOffset. + void CreateOrAddToAliasedOffset( + const MemorySpaceAssignment::Allocation& allocation, + AliasedOffset* aliased_offset); + // Given an allocation sequence, returns the live allocation at time with a // preference towards allocations in alternate memory. Returns nullptr if no // allocation is alive at that time. @@ -1078,18 +1146,6 @@ class AlternateMemoryBestFitHeap bool IsUseAllowedInAlternateMemory(const AllocationValue& value, const HloUse& use) const; - // Given a BufferInterval, creates AllocationValue objects and corresponding - // AllocationSequences and appends them into allocation_sequence_list_. - void CreateAllocationValues( - const BufferInterval& buffer_interval, - std::vector& allocation_values) const; - - // Given colocated intervals, populates allocation_values with the - // corresponding AllocationValue objects. - void CreateAllocationValuesFromColocatedIntervals( - absl::Span colocated_intervals, - std::vector& allocation_values); - // Finds allocations for allocation values generated from colocated intervals. // All of the allocation values have a must-alias relationship with each // other. Returns either kSuccess if all of the sites could be placed in the @@ -1097,10 +1153,6 @@ class AlternateMemoryBestFitHeap Result AllocateAllocationValues( absl::Span allocation_values); - // Go through all the uses in the AllocationValues and find the aliasing - // positions. - void FindAliases(std::vector* allocation_values) const; - // Finds an allocation for an allocation request for a segment (see the // documentation for AllocationRequest above how a segment is defined). // @@ -1140,7 +1192,7 @@ class AlternateMemoryBestFitHeap // availability if no preferred offset is given, or at the preferred_offset if // it is given. absl::optional FindBestChunkCandidate( - const AllocationRequest& request, absl::optional preferred_offset, + const AllocationRequest& request, const AliasedOffset* preferred_offset, BufferInterval* alternate_mem_interval) const; // Returns the required assignment at a particular time, if available. @@ -1152,6 +1204,11 @@ class AlternateMemoryBestFitHeap absl::optional AliasedRequiredAssignmentForUse( const AllocationValue::Use& use) const; + // Goes through the colocated intervals and adds any required assignment. + void AddRequiredAssignmentsForColocatedIntervals( + absl::Span + colocated_intervals); + // Propagates aliased required assignment for a given position. void AddAliasedRequiredAssignment( const HloInstruction* instruction, ShapeIndex index, @@ -1162,10 +1219,10 @@ class AlternateMemoryBestFitHeap void AddRequiredAssignment(const HloValue* value, const HloInstruction* instruction, MemorySpace memory_space, int64 time, - absl::optional chunk = absl::nullopt); + AliasedOffset* offset = nullptr); void AddRequiredAssignment(const HloInstruction* instruction, ShapeIndex index, MemorySpace memory_space, - absl::optional chunk = absl::nullopt); + AliasedOffset* offset = nullptr); // Adds input and outputs as required assignments. 
void AddInputAndOutputRequiredAssignments(); @@ -1176,12 +1233,6 @@ class AlternateMemoryBestFitHeap bool AreIntervalsReservedInAlternateMemory( absl::Span colocated_intervals) const; - // Given a buffer interval, returns the colocated intervals. Unlike the - // similar GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations, it - // returns the colocated intervals sorted by scheduled time. - std::vector GetSortedColocatedIntervals( - const BufferInterval& interval) const; - // Since the allocations are recorded to the AllocationSequence, we don't // maintain result_ in GlobalDecreasingSizeBestFitHeap. Override AddToChunkMap // to avoid unnecessarily adding the chunk to the chunk map. @@ -1216,6 +1267,7 @@ class AlternateMemoryBestFitHeap int64 start_time, int64 end_time, int64 copy_done_schedule_before_time, MemorySpaceAssignment::AllocationSequence* allocations, + AliasedOffset* aliased_offset, bool is_cross_program_prefetch = false); // This method is used for committing the chunk candidate but adding it to @@ -1284,6 +1336,11 @@ class AlternateMemoryBestFitHeap std::vector pending_async_copies_; std::vector> pending_required_assignments_; + // The data structure that contains AliasedOffset objects and Allocation to + // AliasedOffset map for efficient lookup. + std::list aliased_offsets_; + absl::flat_hash_map + aliased_offset_map_; // This map contains required memory assignments for HloValues (e.g., input // and outputs). absl::flat_hash_map> diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index cc4f740bc25..187076abe8a 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -232,6 +232,24 @@ class MemorySpaceAssignmentTest : public HloTestBase, return copies; } + int64 GetAlternateMemoryOffset(const PresetAssignments& preset_assignments, + const HloInstruction* instruction, + const ShapeIndex& index = {}) const { + // Returns the offset of the assignment, -1 if it's not in the alternate + // memory. + const HloModule* module = instruction->parent()->parent(); + auto alias_analysis = HloAliasAnalysis::Run(module).ValueOrDie(); + HloBuffer& buffer = alias_analysis->GetUniqueBufferAt(instruction, index); + for (auto& pos_and_chunk : preset_assignments.chunks()) { + for (auto& value : buffer.values()) { + if (pos_and_chunk.first == value->defining_position()) { + return pos_and_chunk.second.offset; + } + } + } + return -1; + } + std::unique_ptr CreateEvictAndPrefetchModule() { HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); @@ -4066,22 +4084,78 @@ TEST_P(MemorySpaceAssignmentTest, MoveCopyDoneEarlier) { find_schedule_index(cos->operand(0))); } +TEST_P(MemorySpaceAssignmentTest, BitcastRoot) { + // Tests against a bug where the root of entry computation is a bitcast + // instruction and it ends up getting an allocation in the alternate memory. 
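  // A bitcast aliases its operand, so an alternate-memory chunk assigned to
  // %get-tuple-element.7 would show up in the root's layout even though entry
  // computation outputs are required to stay in the default memory space;
  // hence the layout check on the root at the end of this test.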
+  absl::string_view hlo_string = R"(
+HloModule primitive_computation_gather.4, is_scheduled=true
+
+%while_body {
+  %param.1 = (s32[], f32[3,3,3]) parameter(0)
+  %get-tuple-element.32 = s32[] get-tuple-element(%param.1), index=0
+  %copy.6 = s32[] copy(s32[] %get-tuple-element.32)
+  %constant.8 = s32[] constant(1)
+  %add = s32[] add(s32[] %copy.6, s32[] %constant.8)
+  %get-tuple-element.35 = f32[3,3,3] get-tuple-element(%param.1), index=1
+  negate = f32[3,3,3] negate(get-tuple-element.35)
+  ROOT %tuple.10 = (s32[], f32[3,3,3]) tuple(s32[] %add, f32[3,3,3] negate)
+}
+
+%while_cond {
+  %param.0 = (s32[], f32[3,3,3]) parameter(0)
+  %get-tuple-element = s32[] get-tuple-element(%param.0), index=0
+  %constant.3 = s32[] constant(3)
+  ROOT %compare = pred[] compare(s32[] %get-tuple-element, s32[] %constant.3), direction=LT
+}
+
+ENTRY %primitive_computation_gather.4 (parameter.1: f32[3,10,5], parameter.2: s32[3,1]) -> f32[3,3,3] {
+  %constant.1 = s32[] constant(0)
+  %copy.11 = s32[] copy(s32[] %constant.1)
+  %constant = f32[] constant(0)
+  %broadcast = f32[3,3,3] broadcast(f32[] %constant), dimensions={}
+  %tuple.8 = (s32[], f32[3,10,5], s32[3,1], f32[3,3,3]) tuple(s32[] %copy.11, f32[3,3,3] %broadcast)
+  %while = (s32[], f32[3,3,3]) while(%tuple.8), condition=%while_cond, body=%while_body
+  %get-tuple-element.7 = f32[3,3,3] get-tuple-element(%while), index=1
+  ROOT %bitcast.1 = f32[3,3,3] bitcast(f32[3,3,3] %get-tuple-element.7)
+}
+  )";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+  AssignMemorySpace(module.get());
+
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_TRUE(!root->shape().has_layout() ||
+              root->shape().layout().memory_space() == kDefaultMemorySpace);
+}
+
 // A mock MemorySpaceAssignmentRepacker class that accepts a map of
 // (start_time,offset) -> new_offset values. Using this map, the repacker
 // repacks the allocations to the new_offset.
class FakeMemorySpaceAssignmentRepacker : public MemorySpaceAssignmentRepacker { public: explicit FakeMemorySpaceAssignmentRepacker( - absl::flat_hash_map, int64>& repack_map) + absl::flat_hash_map, int64>& repack_map, + std::function)> check_fun = nullptr, + bool always_return_modified = false) : MemorySpaceAssignmentRepacker(/*max_size=*/128, /*alignment=*/8), - repack_map_(repack_map) {} + repack_map_(repack_map), + check_fun_(check_fun), + always_return_modified_(always_return_modified) {} StatusOr Repack(absl::Span allocations) override { bool modified = false; for (AllocationBlock* block : allocations) { - VLOG(1) << "Alloc time: [" << block->start_time << ", " << block->end_time - << "] size: " << block->size - << " init offset: " << block->initial_offset; + absl::flat_hash_set colocations; + std::string colocations_str; + for (const AllocationBlock* colocation : block->colocations) { + absl::StrAppend(&colocations_str, colocation->id, ", "); + colocations.insert(colocation->id); + } + VLOG(1) << "Alloc id: " << block->id << " time: [" << block->start_time + << ", " << block->end_time << "] size: " << block->size + << " init offset: " << block->initial_offset << " colocations: {" + << colocations_str << "}"; auto it = repack_map_.find({block->start_time, block->initial_offset}); if (it != repack_map_.end()) { modified = true; @@ -4090,8 +4164,6 @@ class FakeMemorySpaceAssignmentRepacker : public MemorySpaceAssignmentRepacker { block->offset = block->initial_offset; } for (AllocationBlock* colocation : block->colocations) { - VLOG(1) << " [" << colocation->start_time << ", " - << colocation->end_time << "]"; if (it != repack_map_.end()) { colocation->offset = it->second; } else { @@ -4099,13 +4171,18 @@ class FakeMemorySpaceAssignmentRepacker : public MemorySpaceAssignmentRepacker { } } } + if (check_fun_) { + check_fun_(allocations); + } - return modified; + return always_return_modified_ || modified; } private: // A map from (start_time, offset) to new_offset. absl::flat_hash_map, int64> repack_map_; + std::function)> check_fun_; + bool always_return_modified_; }; TEST_P(MemorySpaceAssignmentTest, Repack) { @@ -4229,6 +4306,181 @@ TEST_P(MemorySpaceAssignmentTest, Repack) { EXPECT_EQ(d->shape().layout().memory_space(), kAlternateMemorySpace); } +TEST_P(MemorySpaceAssignmentTest, RepackExportsAliasedOffsets) { + // This test is that we are correctly exporting aliased offsets for repacking. + // In this example, the buffer produced at HLO "a" will be allocated first, + // and will consist of four allocations: + // 1) a produced in the alternate memory (and then evicted to the default + // memory). 2) a prefetched to the alternate memory to be used by q and + // while HLOs. 3) a used within the while loop body. 4) the output of while + // HLO, used by u. + // + // Since a will be allocated first (the test is crafted to prioritize sine + // HLO), all four allocations should get the same (zero) offsets. However, + // while allocations 2, 3, and 4 need to be colocated with each other, + // allocation 1 doesn't need to be colocated with the other three. 
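  // The colocation constraint that the check_fun defined later in this test
  // verifies comes from FinalizeAllocations: finalized alternate-memory
  // allocations are grouped by their AliasedOffset, and each group is exported
  // to repack_allocation_blocks_ as mutually colocated AllocationBlocks.
  // Schematically (a sketch, not the verbatim implementation; the
  // finalized_allocations loop below is hypothetical):
  //
  //   absl::flat_hash_map<const AliasedOffset*,
  //                       std::vector<MemorySpaceAssignment::Allocation*>>
  //       colocation_map;
  //   for (MemorySpaceAssignment::Allocation* allocation :
  //        finalized_allocations) {  // hypothetical container name
  //     if (allocation->memory_space() == MemorySpace::kAlternate) {
  //       colocation_map[GetAliasedOffset(*allocation)].push_back(allocation);
  //     }
  //   }
  //   // One colocation group per map entry, so allocation 1 (which has its
  //   // own AliasedOffset) can be repacked independently of allocations 2-4.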
+ absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + while_condition { + param1 = (f32[2,4], f32[2,4]) parameter(0) + ROOT cond = pred[] constant(true) + } + + while_body { + param2 = (f32[2,4], f32[2,4]) parameter(0) + gte2 = f32[2,4] get-tuple-element(param2), index=0 + gte3 = f32[2,4] get-tuple-element(param2), index=1 + add = f32[2,4] add(gte2, gte3) + ROOT tuple2 = (f32[2,4], f32[2,4]) tuple(add, gte3) + } + + ENTRY Entry { + param0 = f32[2,4] parameter(0) + a = f32[2,4] sine(param0) + b = f32[2,4] negate(a) + c = f32[2,4] negate(b) + d = f32[2,4] negate(c) + e = f32[2,4] negate(d) + f = f32[2,4] negate(e) + g = f32[2,4] negate(f) + h = f32[2,4] negate(g) + i = f32[2,4] negate(h) + j = f32[2,4] negate(i) + k = f32[2,4] negate(j) + l = f32[2,4] negate(k) + m = f32[2,4] negate(l) + n = f32[2,4] negate(m) + o = f32[2,4] negate(n) + p = f32[2,4] negate(o) + q = f32[2,4] add(p, a) + tuple = (f32[2,4], f32[2,4]) tuple(q, a) + while = (f32[2,4], f32[2,4]) while(tuple), condition=while_condition, body=while_body + gte0 = f32[2,4] get-tuple-element(while), index=0 + gte1 = f32[2,4] get-tuple-element(while), index=1 + r = f32[2,4] negate(gte0) + s = f32[2,4] negate(r) + t = f32[2,4] negate(s) + constant = f32[] constant(0) + broadcast = f32[8,4] broadcast(constant), dimensions={} + cos = f32[8,4] cosine(broadcast) + u = f32[2,4] add(t, gte1) + v = f32[2,4] add(u, param0) + w = f32[8,4] negate(cos) + ROOT tuple3 = (f32[2,4], f32[8,4]) tuple(v, w) + } + )"; + + MemorySpaceAssignment::BufferIntervalCompare buffer_interval_compare = + [](const MemorySpaceAssignment::BufferInterval& a, + const MemorySpaceAssignment::BufferInterval& b) { + auto get_opcode_priority = [](const HloOpcode& opcode) { + switch (opcode) { + case HloOpcode::kSin: + return 0; + case HloOpcode::kCos: + return 1; + case HloOpcode::kTanh: + return 2; + default: + return 3; + } + }; + + return get_opcode_priority(a.buffer->defining_instruction()->opcode()) < + get_opcode_priority(b.buffer->defining_instruction()->opcode()); + }; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + InstructionCountPrefetchIntervalPicker prefetch_interval_picker(2, 10); + absl::flat_hash_map, int64> repack_map; + + // Expect that of the four separate allocations for the "a" buffer, the first + // and the next three are in separate colocations. + auto check_fun = + [](absl::Span + allocations) { + EXPECT_TRUE(allocations.at(0)->colocations.size() == 1 || + allocations.at(0)->colocations.size() == 3); + EXPECT_EQ(allocations.at(1)->colocations.size(), 3); + EXPECT_EQ(allocations.at(2)->colocations.size(), 3); + EXPECT_TRUE(allocations.at(3)->colocations.size() == 1 || + allocations.at(3)->colocations.size() == 3); + }; + FakeMemorySpaceAssignmentRepacker repacker = + FakeMemorySpaceAssignmentRepacker(repack_map, check_fun); + MemorySpaceAssignment::Options options; + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + options.max_repacks = 1; + options.repacker = &repacker; + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + buffer_interval_compare, &prefetch_interval_picker, + options); +} + +TEST_P(MemorySpaceAssignmentTest, + RepackShouldntEraseRequiredAssignmentForConditionalOutput) { + // This is a test case for b/171040271. 
Repacks erase the required assignments + // (since some required assignments are inserted conditionally based on + // allocation decisions), including the fact that conditional outputs are + // always required to get assignments in the default memory. After repacking, + // this required assignment was never added back, causing conditionals to get + // alternate-memory allocations. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]) parameter(0) + gte = f32[3] get-tuple-element(p0), index=0 + neg1 = f32[3] negate(gte) + ROOT tuple1 = (f32[3]) tuple(neg1) + } + + false_computation { + p0 = (f32[3]) parameter(0) + gte = f32[3] get-tuple-element(p0), index=0 + neg2 = f32[3] negate(gte) + ROOT tuple2 = (f32[3]) tuple(neg2) + } + + ENTRY entry { + p0 = f32[3] parameter(0) + p1 = pred[] parameter(1) + copy = f32[3] copy(p0) + tuple = (f32[3]) tuple(copy) + conditional = (f32[3]) conditional(p1, tuple, tuple), true_computation=true_computation, false_computation=false_computation + ROOT gte = f32[3] get-tuple-element(conditional), index=0 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + absl::flat_hash_map, int64> repack_map; + FakeMemorySpaceAssignmentRepacker repacker = + FakeMemorySpaceAssignmentRepacker(repack_map, nullptr, + /*always_return_modified=*/true); + MemorySpaceAssignment::Options options; + options.max_size_in_bytes = 128; + options.alignment_in_bytes = 8; + options.verify = true; + options.max_repacks = 10; + options.repacker = &repacker; + options.repack_after_every_allocation = true; + InstructionCountPrefetchIntervalPicker prefetch_interval_picker(2, 10); + AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/-1, + /*buffer_interval_compare=*/{}, &prefetch_interval_picker, + options); + // Make sure the root of the entry computation is in the default memory space. + EXPECT_EQ(module->entry_computation() + ->root_instruction() + ->shape() + .layout() + .memory_space(), + kDefaultMemorySpace); +} + TEST_P(MemorySpaceAssignmentTest, Determinism) { // Run memory space assignment a few times to make sure every time it compiles // to the same thing. @@ -4244,6 +4496,47 @@ TEST_P(MemorySpaceAssignmentTest, Determinism) { } } +TEST_P(MemorySpaceAssignmentTest, InPlaceOp) { + // Tests that in-place ops like DynamicUpdateSlice get the same allocation as + // its input. 
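  // The fusion root below is a dynamic-update-slice that updates its
  // parameter 0 in place, so the fusion's output buffer aliases the "negate"
  // operand. Memory space assignment must therefore give both the same
  // offset, which is what the GetAlternateMemoryOffset comparison at the end
  // of this test asserts.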
+ absl::string_view hlo_string = R"( +HloModule Module, is_scheduled=true + +fused_computation { + param0 = f32[2,3] parameter(0) + constant.1 = f32[] constant(0) + broadcast = f32[2,1] broadcast(constant.1), dimensions={} + constant.3 = s32[] constant(0) + ROOT dynamic-update-slice.5 = f32[2,3] dynamic-update-slice(param0, broadcast, constant.3, constant.3) +} + +ENTRY main { + param = f32[2,3] parameter(0) + negate = f32[2,3] negate(param) + fusion = f32[2,3] fusion(negate), kind=kLoop, calls=fused_computation + ROOT add = f32[2,3] add(fusion, fusion) +} + )"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto preset_assignments = AssignMemorySpace(module.get()); + HloInstruction* negate_instruction = + module->entry_computation()->GetInstructionWithName("negate"); + int64 negate_offset = + GetAlternateMemoryOffset(*preset_assignments, negate_instruction); + HloInstruction* fusion_instruction = + module->entry_computation()->GetInstructionWithName("fusion"); + int64 fusion_offset = + GetAlternateMemoryOffset(*preset_assignments, fusion_instruction); + // We expect negate and fusion to get the same offsets. + EXPECT_EQ(negate_offset, fusion_offset); + const bool allocate_across_sequential_calls = GetParam(); + if (allocate_across_sequential_calls) { + EXPECT_NE(negate_offset, -1); + } +} + INSTANTIATE_TEST_SUITE_P(MemorySpaceAssignmentInstantiation, MemorySpaceAssignmentTest, ::testing::Values(false, true)); @@ -4918,5 +5211,75 @@ TEST_F(CostAnalysisPrefetchIntervalPickerTest, NestedWhile) { 4); } +TEST_F(CostAnalysisPrefetchIntervalPickerTest, ConsecutiveConditionals) { + // This is a test for b/170668492, where prefetching for consecutive + // conditionals can cause the prefetch to start in the conditional's + // computation. 
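  // The numbers in the trailing comments below appear to be the logical
  // schedule times. The computation_nest_level_ data added to
  // CostAnalysisPrefetchIntervalPicker is presumably what rules such start
  // times out: times 5-10 lie inside conditional0's called computations, i.e.
  // at a deeper computation nest level than the use in conditional1, so
  // LatestPrefetchStartTime must skip past them, which the EXPECT_LT against
  // 5 at the end of this test checks.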
+ absl::string_view hlo_string = R"( + HloModule bug, is_scheduled=true + + true_computation.0 { + p0 = (f32[3]{0}) parameter(0) // 5 + gte = f32[3]{0} get-tuple-element(p0), index=0 // 6 + ROOT neg1 = f32[3]{0} negate(gte) // 7 + } + + false_computation.0 { + p0 = (f32[3]{0}) parameter(0) // 8 + gte = f32[3]{0} get-tuple-element(p0), index=0 // 9 + ROOT neg2 = f32[3]{0} negate(gte) // 10 + } + + true_computation.1 { + p0 = (f32[3]{0}) parameter(0) // 12 + gte = f32[3]{0} get-tuple-element(p0), index=0 // 13 + ROOT neg1 = f32[3]{0} negate(gte) // 14 + } + + false_computation.1 { + p0 = (f32[3]{0}) parameter(0) // 15 + gte = f32[3]{0} get-tuple-element(p0), index=0 // 16 + ROOT neg2 = f32[3]{0} negate(gte) // 17 + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) // 0 + p1 = f32[3]{0} parameter(1) // 1 + p2 = pred[] parameter(2) // 2 + tuple0 = (f32[3]{0}) tuple(p0) // 3 + tuple1 = (f32[3]{0}) tuple(p1) // 4 + conditional0 = f32[3]{0} conditional(p2, tuple0, tuple0), true_computation=true_computation.0, false_computation=false_computation.0 // 11 + conditional1 = f32[3]{0} conditional(p2, tuple1, tuple1), true_computation=true_computation.1, false_computation=false_computation.1 // 18 + ROOT tuple2 = (f32[3]{0}, f32[3]{0}) tuple(conditional0, conditional1) // 19 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + + HloCostAnalysis hlo_cost_analysis(ShapeSize); + TF_ASSERT_OK_AND_ASSIGN(auto cost_analysis, + FakeMemorySpaceAssignmentCostAnalysis::Create( + hlo_cost_analysis, *module)); + CostAnalysisPrefetchIntervalPicker interval_picker( + *cost_analysis, + /*min_async_copy_to_overlap_ratio=*/1.0, + /*max_async_copy_to_overlap_ratio=*/12.0, + /*preferred_async_copy_to_overlap_ratio=*/2.0); + + LOG(INFO) << module->ToString(); + + HloInstruction* conditional1 = + module->entry_computation()->GetInstructionWithName("conditional1"); + const HloUse use{conditional1, /*operand_number=*/1, /*operand_index=*/{0}}; + const Shape& shape = + module->entry_computation()->parameter_instruction(0)->shape(); + + // Expect that the prefetch to start before conditional0's called + // computations. + EXPECT_LT(interval_picker.LatestPrefetchStartTime(shape, /*start_time=*/0, + /*end_time=*/11, &use), + 5); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc index 0c44ae0d766..aad943aaad7 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_utils.cc @@ -15,6 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/memory_space_assignment_utils.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" + namespace xla { bool MemorySpaceAssignmentUtils::IsValueAllowedInAlternateMemory( @@ -87,6 +90,17 @@ bool MemorySpaceAssignmentUtils::IsValueAllowedInAlternateMemory( return false; } } + if (auto* custom_call = + DynCast(position.instruction)) { + for (const auto& pair : custom_call->output_to_operand_aliasing()) { + if (position.index == pair.first) { + VLOG(4) << "Keeping value " << value->ToShortString() + << " in default mem because it is a custom-call output that " + "aliases an operand buffer."; + return false; + } + } + } } return true; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index 68bcde4f7ee..4eaed3a12e6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -1,6 +1,14 @@ # Description: # MLIR-GPU-specific components in XLA service implementation. +load("//third_party/mlir:tblgen.bzl", "gentbl") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", @@ -41,9 +49,12 @@ cc_library( srcs = ["emission_context.cc"], hdrs = ["emission_context.h"], deps = [ + "//tensorflow/compiler/mlir/hlo", + "//tensorflow/compiler/mlir/hlo:lhlo", "//tensorflow/compiler/xla/service:hlo", "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", ], ) @@ -65,7 +76,7 @@ cc_library( ":emission_context", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service/gpu:target_constants", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "@llvm-project//llvm:Core", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", @@ -84,7 +95,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@llvm-project//llvm:Core", "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", "@llvm-project//mlir:LLVMTransforms", @@ -106,7 +117,7 @@ cc_library( "//tensorflow/compiler/xla/service/gpu:stream_executor_util", "//tensorflow/compiler/xla/service/gpu:target_constants", "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", - "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core/platform:cuda_libdevice_path", "//tensorflow/core:lib", "//tensorflow/stream_executor/gpu:asm_compiler", ]), @@ -156,11 +167,21 @@ cc_library( ], ) +gentbl( + name = "passes_inc_gen", + compatible_with = get_compatible_with_cloud(), + tbl_outs = [("-gen-pass-decls -name XlaMlirGpu", "passes.h.inc")], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "passes.td", + td_srcs = ["@llvm-project//mlir:PassBaseTdFiles"], +) + cc_library( name = "passes", srcs = ["passes.cc"], hdrs = ["passes.h"], deps = [ + ":passes_inc_gen", "//tensorflow/compiler/mlir/hlo:lhlo", "@com_google_absl//absl/memory", "@llvm-project//llvm:Support", @@ -170,6 +191,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", 
"@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], @@ -182,15 +204,14 @@ cc_library( deps = [ ":passes", "//tensorflow/compiler/mlir/hlo", - "//tensorflow/compiler/mlir/hlo:hlo_dialect_force_registration", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", - "//tensorflow/compiler/mlir/hlo:legalize_tanh_to_approximation", "//tensorflow/compiler/mlir/hlo:legalize_to_linalg", + "//tensorflow/compiler/mlir/hlo:legalize_trigonometric_to_approximation", "//tensorflow/compiler/mlir/hlo:lhlo", - "//tensorflow/compiler/mlir/hlo:lhlo_copy_removal", "//tensorflow/compiler/mlir/hlo:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/hlo:lhlo_legalize_to_gpu", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -199,6 +220,7 @@ cc_library( "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:GPUToNVVMTransforms", + "@llvm-project//mlir:GPUToROCDLTransforms", "@llvm-project//mlir:GPUTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMDialect", @@ -207,6 +229,7 @@ cc_library( "@llvm-project//mlir:LinalgTransforms", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:ROCDLDialect", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:SCFToGPUPass", "@llvm-project//mlir:SCFTransforms", @@ -255,6 +278,25 @@ tf_cc_binary( "//tensorflow/core:lib", "@llvm-project//llvm:Support", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_binary( + name = "xla-mlir-gpu-opt", + srcs = ["xla_mlir_gpu_opt.cc"], + visibility = ["//tensorflow/compiler/xla/service/mlir_gpu/tests:__subpackages__"], + deps = [ + ":passes", + "//tensorflow/compiler/mlir/hlo:all_passes", + "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:MlirOptLib", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc b/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc index cb5ea946c1b..06c7ebd1099 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/emission_context.cc @@ -16,8 +16,11 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/mlir_gpu/emission_context.h" #include "absl/strings/substitute.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { @@ -25,7 +28,8 @@ namespace mlir_gpu { EmissionContext::EmissionContext(std::unique_ptr module) : module_(std::move(module)), context_() { - context_.loadAllGloballyRegisteredDialects(); + context_.loadDialect(); error_handler_ = [](const ErrorMap& instructions_with_error, HloModule* module) { std::set computations_with_error; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD index 8f56548ce77..74eef71870e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/BUILD @@ -1,6 +1,8 @@ # Description: # MLIR-GPU-specific convolution in XLA service implementation. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( @@ -72,12 +74,14 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/platform:test", "@llvm-project//llvm:Support", + "@llvm-project//mlir:Affine", "@llvm-project//mlir:AffineToStandardTransforms", - "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:AllPassesAndDialectsNoRegistration", "@llvm-project//mlir:CFGTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index f7a7decff76..c868d205310 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -20,6 +20,8 @@ limitations under the License. 
#include "llvm/Support/raw_ostream.h" #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project @@ -46,7 +48,7 @@ std::string CompileHloConvAndGetMlir(absl::string_view hlo_text) { hlo_module.entry_computation()->root_instruction(); mlir::MLIRContext context; - context.loadAllGloballyRegisteredDialects(); + context.loadDialect(); mlir::OwningModuleRef mlir_module( mlir::ModuleOp::create(mlir::UnknownLoc::get(&context))); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 1b2edec7d61..a664a316e13 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project +#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" // from @llvm-project #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project #include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project @@ -26,6 +27,7 @@ limitations under the License. #include "mlir/Dialect/GPU/Passes.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project #include "mlir/Dialect/SCF/Passes.h" // from @llvm-project #include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project @@ -33,11 +35,12 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/Bufferize.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" #include "tensorflow/compiler/xla/util.h" @@ -46,7 +49,7 @@ namespace mlir_gpu { Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { mlir::PassManager pm(module.getContext()); - applyPassManagerCLOptions(pm); + tensorflow::applyTensorflowAndCLOptions(pm); // We have to anticipate later unrolling in tiling to make sure that we get // the requested tiling after unrolling. Compute the new tiling here if @@ -71,7 +74,7 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Next, we can strip the outer fusion operation. 
pm.addPass(createFusionOpRemoverPass()); // Remove unnecessary LHLO copies. - pm.addPass(::mlir::lmhlo::createLhloCopyRemovalPass()); + pm.addPass(::mlir::createCopyRemovalPass()); // Transform LHLO operations to LinAlg. pm.addPass(::mlir::lmhlo::createLegalizeLhloToLinalgPass()); // Fuse linalg operations. @@ -120,10 +123,8 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { // Approximate of requested. if (options.use_approximations) { pm.addNestedPass<::mlir::FuncOp>( - ::mlir::mhlo::createLegalizeTanhToApproximationPass()); + ::mlir::mhlo::createLegalizeTrigonometricToApproximationPass()); } - // Move scalar operations into the launch to ensure smaller signatures. - pm.addPass(createMoveScalarComputationsIntoGpuLaunchPass()); // Take launches to launches with kernels. pm.addPass(::mlir::createGpuKernelOutliningPass()); // Make sure the kernel signature resembled the original function's @@ -179,7 +180,7 @@ class LowerToNVVMPass Status LowerKernelBodiesToNVVM(mlir::ModuleOp module) { // We cannot verify as the signature of the kernel is rewritten. ::mlir::PassManager pm(module.getContext(), /*verifyPasses=*/false); - applyPassManagerCLOptions(pm); + tensorflow::applyTensorflowAndCLOptions(pm); // Rewrite kernel functions to LLVM IR. auto& kernelPm = pm.nest<::mlir::gpu::GPUModuleOp>(); @@ -197,6 +198,85 @@ Status LowerKernelBodiesToNVVM(mlir::ModuleOp module) { return Status::OK(); } +namespace { + +/// A pass that does the final lowering to ROCDL. It collects all the patterns +/// that are currently required, currently mixing std, linalg and gpu. +class LowerToROCDLPass + : public ::mlir::PassWrapper< + LowerToROCDLPass, ::mlir::OperationPass<::mlir::gpu::GPUModuleOp>> { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + registry.insert(); + } + + public: + void runOnOperation() override { + ::mlir::gpu::GPUModuleOp m = getOperation(); + + ::mlir::OwningRewritePatternList patterns; + ::mlir::populateGpuRewritePatterns(m.getContext(), patterns); + ::mlir::applyPatternsAndFoldGreedily(m, patterns); + patterns.clear(); + + ::mlir::LLVMTypeConverter converter(m.getContext()); + ::mlir::populateStdToLLVMConversionPatterns(converter, patterns); + // TODO(b/145824979) Remove linalg once sliceop is in std. + ::mlir::populateLinalgToLLVMConversionPatterns(converter, patterns, + &getContext()); + ::mlir::populateGpuToROCDLConversionPatterns(converter, patterns); + ::mlir::populateAffineToStdConversionPatterns(patterns, m.getContext()); + + ::mlir::ConversionTarget target(getContext()); + target.addIllegalDialect<::mlir::gpu::GPUDialect>(); + target + .addIllegalOp(); + target.addIllegalOp(); + target.addLegalDialect<::mlir::LLVM::LLVMDialect>(); + target.addLegalDialect<::mlir::ROCDL::ROCDLDialect>(); + // TODO(csigg): Remove once we support replacing non-root ops. + target.addLegalOp<::mlir::gpu::GPUModuleOp, ::mlir::gpu::ModuleEndOp, + ::mlir::gpu::YieldOp>(); + if (failed(mlir::applyFullConversion(m, target, patterns))) { + signalPassFailure(); + } + } +}; + +} // namespace + +Status LowerKernelBodiesToROCDL(mlir::ModuleOp module) { + // We cannot verify as the signature of the kernel is rewritten. 
+ ::mlir::PassManager pm(module.getContext(), /*verifyPasses=*/false); + tensorflow::applyTensorflowAndCLOptions(pm); + + auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, + /*out=*/llvm::dbgs()); + + // Rewrite kernel functions to LLVM IR. + auto& kernelPm = pm.nest<::mlir::gpu::GPUModuleOp>(); + kernelPm.addPass(::mlir::createLowerToCFGPass()); + kernelPm.addPass(absl::make_unique()); + + // Some basic cleanup. + kernelPm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + kernelPm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + // Remove all location information to prevent a debug build. + kernelPm.addPass(::mlir::createStripDebugInfoPass()); + + if (failed(pm.run(module))) { + return InternalError("Lowering to ROCDL IR failed."); + } + return Status::OK(); +} + StatusOr ExtractKernelModule(mlir::ModuleOp module) { auto kernelModule = ::mlir::ModuleOp::create(module.getLoc()); // TODO(b/137624192): This also needs to resolve naming conflicts. diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h index bd633bb06cb..290550142ec 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h @@ -36,6 +36,8 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, Status LowerKernelBodiesToNVVM(mlir::ModuleOp module); +Status LowerKernelBodiesToROCDL(mlir::ModuleOp module); + StatusOr ExtractKernelModule(mlir::ModuleOp module); } // namespace mlir_gpu diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index c7977aa776a..f00f46b83c1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -438,7 +438,6 @@ StatusOr> TransformKernelToXlaThunk( // Finally, create the thunk and set the launch dimensions. gpu::Thunk::ThunkInfo info; - info.hlo_instruction = instr; auto thunk = absl::make_unique(info, buffers, kernel.getName().str()); @@ -563,9 +562,20 @@ StatusOr> MlirCompilerImpl::RunBackend( auto ptx, xla::gpu::nvptx::CompileToPtx(llvmModule.get(), GetGpuVersion(stream_exec), config, GetLibdeviceDir(config))); - TF_ASSIGN_OR_RETURN( - auto cubin, se::CompileGpuAsm(stream_exec->device_ordinal(), ptx.c_str(), - gpu::PtxOptsFromConfig(config))); + // Allow to fallback to the driver compilation when ptxas isn't able to + // compile. 
+ StatusOr> maybe_cubin = + se::CompileGpuAsm(stream_exec->device_ordinal(), ptx.c_str(), + gpu::PtxOptsFromConfig(config)); + std::vector cubin; + if (maybe_cubin.ok()) { + cubin = std::move(maybe_cubin).ValueOrDie(); + } else if (maybe_cubin.status().code() == + tensorflow::error::Code::UNIMPLEMENTED) { + xla::gpu::WarnIfBadDriverJITVersion(); + } else { + return maybe_cubin.status(); + } auto thunk_schedule = absl::make_unique( std::make_unique(std::move(thunk_sequence)), @@ -580,7 +590,7 @@ StatusOr> MlirCompilerImpl::RunBackend( return {absl::make_unique( ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), emission_context.releaseHloModule(), std::move(buffer_assignment), - nullptr, nullptr)}; + nullptr, nullptr, std::vector())}; } StatusOr>> MlirCompilerImpl::Compile( diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.cc b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc index 887f14e90d9..84751bc0507 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/passes.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" @@ -32,8 +33,10 @@ namespace xla { namespace mlir_gpu { namespace { -struct FusionOpRemoverPass - : public mlir::PassWrapper { +#define GEN_PASS_CLASSES +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h.inc" + +struct FusionOpRemoverPass : FusionOpRemoverPassBase { void runOnFunction() override { getFunction().walk([&](mlir::lmhlo::FusionOp op) { mlir::OpBuilder builder(op); @@ -52,8 +55,22 @@ struct FusionOpRemoverPass } }; -struct StoreForwardingPass - : mlir::PassWrapper { +template +bool HasEffectsOnValue(mlir::Value value, mlir::Operation* op) { + auto mem_effects_interface = + mlir::dyn_cast_or_null(op); + if (!mem_effects_interface) { + return false; + } + llvm::SmallVector effects; + mem_effects_interface.getEffects(effects); + return llvm::any_of(effects, + [op](const mlir::MemoryEffects::EffectInstance& effect) { + return mlir::isa(effect.getEffect()); + }); +} + +struct StoreForwardingPass : StoreForwardingPassBase { mlir::StoreOp findStore(mlir::Operation* op, std::function matches) { // Search from op upwards in the current block. @@ -86,10 +103,9 @@ struct StoreForwardingPass while (auto subviewOp = mlir::dyn_cast_or_null(defOp)) { defOp = subviewOp.source().getDefiningOp(); } - if (auto allocOp = mlir::dyn_cast_or_null(defOp)) { - return allocOp.getOperation(); - } - return nullptr; + return HasEffectsOnValue(memref, defOp) + ? defOp + : nullptr; } // Retrieves AllocOp from the cache or actually looks for it. 
@@ -100,7 +116,7 @@ struct StoreForwardingPass if (allocOpIt != memrefToAllocOp->end()) { return allocOpIt->second; } - auto allocOp = SearchAllocOp(memref); + mlir::Operation* allocOp = SearchAllocOp(memref); memrefToAllocOp->insert({memref, allocOp}); return allocOp; } @@ -132,7 +148,7 @@ struct StoreForwardingPass }; struct DeadTempBufferRemovalPass - : mlir::PassWrapper { + : DeadTempBufferRemovalPassBase { bool operationConsideredDead(mlir::Operation* op) { for (auto result : op->getResults()) { if (!llvm::all_of(result.getUsers(), [&](mlir::Operation* op) { @@ -168,13 +184,18 @@ struct DeadTempBufferRemovalPass void runOnFunction() override { llvm::SmallVector dead_ops; - getFunction().walk([&](mlir::AllocOp allocOp) { - if (!operationConsideredDead(allocOp)) { + getFunction().walk([&](mlir::Operation* op) { + if (op->getNumResults() != 1 || + !HasEffectsOnValue(op->getResult(0), + op)) { + return; + } + if (!operationConsideredDead(op)) { return; } // TODO(herhut): There should be a generic helper for this. - recursiveErase(allocOp, &dead_ops); + recursiveErase(op, &dead_ops); }); for (auto op : dead_ops) { op->erase(); @@ -182,66 +203,8 @@ struct DeadTempBufferRemovalPass } }; -struct MoveScalarComputationsIntoGpuLaunchPass - : mlir::PassWrapper { - static bool isInliningBeneficiary(mlir::Operation* op) { - return llvm::isa(op); - } - - static bool extractBeneficiaryOps( - mlir::Operation* op, llvm::SmallVectorImpl* ops, - llvm::SetVector args) { - if (!isInliningBeneficiary(op)) { - return false; - } - - ops->push_back(op); - for (auto operand : op->getOperands()) { - // It is an existing arg, keep going. - if (args.count(operand)) { - continue; - } - mlir::Operation* definingOp = operand.getDefiningOp(); - if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { - return false; - } - } - return true; - } - - static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { - llvm::SetVector used_above; - mlir::getUsedValuesDefinedAbove(launch.body(), used_above); - mlir::BlockAndValueMapping inlined_map; - for (mlir::Value v : used_above) { - llvm::SmallVector ops_to_move; - mlir::Operation* definingOp = v.getDefiningOp(); - if (definingOp && - extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { - mlir::OpBuilder b(launch.body()); - for (mlir::Operation* op : llvm::reverse(ops_to_move)) { - auto result = b.clone(*op, inlined_map); - for (auto pair : llvm::zip(op->getResults(), result->getResults())) { - mlir::replaceAllUsesInRegionWith(std::get<0>(pair), - std::get<1>(pair), launch.body()); - } - inlined_map.map(op->getResults(), result->getResults()); - } - } - } - } - - void runOnFunction() override { - mlir::FuncOp fun = getFunction(); - fun.walk( - [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); - } -}; - struct RewriteKernelSignaturePass - : mlir::PassWrapper { + : RewriteKernelSignaturePassBase { void runOnFunction() override { mlir::FuncOp func = getFunction(); mlir::ModuleOp module = func.getParentOfType(); @@ -349,15 +312,14 @@ struct RewriteKernelSignaturePass } }; -struct MapParallelLoopsPass - : public mlir::PassWrapper { +struct MapParallelLoopsPass : MapParallelLoopsPassBase { void runOnFunction() override { mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); } }; struct FuseInnerParallelLoopsPass - : public mlir::PassWrapper { + : FuseInnerParallelLoopsPassBase { void runOnFunction() override { getFunction().walk([](mlir::scf::ParallelOp op) { mlir::scf::naivelyFuseParallelOps(op.region()); @@ -366,12 
+328,10 @@ struct FuseInnerParallelLoopsPass }; struct ParallelLoopCollapsingToFirstDimPass - : public mlir::PassWrapper> { - void runOnOperation() override { - mlir::Operation* module = getOperation(); - - module->walk([&](mlir::scf::ParallelOp op) { + : ParallelLoopCollapsingToFirstDimPassBase< + ParallelLoopCollapsingToFirstDimPass> { + void runOnFunction() override { + getFunction().walk([&](mlir::scf::ParallelOp op) { unsigned num_loops = op.getNumLoops(); std::vector combinedLoops; combinedLoops.reserve(num_loops); @@ -397,11 +357,6 @@ std::unique_ptr createDeadTempBufferRemovalPass() { return absl::make_unique(); } -std::unique_ptr -createMoveScalarComputationsIntoGpuLaunchPass() { - return absl::make_unique(); -} - std::unique_ptr createRewriteKernelSignaturePass() { return absl::make_unique(); } @@ -414,7 +369,7 @@ std::unique_ptr createMapParallelLoopsPass() { return absl::make_unique(); } -std::unique_ptr> +std::unique_ptr createParallelLoopCollapsingToFirstDimPass() { return absl::make_unique(); } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.h b/tensorflow/compiler/xla/service/mlir_gpu/passes.h index e3840628a2e..832321387c6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/passes.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.h @@ -37,10 +37,6 @@ std::unique_ptr createStoreForwardingPass(); /// that loads and stores are side-effect free (in bounds, no aliasing, etc.). std::unique_ptr createDeadTempBufferRemovalPass(); -/// Moves scalar computations to the GPULaunchOp body. -std::unique_ptr -createMoveScalarComputationsIntoGpuLaunchPass(); - /// Sorts the operands to the kernel for a deterministic order. First operands /// that are defined by function arguments, followed by operands that are /// returned from the function. This only works for simple functions without @@ -57,9 +53,12 @@ std::unique_ptr createFuseInnerParallelLoopsPass(); std::unique_ptr createMapParallelLoopsPass(); /// Collapses all loop dimension into the first one. -std::unique_ptr> +std::unique_ptr createParallelLoopCollapsingToFirstDimPass(); +#define GEN_PASS_REGISTRATION +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h.inc" + } // namespace mlir_gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.td b/tensorflow/compiler/xla/service/mlir_gpu/passes.td new file mode 100644 index 00000000000..55fe15ad6ff --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.td @@ -0,0 +1,91 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_TD_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_TD_
+
+include "mlir/Pass/PassBase.td"
+
+def FusionOpRemoverPass : FunctionPass<"mlir-gpu-fusion-op-remover"> {
+  let summary = "Removes lhlo fusion ops by inlining their regions.";
+  let constructor = "createFusionOpRemoverPass()";
+  let description = [{
+    Replaces a FusionOp by the operations contained in its region.
+  }];
+}
+
+def StoreForwardingPass : FunctionPass<"mlir-gpu-store-forwarding"> {
+  let summary = "Limited pass to forward stores to loads.";
+  let constructor = "createStoreForwardingPass()";
+  let description = [{
+    Replaces a load that immediately follows a store to the same address with
+    the stored value.
+  }];
+}
+
+def DeadTempBufferRemovalPass
+    : FunctionPass<"mlir-gpu-dead-temp-buffer-removal"> {
+  let summary = "Removal of dead temp buffers.";
+  let constructor = "createDeadTempBufferRemovalPass()";
+  let description = [{
+    Removes temporary buffers that are only written to but never read from or
+    that are read but the read value is not used. Needs an analysis that proves
+    that loads and stores are side-effect free (in bounds, no aliasing, etc.).
+  }];
+}
+
+def RewriteKernelSignaturePass
+    : FunctionPass<"mlir-gpu-rewrite-signatures"> {
+  let summary = "Rewrite kernel signatures to be deterministic.";
+  let constructor = "createRewriteKernelSignaturePass()";
+  let description = [{
+    Sorts the operands to the kernel for a deterministic order. First operands
+    that are defined by function arguments, followed by operands that are
+    returned from the function. This only works for simple functions without
+    control flow and can be used in cases where the kernel is extracted and
+    used independently of the host-side code.
+  }];
+}
+
+def MapParallelLoopsPass
+    : FunctionPass<"mlir-gpu-map-parallel-loops"> {
+  let summary = "Greedily maps loops to GPU hardware dimensions.";
+  let constructor = "createMapParallelLoopsPass()";
+  let description = [{
+    Greedily maps loops to GPU hardware dimensions.
+  }];
+}
+
+def FuseInnerParallelLoopsPass
+    : FunctionPass<"mlir-gpu-fuse-inner-parallel-loops"> {
+  let summary = "Fuses parallel loops nested within another parallel loop.";
+  let constructor = "createFuseInnerParallelLoopsPass()";
+  let description = [{
+    Directs parallel loop fusion to the inner loops. This cannot be done with
+    a passmanager alone ATM, as nested pass managers require operations to
+    be closed from above.
+  }];
+}
+
+def ParallelLoopCollapsingToFirstDimPass
+    : FunctionPass<"mlir-gpu-collapse-parallel-loops"> {
+  let summary = "Collapses n-dimensional loops into one-dimensional ones.";
+  let constructor = "createParallelLoopCollapsingToFirstDimPass()";
+  let description = [{
+    Collapses all loop dimensions of a parallel loop into the first one.
+ }]; +} + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MLIR_GPU_PASSES_TD_ diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD index 850d5f5a0cf..9bd5e3350fa 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "filegroup") load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", @@ -18,13 +19,16 @@ package_group( ) glob_lit_tests( - data = [":test_utilities"], + data = [ + ":test_utilities", + "@llvm-project//mlir:run_lit.sh", + ], default_tags = tf_cuda_tests_tags() + [ "no_pip", "config-cuda-only", "no_rocm", ], - driver = "@llvm-project//mlir:run_lit.sh", + driver = "//tensorflow/compiler/mlir:run_lit.sh", exclude = [ # TODO(b/137624192): Reenable once we can fuse reductions. "fused_reduce.hlo", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/BUILD new file mode 100644 index 00000000000..b1b7de5c4e6 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/BUILD @@ -0,0 +1,24 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +package( + licenses = ["notice"], # Apache 2.0 +) + +glob_lit_tests( + data = [ + ":test_utilities", + "@llvm-project//mlir:run_lit.sh", + ], + driver = "//tensorflow/compiler/mlir:run_lit.sh", + test_file_exts = ["mlir"], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/xla/service/mlir_gpu:xla-mlir-gpu-opt", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/dead_temp_buffer_removal.mlir b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/dead_temp_buffer_removal.mlir new file mode 100644 index 00000000000..58132f4ea45 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/dead_temp_buffer_removal.mlir @@ -0,0 +1,72 @@ +// RUN: xla-mlir-gpu-opt --mlir-gpu-dead-temp-buffer-removal %s | FileCheck %s + +// CHECK-LABEL: @dead +func @dead() { + // CHECK-NOT: alloc + %0 = alloc() : memref<42xi32> + %c0 = constant 0 : i32 + %c12 = constant 12 : index + // CHECK-NOT: store + store %c0, %0[%c12] : memref<42xi32> + return +} + +// CHECK-LABEL: @dead_alloca +func @dead_alloca() { + // CHECK-NOT: alloca + %0 = alloc() : memref<42xi32> + %c0 = constant 0 : i32 + %c12 = constant 12 : index + // CHECK-NOT: store + store %c0, %0[%c12] : memref<42xi32> + return +} + +// CHECK-LABEL: @dead_load +func @dead_load() { + // CHECK-NOT: alloc + %0 = alloc() : memref<42xi32> + %c0 = constant 0 : i32 + %c12 = constant 12 : index + store %c0, %0[%c12] : memref<42xi32> + %1 = load %0[%c12] : memref<42xi32> + return +} + +// CHECK-LABEL: @used_load +func @used_load() -> i32 { + // CHECK: alloc + %0 = alloc() : memref<42xi32> + %c0 = constant 0 : i32 + %c12 = constant 12 : index + store %c0, %0[%c12] : memref<42xi32> + %1 = load %0[%c12] : memref<42xi32> + return %1 : i32 +} + +// CHECK-LABEL: @dead_subview +func @dead_subview() { + // CHECK-NOT: alloc + %0 = alloc() : memref<42xi32> + %c0 = constant 0 : i32 + %c1 = constant 1 : index + %c4 = constant 4 : index + %c12 = constant 12 : index + store %c0, %0[%c12] : memref<42xi32> + %1 = subview %0[%c12][%c4][%c1] : memref<42xi32> to memref (d0 * s1 + s0)>> + return +} + +// CHECK-LABEL: @used_subview +func 
@used_subview() -> i32 { + // CHECK: alloc + %0 = alloc() : memref<42xi32> + %c0 = constant 0 : i32 + %c1 = constant 1 : index + %c4 = constant 4 : index + %c12 = constant 12 : index + store %c0, %0[%c12] : memref<42xi32> + %1 = subview %0[%c12][%c4][%c1] : memref<42xi32> to memref (d0 * s1 + s0)>> + %2 = load %1[%c1] : memref (d0 * s1 + s0)>> + return %2 : i32 +} diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/fusion_op_remover.mlir b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/fusion_op_remover.mlir new file mode 100644 index 00000000000..69ebbbd5a72 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/fusion_op_remover.mlir @@ -0,0 +1,20 @@ +// RUN: xla-mlir-gpu-opt --mlir-gpu-fusion-op-remover %s | FileCheck %s + +// CHECK-LABEL: func @fusion_memref +func @fusion_memref(%input1: memref<10xf32>, %input2: memref<10xf32>, + %input3: memref<10xf32>, %out: memref<10xf32>) -> () { + // CHECK-NOT: lmhlo.fusion + "lmhlo.fusion"() ( { + %0 = tensor_load %input1 : memref<10xf32> + %1 = tensor_load %input2 : memref<10xf32> + %2 = "mhlo.add"(%0, %1) {name = "add"} + : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + %3 = tensor_load %input3 : memref<10xf32> + %4 = "mhlo.multiply"(%2, %3) {name = "multiply"} + : (tensor<10xf32>, tensor<10xf32>) -> tensor<10xf32> + tensor_store %4, %out : memref<10xf32> + // CHECK-NOT: lmhlo.terminator + "lmhlo.terminator"() : () -> () + } ) : () -> () + return +} diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/rewrite_kernel_signatures.mlir b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/rewrite_kernel_signatures.mlir new file mode 100644 index 00000000000..cff1989f05b --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/rewrite_kernel_signatures.mlir @@ -0,0 +1,138 @@ +// RUN: xla-mlir-gpu-opt --mlir-gpu-rewrite-signatures %s --split-input-file --verify-diagnostics | FileCheck %s + +module attributes {gpu.container_module} { + +// CHECK-LABEL: @kernel_module +gpu.module @kernel_module { + // CHECK-LABEL: gpu.func @kernel + // CHECK-SAME: %{{.*}}: memref<32xf32>, %{{.*}}: memref<16xf32>, + // CHECK-SAME: %{{.*}}: memref<8xf32> + gpu.func @kernel(%arg0: memref<8xf32>, %arg1: memref<16xf32>, + %arg2: memref<32xf32>) kernel { + gpu.return + } +} + + // CHECK-LABEL: @caller +func @caller(%arg0: memref<32xf32>, %arg1: memref<16xf32>) -> memref<8xf32> { + %cst = constant 8 : index + %res = alloc() : memref<8xf32> + + // CHECK: gpu.launch_func + // CHECK-SAME: index, memref<32xf32>, memref<16xf32>, memref<8xf32>) + "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %res, %arg1, %arg0) + { kernel = @kernel_module::@kernel } + : (index, index, index, index, index, index, + memref<8xf32>, memref<16xf32>, memref<32xf32>) -> () + + return %res : memref<8xf32> +} + +} + +// ----- + +module attributes {gpu.container_module} { + +gpu.module @kernel_module { + // expected-error @+1 {{number of kernel arguments does not match numberof arguments and results of surrounding function}} + gpu.func @kernel(%arg0: memref<16xf32>, %arg1: memref<32xf32>) kernel { + gpu.return + } +} + +func @caller(%arg0: memref<32xf32>, %arg1: memref<16xf32>) -> memref<8xf32> { + %cst = constant 8 : index + %res = alloc() : memref<8xf32> + + "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %arg1, %arg0) + { kernel = @kernel_module::@kernel } + : (index, index, index, index, index, index, + memref<16xf32>, memref<32xf32>) -> () + + return %res : memref<8xf32> +} + +} + +// ----- + +module attributes 
{gpu.container_module} { + +gpu.module @kernel_module { + // expected-error @+1 {{result 0 of containing function is not an argument to the kernel}} + gpu.func @kernel(%arg0: memref<16xf32>, %arg1: memref<32xf32>, + %arg2: memref<8xf32>) kernel { + gpu.return + } +} + +func @caller(%arg0: memref<32xf32>, %arg1: memref<16xf32>) -> memref<8xf32> { + %cst = constant 8 : index + %res = alloc() : memref<8xf32> + %fake = alloc() : memref<8xf32> + + "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %arg1, %arg0, %fake) + { kernel = @kernel_module::@kernel } + : (index, index, index, index, index, index, + memref<16xf32>, memref<32xf32>, memref<8xf32>) -> () + + return %res : memref<8xf32> +} + +} + +// ----- + +module attributes {gpu.container_module} { + +gpu.module @kernel_module { + // expected-error @+1 {{argument 1 to containing function is not an argument to the kernel}} + gpu.func @kernel(%arg0: memref<16xf32>, %arg1: memref<32xf32>, + %arg2: memref<8xf32>) kernel { + gpu.return + } +} + +func @caller(%arg0: memref<32xf32>, %arg1: memref<16xf32>) -> memref<8xf32> { + %cst = constant 8 : index + %res = alloc() : memref<8xf32> + %fake = alloc() : memref<16xf32> + + "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %fake, %arg0, %res) + { kernel = @kernel_module::@kernel } + : (index, index, index, index, index, index, + memref<16xf32>, memref<32xf32>, memref<8xf32>) -> () + + return %res : memref<8xf32> +} + +} + +// ----- + +module attributes {gpu.container_module} { + +gpu.module @kernel_module { + gpu.func @kernel(%arg0: memref<8xf32>, %arg1: memref<16xf32>, + %arg2: memref<32xf32>) kernel { + gpu.return + } +} + +// expected-error @+1 {{surrounding function has more than one block}} +func @caller(%arg0: memref<32xf32>, %arg1: memref<16xf32>) -> memref<8xf32> { + %cst = constant 8 : index + %res = alloc() : memref<8xf32> + br ^bb1 + + ^bb1: + "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %res, %arg1, %arg0) + { kernel = @kernel_module::@kernel } + : (index, index, index, index, index, index, + memref<8xf32>, memref<16xf32>, memref<32xf32>) -> () + + return %res : memref<8xf32> +} + +} diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/store_forwarding_pass.mlir b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/store_forwarding_pass.mlir new file mode 100644 index 00000000000..8b993bb56a5 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/passes/store_forwarding_pass.mlir @@ -0,0 +1,72 @@ +// RUN: xla-mlir-gpu-opt --mlir-gpu-store-forwarding %s | FileCheck %s + +// CHECK-LABEL: @forward +func @forward() -> f32 { + %0 = alloc() : memref<1024xf32> + %c42 = constant 24 : index + // CHECK: %[[CST:.*]] = constant 1.0 + %c1 = constant 1.0 : f32 + store %c1, %0[%c42] : memref<1024xf32> + // CHECK-NOT: load + %1 = load %0[%c42] : memref<1024xf32> + // CHECK: return %[[CST]] + return %1 : f32 +} + +// CHECK-LABEL: @forward_alloca +func @forward_alloca() -> f32 { + %0 = alloca() : memref<1024xf32> + %c42 = constant 24 : index + // CHECK: %[[CST:.*]] = constant 1.0 + %c1 = constant 1.0 : f32 + store %c1, %0[%c42] : memref<1024xf32> + // CHECK-NOT: load + %1 = load %0[%c42] : memref<1024xf32> + // CHECK: return %[[CST]] + return %1 : f32 +} + +// CHECK-LABEL: @wrong_index +func @wrong_index() -> f32 { + %0 = alloc() : memref<1024xf32> + %c42 = constant 24 : index + %c12 = constant 12 : index + %c1 = constant 1.0 : f32 + store %c1, %0[%c42] : memref<1024xf32> + // CHECK: %[[RES:.*]] = load + %1 = load %0[%c12] : memref<1024xf32> + // CHECK: return 
%[[RES]] + return %1 : f32 +} + +// CHECK-LABEL: @wrong_memref +func @wrong_memref() -> f32 { + %0 = alloc() : memref<1024xf32> + %1 = alloc() : memref<1024xf32> + %c42 = constant 24 : index + %c1 = constant 1.0 : f32 + store %c1, %0[%c42] : memref<1024xf32> + // CHECK: %[[RES:.*]] = load + %2 = load %1[%c42] : memref<1024xf32> + // CHECK: return %[[RES]] + return %2 : f32 +} + +// CHECK-LABEL: @with_parallel_loop +func @with_parallel_loop() { + %0 = alloc() : memref<1024xf32> + %c0 = constant 0 : index + %c42 = constant 24 : index + %c1 = constant 1 : index + // CHECK: %[[CST:.*]] = constant 1.100000e+01 : f32 + %c11 = constant 1.100000e+01 : f32 + store %c11, %0[%c42] : memref<1024xf32> + // CHECK: scf.parallel + scf.parallel (%i0) = (%c0) to (%c42) step (%c1) { + // CHECK-NOT: load + %1 = load %0[%c42] : memref<1024xf32> + // CHECK-NEXT: store %[[CST]] + store %1, %0[%c0] : memref<1024xf32> + } + return +} diff --git a/tensorflow/compiler/xla/service/mlir_gpu/xla_mlir_gpu_opt.cc b/tensorflow/compiler/xla/service/mlir_gpu/xla_mlir_gpu_opt.cc new file mode 100644 index 00000000000..cbda9a30a07 --- /dev/null +++ b/tensorflow/compiler/xla/service/mlir_gpu/xla_mlir_gpu_opt.cc @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/InitAllDialects.h" // from @llvm-project +#include "mlir/InitAllPasses.h" // from @llvm-project +#include "mlir/Support/MlirOptMain.h" // from @llvm-project +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/register.h" +#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/register_passes.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/passes.h" + +int main(int argc, char **argv) { + mlir::registerAllPasses(); + mlir::mhlo::registerAllMhloPasses(); + mlir::lmhlo::registerAllLmhloPasses(); + xla::mlir_gpu::registerXlaMlirGpuPasses(); + + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::mhlo::registerAllMhloDialects(registry); + + return failed(mlir::MlirOptMain( + argc, argv, "XLA mlir gpu backend pass driver\n", registry)); +} diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index a21cec538d1..c5c2d081686 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -338,6 +339,21 @@ bool MultiOutputFusion::LegalToFuseMainConstraints(HloInstruction* instr1, if (!ShapesCompatibleForFusion(instr1, instr2)) { return false; } + + // If both nodes are in-place operations and they use a common in-place + // operand, we can't fuse these two. + for (const auto& operand_and_output_index1 : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instr1)) { + const HloInstruction* operand = + instr1->operand(operand_and_output_index1.first.operand_number); + for (const auto& operand_and_output_index2 : + HloDataflowAnalysis::GetInPlaceInputOutputPairs(instr2)) { + if (operand == + instr2->operand(operand_and_output_index2.first.operand_number)) { + return false; + } + } + } return true; } diff --git a/tensorflow/compiler/xla/service/qr_expander.cc b/tensorflow/compiler/xla/service/qr_expander.cc new file mode 100644 index 00000000000..d1b1526ed30 --- /dev/null +++ b/tensorflow/compiler/xla/service/qr_expander.cc @@ -0,0 +1,466 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/qr_expander.h" + +#include +#include + +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/loops.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/slicing.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace xla { + +namespace { + +std::vector ConcatVectors(absl::Span xs, + absl::Span ys) { + std::vector output; + output.reserve(xs.size() + ys.size()); + std::copy(xs.begin(), xs.end(), std::back_inserter(output)); + std::copy(ys.begin(), ys.end(), std::back_inserter(output)); + return output; +} + +// Computes a Householder reflection of the form: +// H = I - tau v v.T. +// such that +// H . ( x1 ) = ( x1 ) +// ( x2 ) = ( x2 ) +// ( ... ) = ( ... ) +// ( xk ) = ( beta ) +// ( ... ) ( 0 ) +// ( ... ) ( 0 ) +// Unlike the usual formulation, we allow the caller to supply 'k' rather than +// only providing the relevant part of 'x' to maintain XLA's static shape +// invariant. In addition, the implementation supports batching. 
+// Pseudo-code, without batching: +// alpha = x[k] +// x_copy = np.copy(x) +// x_copy[:k+1] = 0 +// xnorm = norm2(x_copy) +// if xnorm == 0 and np.imag(alpha) == 0: +// beta = alpha +// tau = 0 +// v = np.zeros_like(x) +// else: +// beta = -np.sign(np.real(alpha)) * np.sqrt(alpha * np.conj(alpha) + xnorm) +// if np.issubdtype(x.dtype, np.complexfloating): +// tau = (beta - alpha) / beta +// else: +// tau = (beta - np.real(alpha) / beta) + (-np.imag(alpha) / beta) * 1j +// v = x / (alpha - beta) +// v[k] = 1 +// return (v, tau, beta) +// TODO(phawkins): LAPACK's xLARFG implementation has code for handling +// overflows in the norm/beta calculations. Perhaps do the same here. +Status House(XlaOp x, XlaOp k, absl::Span batch_dims, + const int64 m, XlaOp* v, XlaOp* tau, XlaOp* beta) { + XlaBuilder* const builder = x.builder(); + TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x)); + const PrimitiveType type = x_shape.element_type(); + + std::vector batch_dim_ids(batch_dims.size()); + std::iota(batch_dim_ids.begin(), batch_dim_ids.end(), 0); + const int64 minor_dim = batch_dims.size(); + + XlaOp zero = ScalarLike(x, 0.0); + + // alpha = x[k] + XlaOp alpha = Reshape(DynamicSliceInMinorDims(x, {k}, {1}), batch_dims); + + // Compute x[k+1:] (padded with zeros in elements 0..k) + XlaOp iota = Iota(builder, S32, m); + XlaOp x_after_k = Mul(x, ConvertElementType(Gt(iota, k), type), + /*broadcast_dimensions=*/{minor_dim}); + + XlaOp sigma_is_zero; + if (primitive_util::IsComplexType(type)) { + // sigma = np.dot(x[k+1:], np.conj(x[k+1:])) + // TODO(phawkins): this calculation may be numerically unstable. + auto x_squared = Real(x_after_k * Conj(x_after_k)); + auto sigma = + Reduce(x_squared, ScalarLike(x_squared, 0.0), + CreateScalarAddComputation( + primitive_util::ComplexComponentType(type), builder), + {minor_dim}); + // mu = np.sqrt(x[k]*np.con(x[k]) + sigma) + auto mu = Sqrt(Real(alpha * Conj(alpha)) + sigma); + + sigma_is_zero = Eq(sigma, ScalarLike(sigma, 0)); + sigma_is_zero = And(sigma_is_zero, Eq(Imag(alpha), ScalarLike(sigma, 0))); + + *beta = Select(Lt(Real(alpha), ScalarLike(sigma, 0)), ScalarLike(mu, 1), + ScalarLike(mu, -1)) * + mu; + *beta = Select(sigma_is_zero, Real(alpha), *beta); + *tau = Complex((*beta - Real(alpha)) / *beta, -Imag(alpha) / *beta); + } else { + // sigma = np.dot(x[k+1:], x[k+1:]) + // TODO(phawkins): this calculation may be numerically unstable. + auto sigma = Reduce(x_after_k * x_after_k, zero, + CreateScalarAddComputation(type, builder), {minor_dim}); + // mu = np.sqrt(x[k]*x[k] + sigma) + auto mu = Sqrt(Square(alpha) + sigma); + sigma_is_zero = Eq(sigma, zero); + + XlaOp one = ScalarLike(x, 1.0); + *beta = Select(Lt(alpha, zero), one, -one) * mu; + *beta = Select(sigma_is_zero, alpha, *beta); + *tau = (*beta - alpha) / *beta; + } + *tau = Select(sigma_is_zero, ZerosLike(*tau), *tau); + + auto divisor = + Select(sigma_is_zero, Broadcast(ScalarLike(alpha, 1), batch_dims), + alpha - ConvertElementType(*beta, type)); + + auto e_k = Broadcast(ConvertElementType(Eq(iota, k), type), + std::vector(batch_dims.size(), 1)); + + // Form v as [0, 0, ..., 1] ++ x[k+1:] / divisor + // If sigma is zero, x[k+1:] is zero, so use any non-zero divisor. + *v = e_k + Div(x_after_k, divisor, /*broadcast_dimensions=*/batch_dim_ids); + return Status::OK(); +} + +} // namespace + +// Householder QR decomposition. Algorithm 5.2.1 from Golub and Van +// Loan "Matrix Computations", 4th Edition. 
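For reference, before the unblocked QR routine that this comment introduces, here is a small runnable NumPy sketch of the reflector computed by House() above, restricted to the real-valued, unbatched case of the pseudo-code. The function name `house` and the explicit-matrix check are illustrative additions, not part of the XLA sources; the sign conventions (beta = -sign(alpha) * mu, tau = (beta - alpha) / beta, v = x / (alpha - beta), v[k] = 1) follow the implementation above.

import numpy as np

def house(x, k):
    # Householder reflector H = I - tau * outer(v, v) with v[:k] == 0 and
    # v[k] == 1, chosen so that (H @ x)[k] == beta and (H @ x)[k+1:] == 0.
    alpha = x[k]
    sigma = np.dot(x[k + 1:], x[k + 1:])  # squared norm of the tail
    if sigma == 0.0:
        # Nothing to annihilate: tau == 0 makes H the identity.
        return np.zeros_like(x), 0.0, alpha
    mu = np.sqrt(alpha * alpha + sigma)
    beta = mu if alpha < 0.0 else -mu
    tau = (beta - alpha) / beta
    v = np.zeros_like(x)
    v[k] = 1.0
    v[k + 1:] = x[k + 1:] / (alpha - beta)
    return v, tau, beta

# Quick check: the explicit reflector zeroes everything below position k and
# leaves the entries above it untouched.
x = np.array([3.0, 1.0, 4.0, 1.0, 5.0])
v, tau, beta = house(x, 1)
H = np.eye(5) - tau * np.outer(v, v)
print(np.round(H @ x, 6))  # [3., beta, 0., 0., 0.] with beta == -sqrt(43)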
This is an unblocked implementation +// used as an inner routine of the blocked implementation. +// Algorithm is adapted slightly so the shapes inside the loop are static, at +// the cost of some redundant computation. Since this is used as an inner block +// kernel, accumulates the Householder transformations (vs, taus) rather than +// the matrix q. +// Equivalent Python code, without batching: +// def qr(a): +// m = a.shape[0] +// n = a.shape[1] +// taus = np.zeros([n]) +// for j in xrange(min(m, n)): +// v, tau, beta = house(a[:, j], j) +// a[:, j+1:] -= np.conj(tau) * np.dot(v[:, np.newaxis], +// np.dot(np.conj(v[np.newaxis, :]), a[:, j+1:])) +// # Form column j explicitly rather than relying on the precision of the +// # Householder update. +// a[j, j] = beta +// a[j+1:, j] = v[j+1:] +// taus[j] = tau +// return (a, taus) +StatusOr QrExpander::QrBlock( + XlaOp a, PrecisionConfig::Precision precision) { + XlaBuilder* builder = a.builder(); + TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); + const int num_dims = a_shape.rank(); + if (num_dims < 2) { + return InvalidArgument("Argument to QR must have rank >= 2; got shape %s", + a_shape.ToString()); + } + PrimitiveType type = a_shape.element_type(); + + const int64 m = ShapeUtil::GetDimension(a_shape, -2); + const int64 n = ShapeUtil::GetDimension(a_shape, -1); + + const int64 num_batch_dims = num_dims - 2; + std::vector batch_dims(num_batch_dims); + for (int i = 0; i < num_batch_dims; ++i) { + batch_dims[i] = ShapeUtil::GetDimension(a_shape, i); + } + + std::vector batch_dim_indices(num_batch_dims); + std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); + + auto qr_body_fn = [&](XlaOp j, absl::Span values, + XlaBuilder* builder) -> StatusOr> { + auto a = values[0]; + auto taus = values[1]; + + // v, tau, beta = house(a[:, j], j) + auto x = DynamicSliceInMinorDims(a, {j}, {1}); + XlaOp v, tau, beta; + TF_RETURN_IF_ERROR(House(Collapse(x, {num_dims - 2, num_dims - 1}), j, + batch_dims, m, &v, &tau, &beta)); + + const int64 minor_dim = batch_dims.size(); + auto iota_mn = Iota( + builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {m, n})), + minor_dim + 1); + + std::vector shape = batch_dims; + shape.push_back(1); + shape.push_back(m); + auto v_broadcast = Reshape(v, shape); + // a[:, j+1:] -= np.conj(tau) * (v[:, np.newaxis] @ + // (np.conj(v[np.newaxis, :]) @ a[:, j+1:])) + // We use masking rather than a loop-variant shape to handle the j+1: + // indexing. 
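The loop body of QrBlock, continued below, mirrors the Python pseudo-code just shown. As a runnable, real-valued sketch that builds on the `house` helper from the previous snippet (`qr_unblocked` is an illustrative name, not an XLA symbol):

def qr_unblocked(a):
    # Householder QR of an m x n matrix, following the pseudo-code above.
    # Returns `a` overwritten with R on and above the diagonal and the
    # Householder vectors v[j+1:] below it, plus the scalar factors taus.
    a = np.array(a, dtype=float)
    m, n = a.shape
    p = min(m, n)
    taus = np.zeros(p)
    for j in range(p):
        v, tau, beta = house(a[:, j].copy(), j)
        # Rank-1 update of the trailing columns: a[:, j+1:] -= tau * v (v^T a).
        a[:, j + 1:] -= tau * np.outer(v, v @ a[:, j + 1:])
        # Form column j explicitly instead of relying on the update.
        a[j, j] = beta
        a[j + 1:, j] = v[j + 1:]
        taus[j] = tau
    return a, taus

# Check: rebuild Q from the stored reflectors and compare Q @ R with A.
A = np.random.default_rng(0).normal(size=(5, 3))
packed, taus = qr_unblocked(A)
m, n = A.shape
Q = np.eye(m)
for j in range(n):
    v = np.zeros(m)
    v[j] = 1.0
    v[j + 1:] = packed[j + 1:, j]
    Q = Q @ (np.eye(m) - taus[j] * np.outer(v, v))
print(np.allclose(Q @ np.triu(packed), A))  # True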
+ auto vva = BatchDot(MaybeConjugate(v_broadcast, true), + Select(Lt(j, iota_mn), a, ZerosLike(a)), precision); + vva = BatchDot(v_broadcast, true, vva, false, precision); + a = a - Mul(MaybeConjugate(tau, true), vva, + /*broadcast_dimensions=*/batch_dim_indices); + + // a[j, j] = beta + // a[j+1:,j] = v[j+1:] + auto iota = Reshape(Iota(a.builder(), S32, m), {m, 1}); + auto predecessor_mask = ConvertElementType(Lt(iota, j), type); + auto mask = Broadcast(ConvertElementType(Eq(iota, j), type), + std::vector(batch_dims.size(), 1)); + auto successor_mask = Gt(Iota(a.builder(), S32, m), j); + auto new_x = Mul(x, predecessor_mask, + /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}) + + Mul(ConvertElementType(beta, type), mask, + /*broadcast_dimensions=*/batch_dim_indices); + new_x = Add( + new_x, Select(Broadcast(successor_mask, batch_dims), v, ZerosLike(v)), + /*broadcast_dimensions=*/ConcatVectors(batch_dim_indices, {minor_dim})); + // Update a[:,j] + std::vector dim_ids(num_dims); + std::iota(dim_ids.begin(), dim_ids.end(), 0); + new_x = BroadcastInDim(new_x, ConcatVectors(batch_dims, {m, n}), + /*broadcast_dimensions=*/dim_ids); + a = Select(Eq(iota_mn, j), new_x, a); + + // taus[j] = tau + std::vector tau_broadcast_dims(batch_dims.size()); + std::iota(tau_broadcast_dims.begin(), tau_broadcast_dims.end(), 0); + + auto iota_n = + Iota(builder, ShapeUtil::MakeShape(S32, ConcatVectors(batch_dims, {n})), + minor_dim); + auto taus_zeros = ZerosLike(taus); + auto taus_update = Select( + Eq(iota_n, j), + Add(taus_zeros, tau, /*broadcast_dimensions=*/tau_broadcast_dims), + taus_zeros); + taus = taus + taus_update; + return std::vector{a, taus}; + }; + + auto taus = Zeros( + builder, + ShapeUtil::MakeShape(type, ConcatVectors(batch_dims, {std::min(m, n)}))); + + TF_ASSIGN_OR_RETURN(auto values, ForEachIndex(std::min(m, n), S32, qr_body_fn, + {a, taus}, "qr", builder)); + + QrResult result; + result.a = values[0]; + result.taus = values[1]; + return result; +} + +// Computes an upper triangular matrix T such that (I - Y @ T @ Y^t) is a +// product of the elementary Householder reflectors given by `vs` and `taus`. +// +// Schreiber, Robert, and Charles Van Loan. "A storage-efficient WY +// representation for products of Householder transformations." SIAM Journal on +// Scientific and Statistical Computing 10.1 (1989): 53-57. +// +// def compact_wy(vs, taus): +// m, n = vs.shape[-2:] +// t = np.eye(n) * -taus +// # We premultiply Y.T @ vs, since we would prefer to compute a single matrix +// # multiplication to many matrix-vector products. 
+// vtv = -taus[None, :] * np.triu(np.conj(vs.T) @ vs, 1) + np.eye(n) +// for i in range(1, n): +// t[:, i] = scipy.linalg.blas.strmm(t, vtv[:, i]) +// return t +StatusOr QrExpander::CompactWYRepresentation( + PrimitiveType type, absl::Span batch_dims, XlaOp vs, + XlaOp taus, int64 m, int64 n, PrecisionConfig::Precision precision) { + XlaBuilder* builder = vs.builder(); + + std::vector batch_dim_indices(batch_dims.size()); + std::iota(batch_dim_indices.begin(), batch_dim_indices.end(), 0); + int64 n_index = batch_dims.size() + 1; + + auto body_fn = [&](XlaOp j, absl::Span values, + XlaBuilder* builder) -> StatusOr> { + // w has shape [..., m, n] + auto t = values[0]; + const auto vtv = values[1]; + + // yv has shape [..., n, 1] + auto yv = DynamicSliceInMinorDims(vtv, {j}, {1}); + + // z has shape [..., n, 1] + auto z = BatchDot(t, yv, precision); + + t = DynamicUpdateSliceInMinorDims(t, z, {j}); + + return std::vector{t, vtv}; + }; + + auto tau_scale = BroadcastInDim(-taus, ConcatVectors(batch_dims, {1, n}), + ConcatVectors(batch_dim_indices, {n_index})); + + auto eye = Broadcast(IdentityMatrix(builder, type, n, n), batch_dims); + auto t = eye; + + auto vtv = BatchDot(MaybeConjugate(vs, true), /*transpose_x=*/true, vs, + /*transpose_y=*/false, precision); + vtv = Select(TriangleMask(vtv, 0), ZerosLike(vtv), vtv); + vtv = (vtv + eye) * tau_scale; + + TF_ASSIGN_OR_RETURN(auto values, + ForEachIndex(n, S32, body_fn, {t, vtv}, "wy", builder)); + return values[0]; +} + +// Block Householder QR Factorization. Algorithm 5.2.2 of Golub and van Loan. +// def qr_blocked(a, block_size): +// m = a.shape[0] +// n = a.shape[1] +// q = np.eye(m) +// for i in xrange(0, min(m, n), block_size): +// k = min(block_size, min(m, n) - s) +// (a, taus) = qr(a[i:, i:i+k]) +// y = np.eye(m, n) + np.tril(a, -1) +// t = CompactWYRepresentation(vs, taus, m-i, k) +// a[i:, i+k:] += (y @ np.conj(t.T)) @ (np.conj(y.T) @ a[i:, i+k:]) +// q[:, i:] += (q[:, i:] @ y) @ np.conj((y @ np.conj(t.T)).T) +// return (q, a) +StatusOr QrExpander::BuildQrDecomposition( + XlaOp a, int64 block_size, PrecisionConfig::Precision precision) { + XlaBuilder* builder = a.builder(); + TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); + const int num_dims = a_shape.rank(); + if (num_dims < 2) { + return InvalidArgument("Arguments to QR must have rank >= 2: got shape %s", + a_shape.ToString()); + } + PrimitiveType type = a_shape.element_type(); + + const int64 m = ShapeUtil::GetDimension(a_shape, -2); + const int64 n = ShapeUtil::GetDimension(a_shape, -1); + const int64 p = std::min(m, n); + + if (block_size < 1) { + return InvalidArgument("block_size argument to QR must be >= 1; got %d", + block_size); + } + + const int64 num_batch_dims = num_dims - 2; + std::vector batch_dims(num_batch_dims); + for (int i = 0; i < num_batch_dims; ++i) { + batch_dims[i] = ShapeUtil::GetDimension(a_shape, i); + } + + auto q = Broadcast(IdentityMatrix(builder, type, m, m), batch_dims); + for (int64 i = 0; i < p; i += block_size) { + int64 k = std::min(block_size, p - i); + + auto a_block = SliceInMinorDims(a, {i, i}, {m, i + k}); + TF_ASSIGN_OR_RETURN(auto qr_block, QrBlock(a_block, precision)); + auto y = Add( + IdentityMatrix(builder, type, m - i, k), + Select(TriangleMask(qr_block.a, -1), qr_block.a, ZerosLike(qr_block.a)), + /*broadcast_dimensions=*/{num_dims - 2, num_dims - 1}); + + a = UpdateSliceInMinorDims(a, qr_block.a, {i, i}); + + // Compute the I + Y @ T @ Y^t block representation of a product of + // Householder matrices. 
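Before the block-reflector application that follows, the compact-WY construction and the blocked driver can be sketched in NumPy, continuing the previous snippets (real-valued only). One deliberate difference: the sketch uses the standard Schreiber-Van Loan sign convention H_0 ... H_{k-1} = I - Y @ T @ Y.T with +tau on the diagonal of T, whereas the code above folds the minus sign into T and therefore applies the trailing update with a `+=`. The names `compact_wy` and `qr_blocked` follow the pseudo-code and are not XLA symbols.

def compact_wy(Y, taus):
    # Forward ("columnwise") accumulation of the block reflector T so that
    # H_0 @ H_1 @ ... @ H_{k-1} == I - Y @ T @ Y.T (Schreiber & Van Loan).
    k = Y.shape[1]
    T = np.zeros((k, k))
    for j in range(k):
        T[:j, j] = -taus[j] * (T[:j, :j] @ (Y[:, :j].T @ Y[:, j]))
        T[j, j] = taus[j]
    return T

def qr_blocked(A, block_size=2):
    # Blocked Householder QR following the qr_blocked pseudo-code above,
    # with the sign convention noted in the lead-in. Returns (Q, R).
    A = np.array(A, dtype=float)
    m, n = A.shape
    Q = np.eye(m)
    for i in range(0, min(m, n), block_size):
        k = min(block_size, min(m, n) - i)
        panel, taus = qr_unblocked(A[i:, i:i + k])
        A[i:, i:i + k] = panel
        # Y holds the unit lower-trapezoidal Householder vectors of the panel.
        Y = np.tril(panel, -1) + np.eye(m - i, k)
        T = compact_wy(Y, taus)
        # Trailing update: A[i:, i+k:] = (I - Y T Y^T)^T @ A[i:, i+k:].
        A[i:, i + k:] -= Y @ (T.T @ (Y.T @ A[i:, i + k:]))
        # Accumulate Q: Q[:, i:] = Q[:, i:] @ (I - Y T Y^T).
        Q[:, i:] -= (Q[:, i:] @ Y) @ (T @ Y.T)
    return Q, np.triu(A)

# Check: Q is orthogonal and Q @ R reproduces the input.
A = np.random.default_rng(1).normal(size=(6, 4))
Q, R = qr_blocked(A, block_size=2)
print(np.allclose(Q @ R, A), np.allclose(Q.T @ Q, np.eye(6)))  # True True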
+ TF_ASSIGN_OR_RETURN( + auto t, CompactWYRepresentation(type, batch_dims, y, qr_block.taus, + m - i, k, precision)); + + // a[i:, i+k:] += (y @ np.conj(t.T)) @ (np.conj(y.T) @ a[i:, i+k:]) + auto yt = BatchDot(y, /*transpose_x=*/false, MaybeConjugate(t, true), + /*transpose_y=*/true, precision); + auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n}); + auto a_update = + BatchDot(MaybeConjugate(y, true), /*transpose_x=*/true, a_panel, + /*transpose_y=*/false, precision); + a_update = BatchDot(yt, a_update, precision); + a_panel = a_panel + a_update; + a = UpdateSliceInMinorDims(a, a_panel, {i, i + k}); + + // q[:, i:] += (q[:, i:] @ y) @ np.conj((y @ np.conj(t.T)).T) + auto q_panel = SliceInMinorDims(q, {0, i}, {m, m}); + auto q_update = BatchDot(q_panel, y, precision); + q_update = + BatchDot(q_update, /*transpose_x=*/false, MaybeConjugate(yt, true), + /*transpose_y=*/true, precision); + q_panel = q_panel + q_update; + q = UpdateSliceInMinorDims(q, q_panel, {0, i}); + } + + return Tuple(builder, {q, UpperTriangle(a)}); +} + +bool QrExpander::InstructionMatchesPattern(HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kCustomCall && + instruction->custom_call_target() == "QrDecomposition"; +} + +StatusOr QrExpander::ExpandInstruction( + HloInstruction* instruction) { + const string name = + absl::StrFormat("xla.qr_%s", instruction->operand(0)->shape().ToString()); + + HloModule* module = instruction->parent()->parent(); + + HloComputation*& computation = + computation_cache_.emplace(name, nullptr).first->second; + if (!computation) { + // Builds a new expansion. + // + // TODO(b/62327888): We do something unusual here: we build the computation + // using the XlaBuilder API, which is nominally an XLA client API. We do + // this because the external APIs for building complicated computations + // (XlaBuilder) are much more ergonomic than the internal ones. As it turns + // out, XlaBuilder isn't really a client API—what it does is build a + // HloModuleProto protocol buffer, that we can then deserialize and clone + // into our HloModule. Ideally we would avoid the protocol buffer step; + // that is left as an exercise for future work. + XlaBuilder builder(name); + XlaOp a = Parameter(&builder, 0, instruction->operand(0)->shape(), "a"); + TF_ASSIGN_OR_RETURN( + XlaOp l, BuildQrDecomposition(a, + /*block_size=*/128, + /*precision=*/PrecisionConfig::HIGHEST)); + + TF_ASSIGN_OR_RETURN(XlaComputation xla_computation, builder.Build(l)); + + TF_ASSIGN_OR_RETURN(ProgramShape program_shape, + xla_computation.GetProgramShape()); + HloModuleConfig config(program_shape); + TF_ASSIGN_OR_RETURN(auto new_module, HloModule::CreateFromProto( + xla_computation.proto(), config)); + HloCloneContext context(module); + computation = + module->DeepCloneComputation(new_module->entry_computation(), &context); + } + + return instruction->parent()->AddInstruction(HloInstruction::CreateCall( + instruction->shape(), instruction->operands(), computation)); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/qr_expander.h b/tensorflow/compiler/xla/service/qr_expander.h new file mode 100644 index 00000000000..669ace39efb --- /dev/null +++ b/tensorflow/compiler/xla/service/qr_expander.h @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_QR_EXPANDER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_QR_EXPANDER_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/service/op_expander_pass.h" + +namespace xla { + +class QrExpander : public OpExpanderPass { + public: + absl::string_view name() const override { return "qr_expander"; } + + protected: + bool InstructionMatchesPattern(HloInstruction* instruction) override; + + StatusOr ExpandInstruction( + HloInstruction* instruction) override; + + struct QrResult { + // The upper-triangular matrix R, packed together with the lower-triangular + // elementary Householder reflectors `vs` below the diagonal. + XlaOp a; + + // Representation of the Householder matrices I - beta v v.T + XlaOp taus; // Shape: [..., min(m, n)] + }; + + virtual StatusOr QrBlock(XlaOp a, + PrecisionConfig::Precision precision); + + virtual StatusOr CompactWYRepresentation( + PrimitiveType type, absl::Span batch_dims, XlaOp vs, + XlaOp taus, int64 m, int64 n, PrecisionConfig::Precision precision); + + private: + StatusOr BuildQrDecomposition(XlaOp a, int64 block_size, + PrecisionConfig::Precision precision); + + // Mapping from op signatures to existing computations. + absl::flat_hash_map computation_cache_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_QR_EXPANDER_H_ diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 473a9ca7456..67c7896cebd 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -31,13 +31,18 @@ limitations under the License. 
namespace xla { -ShapedBuffer::ShapedBuffer(Shape on_host_shape, Shape on_device_shape, - const se::Platform* platform, int device_ordinal) - : on_host_shape_(std::move(on_host_shape)), - on_device_shape_(std::move(on_device_shape)), +ShapedBuffer::ShapedBuffer(Shape on_device_shape, const se::Platform* platform, + int device_ordinal) + : on_device_shape_(std::move(on_device_shape)), platform_(platform), device_ordinal_(device_ordinal), - buffers_(&on_device_shape_) {} + buffers_(&on_device_shape_) { + on_host_shape_ = ShapeUtil::DeviceShapeToHostShape(on_device_shape_); +} + +ShapedBuffer::ShapedBuffer(Shape on_host_shape, Shape on_device_shape, + const se::Platform* platform, int device_ordinal) + : ShapedBuffer(on_device_shape, platform, device_ordinal) {} ShapedBuffer::ShapedBuffer(ShapedBuffer&& s) : on_host_shape_(std::move(s.on_host_shape_)), @@ -52,8 +57,8 @@ ShapedBuffer::ShapedBuffer(ShapedBuffer&& s) } ShapedBuffer& ShapedBuffer::operator=(ShapedBuffer&& s) { - on_host_shape_ = std::move(s.on_host_shape_); on_device_shape_ = std::move(s.on_device_shape_); + on_host_shape_ = std::move(s.on_host_shape_); platform_ = s.platform_; device_ordinal_ = s.device_ordinal_; buffers_ = std::move(s.buffers_); @@ -68,12 +73,9 @@ ShapedBuffer::~ShapedBuffer() {} StatusOr ShapedBuffer::SubShapedBuffer( const ShapeIndex& index) const { - TF_ASSIGN_OR_RETURN(const Shape* host_sub_shape, - ShapeUtil::TryGetSubshape(on_host_shape(), index)); TF_ASSIGN_OR_RETURN(const Shape* device_sub_shape, ShapeUtil::TryGetSubshape(on_device_shape(), index)); - ShapedBuffer sub_shaped_buffer(*host_sub_shape, *device_sub_shape, platform_, - device_ordinal_); + ShapedBuffer sub_shaped_buffer(*device_sub_shape, platform_, device_ordinal_); TF_ASSIGN_OR_RETURN(ShapeTree sub_buffers, buffers_.SubShapeTree(index)); sub_shaped_buffer.set_buffers(std::move(sub_buffers)); @@ -88,12 +90,11 @@ void ShapedBuffer::clear() { } string ShapedBuffer::ToString() const { - string s = absl::StrCat( - "ShapedBuffer(", platform_->Name(), ":", device_ordinal(), - "), on-host shape=" + ShapeUtil::HumanStringWithLayout(on_host_shape()), - ", on-device shape=" + - ShapeUtil::HumanStringWithLayout(on_device_shape()), - ":\n"); + string s = + absl::StrCat("ShapedBuffer(", platform_->Name(), ":", device_ordinal(), + "), on-device shape=" + + ShapeUtil::HumanStringWithLayout(on_device_shape()), + ":\n"); ShapeUtil::ForEachSubshape( on_device_shape(), [this, &s](const Shape& subshape, const ShapeIndex& index) { @@ -116,13 +117,19 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) { return out; } +ScopedShapedBuffer::ScopedShapedBuffer(Shape on_device_shape, + se::DeviceMemoryAllocator* allocator, + int device_ordinal) + : ShapedBuffer(std::move(on_device_shape), allocator->platform(), + device_ordinal), + allocator_(allocator) {} + ScopedShapedBuffer::ScopedShapedBuffer(Shape on_host_shape, Shape on_device_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal) - : ShapedBuffer(std::move(on_host_shape), std::move(on_device_shape), - allocator->platform(), device_ordinal), - allocator_(allocator) {} + : ScopedShapedBuffer(std::move(on_device_shape), allocator, + device_ordinal) {} ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer, se::DeviceMemoryAllocator* allocator) @@ -171,13 +178,11 @@ void ScopedShapedBuffer::Deallocate() { } ScopedShapedBuffer ScopedShapedBuffer::TakeSubTree(ShapeIndexView index) { - const xla::Shape& sub_on_host_shape = - xla::ShapeUtil::GetSubshape(on_host_shape(), 
{index}); const xla::Shape& sub_on_device_shape = xla::ShapeUtil::GetSubshape(on_device_shape(), {index}); - ScopedShapedBuffer output(sub_on_host_shape, sub_on_device_shape, - memory_allocator(), device_ordinal()); + ScopedShapedBuffer output(sub_on_device_shape, memory_allocator(), + device_ordinal()); auto src_it = buffers().find(index); auto dst_it = output.buffers().begin(); while (dst_it != output.buffers().end()) { diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index 995b0ece7cd..7f1248998a6 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -43,6 +43,10 @@ class ShapedBuffer { // both the on-host and on-device shape are required. The on-device shape // determines the number of device allocations (DeviceMemoryBase) held by the // ShapedBuffer. + ShapedBuffer(Shape on_device_shape, const se::Platform* platform, + int device_ordinal); + + // TODO(b/170310047): remove this overload. ShapedBuffer(Shape on_host_shape, Shape on_device_shape, const se::Platform* platform, int device_ordinal); @@ -97,14 +101,18 @@ class ShapedBuffer { // Reset the shape of this shaped buffer and underlying buffer structure. // // Precondition: EqualStructure(this->on_device_shape_, on_device_shape). - void set_shapes(const Shape& on_host_shape, const Shape& on_device_shape) { + void set_shapes(const Shape& on_device_shape) { CHECK(ShapeUtil::EqualStructure(on_device_shape, on_device_shape_)) << "Structures are not the same. new: " << on_device_shape << ", old: " << on_device_shape_; - on_host_shape_ = on_host_shape; + on_host_shape_ = ShapeUtil::DeviceShapeToHostShape(on_device_shape); on_device_shape_ = on_device_shape; buffers_.replace_shape_ptr(&on_device_shape_); } + // TODO(b/170310047): remove this overload. + void set_shapes(const Shape& on_host_shape, const Shape& on_device_shape) { + set_shapes(on_device_shape); + } // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. @@ -119,7 +127,6 @@ class ShapedBuffer { string ToString() const; protected: - // The shape of the data when represented on the host. Shape on_host_shape_; // The shape of the data on the device. @@ -148,6 +155,10 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); class ScopedShapedBuffer : public ShapedBuffer { public: // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index. + explicit ScopedShapedBuffer(Shape on_device_shape, + se::DeviceMemoryAllocator* allocator, + int device_ordinal); + // TODO(b/170310047): remove this overload. 
explicit ScopedShapedBuffer(Shape on_host_shape, Shape on_device_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal); diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index a2c208d62e4..49751d10c5a 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -97,12 +97,12 @@ class TestAllocator : public se::DeviceMemoryAllocator { TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) { Shape s = ShapeUtil::MakeShape(F32, {1}); TestAllocator allocator; - ScopedShapedBuffer sb1(s, s, &allocator, /*device_ordinal=*/0); + ScopedShapedBuffer sb1(s, &allocator, /*device_ordinal=*/0); sb1.set_buffer( allocator.Allocate(/*device_ordinal=*/0, /*size=*/42).ValueOrDie(), /*index=*/{}); - ScopedShapedBuffer sb2(s, s, &allocator, /*device_ordinal=*/1); + ScopedShapedBuffer sb2(s, &allocator, /*device_ordinal=*/1); sb2.set_buffer( allocator.Allocate(/*device_ordinal=*/1, /*size=*/10).ValueOrDie(), /*index=*/{}); @@ -119,7 +119,7 @@ TEST(ScopedShapedBufferTest, TestTakeSubTree) { s = xla::ShapeUtil::MakeTupleShape(std::vector(2, s)); s = xla::ShapeUtil::MakeTupleShape(std::vector(3, s)); - ScopedShapedBuffer sb(s, s, &allocator, /*device_ordinal=*/0); + ScopedShapedBuffer sb(s, &allocator, /*device_ordinal=*/0); sb.buffers().ForEachMutableElement( [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { TF_ASSERT_OK_AND_ASSIGN( @@ -156,8 +156,7 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) { Shape tuple_shape = xla::ShapeUtil::MakeTupleShape({array_shape, array_shape}); TestAllocator allocator; - ScopedShapedBuffer sb(tuple_shape, tuple_shape, &allocator, - /*device_ordinal=*/0); + ScopedShapedBuffer sb(tuple_shape, &allocator, /*device_ordinal=*/0); sb.buffers().ForEachMutableElement( [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { TF_ASSERT_OK_AND_ASSIGN( @@ -182,7 +181,7 @@ void BM_TakeSubTree(int iters, int depth, int fan_out) { std::vector shapes(fan_out, shape); shape = xla::ShapeUtil::MakeTupleShape(shapes); } - xla::ScopedShapedBuffer shaped_buffer(shape, shape, /*allocator=*/&allocator, + xla::ScopedShapedBuffer shaped_buffer(shape, /*allocator=*/&allocator, /*device_ordinal=*/0); tensorflow::testing::StartTiming(); for (int i = 0; i < iters; ++i) { diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc index 7136ce82e25..6524973a08e 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -246,9 +246,31 @@ bool MaybeImproveInstructionSharding(HloSharding sharding, instruction->set_sharding(std::move(sharding)); return true; } - auto merged = MergeSharding(instruction->sharding(), &sharding, - may_combine_partial_sharding); - if (merged) { + int64 sharding_tiles = sharding.NumTiles(); + if (MergeSharding(instruction->sharding(), &sharding, + may_combine_partial_sharding)) { + // Override existing tiled sharding only when the new sharding is compatible + // with the existing one. This avoids unexpected resharding when `sharding` + // just has more tiles than existing sharding but they are not mergeable. 
+ if (instruction->shape().IsArray() && + !instruction->sharding().IsTileMaximal() && + sharding.NumTiles() == sharding_tiles) { + std::vector diff_dims; + for (int64 i = 0; i < instruction->shape().rank(); ++i) { + if (instruction->sharding().tile_assignment().dim(i) == + sharding.tile_assignment().dim(i)) { + continue; + } + if (instruction->sharding().tile_assignment().dim(i) != 1) { + return false; + } + diff_dims.push_back(i); + } + if (hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + sharding, diff_dims) != instruction->sharding()) { + return false; + } + } instruction->set_sharding(std::move(sharding)); return true; } @@ -476,7 +498,7 @@ bool SupportSpatialPartitioning(const HloInstruction* instruction, bool InferDotShardingFromOperands( HloInstruction* instruction, - const dot_as_convolution_util::DotGeneralAsConvolutionDimsInfo& dnums, + const dot_as_convolution_util::DotConvolutionDimsInfo& dnums, bool may_combine_partial_sharding) { auto from_operand = [&](int64 operand_index) { auto operand = instruction->operand(operand_index); @@ -543,9 +565,41 @@ bool InferDotShardingFromOperands( bool InferConvolutionShardingFromOperands(HloInstruction* instruction, int64 aggressiveness, bool may_combine_partial_sharding) { - if (auto dot_dims = dot_as_convolution_util::ParseDotGeneralFromConvolution( - instruction)) { - return InferDotShardingFromOperands(instruction, *dot_dims, + auto get_partitions_for_dims = + [&](const HloInstruction* inst, + absl::Span< + const dot_as_convolution_util::DotConvolutionDimsInfo::DimNums> + dims, + int lhs_or_rhs) { + int64 partitions = 1; + if (!inst->has_sharding()) { + return partitions; + } + const auto& sharding = inst->sharding(); + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_or_rhs == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else { + CHECK_EQ(lhs_or_rhs, 1); + partitions *= sharding.tile_assignment().dim(dim.rhs); + } + } + return partitions; + }; + auto dot_dims = + dot_as_convolution_util::ParseConvolutionDimsInfo(instruction); + const int64 lhs_conv_spatial_partitions = get_partitions_for_dims( + instruction->operand(0), dot_dims.conv_spatial_dims, 0); + const int64 rhs_conv_spatial_partitions = get_partitions_for_dims( + instruction->operand(1), dot_dims.conv_spatial_dims, 1); + if (dot_dims.conv_spatial_dims.empty() || + (lhs_conv_spatial_partitions == 1 && rhs_conv_spatial_partitions == 1 && + instruction->batch_group_count() == 1 && + instruction->feature_group_count() == 1)) { + return InferDotShardingFromOperands(instruction, dot_dims, may_combine_partial_sharding); } const auto& dnums = instruction->convolution_dimension_numbers(); @@ -597,6 +651,10 @@ bool CanPropagateThroughAtAgressiveLevel(const HloInstruction& inst, inst.opcode() != HloOpcode::kReshape) { return false; } + // Broadcast propagation should have at least aggressiveness 2. + if (aggressiveness < 2 && inst.opcode() == HloOpcode::kBroadcast) { + return false; + } return true; } @@ -743,14 +801,18 @@ bool InferShardingFromOperands(HloInstruction* instruction, return changed; } case HloOpcode::kBroadcast: { - const HloInstruction* op = instruction->operand(0); - if (!IsSpatiallyPartitioned(op) || op->sharding().IsReplicated()) { + // Make forward propagation through broadcast low priority to avoid + // resharding after broadcast. + if (aggressiveness < 3) { return false; } - // Heuristic: If an operand is more than 8 times fewer elements than its - // output, do not propagate sharding. 
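A standalone sketch of the guard added to MaybeImproveInstructionSharding above: a merged sharding is accepted only if every dimension on which it differs from the existing sharding is one the existing sharding leaves unsplit (tile count 1). The sketch works on plain vectors of per-dimension tile counts and omits the PartiallyReplicateTiledShardingOnDims equivalence check, so it approximates rather than reproduces the HloSharding logic.

#include <cstdint>
#include <iostream>
#include <vector>

// Returns true if `proposed` only splits dimensions that `existing` leaves
// unsplit (tile count 1); on any dimension where both are split, the counts
// must already agree. Mirrors the per-dimension guard in the hunk above,
// minus the partial-replication equivalence check.
bool ProposedRefinesExisting(const std::vector<int64_t>& existing,
                             const std::vector<int64_t>& proposed) {
  if (existing.size() != proposed.size()) return false;
  for (size_t i = 0; i < existing.size(); ++i) {
    if (existing[i] == proposed[i]) continue;
    if (existing[i] != 1) return false;  // would force a reshard
  }
  return true;
}

int main() {
  // Existing sharding splits dim 0 by 2; the proposal additionally splits dim 1.
  std::cout << ProposedRefinesExisting({2, 1}, {2, 2}) << "\n";  // 1 (compatible)
  // Proposal disagrees on a dimension that is already split: rejected.
  std::cout << ProposedRefinesExisting({2, 1}, {4, 1}) << "\n";  // 0
}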
- if (ShapeUtil::ElementsIn(instruction->shape()) > - 8 * ShapeUtil::ElementsIn(op->shape())) { + // Do not override existing tile sharding. This is likely from users. + if (IsSpatiallyPartitioned(instruction) && + !instruction->sharding().IsTileMaximal()) { + return false; + } + const HloInstruction* op = instruction->operand(0); + if (!IsSpatiallyPartitioned(op) || op->sharding().IsReplicated()) { return false; } // The output will be tiled along the broadcasted dimension the same way @@ -1031,7 +1093,7 @@ bool InferShardingFromOperands(HloInstruction* instruction, HloSharding InferDotOperandSharding( const HloInstruction* instruction, - const dot_as_convolution_util::DotGeneralAsConvolutionDimsInfo& dnums, + const dot_as_convolution_util::DotConvolutionDimsInfo& dnums, int64 operand_index, bool may_combine_partial_sharding) { auto operand = instruction->operand(operand_index); auto other = instruction->operand(1 - operand_index); @@ -1185,10 +1247,10 @@ absl::optional GetShardingFromUser( return HloSharding::Tile(new_tile_assignment); } case HloOpcode::kConvolution: { - if (auto dot_dims = - dot_as_convolution_util::ParseDotGeneralFromConvolution(&user)) { + auto dot_dims = dot_as_convolution_util::ParseConvolutionDimsInfo(&user); + if (dot_dims.conv_spatial_dims.empty()) { int64 op_idx = user.operand_index(&instruction); - return InferDotOperandSharding(&user, *dot_dims, op_idx, + return InferDotOperandSharding(&user, dot_dims, op_idx, may_combine_partial_sharding); } return absl::nullopt; @@ -1376,6 +1438,9 @@ absl::optional GetShardingFromUser( bool InferShardingFromUsers(HloInstruction* instruction, const ComputationMap& computation_map, int64 aggressiveness, bool is_spmd) { + if (aggressiveness < 2 && instruction->opcode() == HloOpcode::kBroadcast) { + return false; + } if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { return false; } @@ -1657,17 +1722,11 @@ StatusOr ShardingPropagation::Run(HloModule* module) { // indefinitely. int64 iterations = 0; auto run_to_fix_point = [&](int64 aggressiveness) { - absl::flat_hash_set workset; - for (const HloComputation* computation : module->computations()) { - for (const HloInstruction* instruction : computation->instructions()) { - // Remove the instructions where the sharding was provided from the - // outside so we don't modify them. - if (!provided_shardings.contains(instruction)) { - workset.insert(instruction); - } - } - } - while (!workset.empty()) { + absl::flat_hash_set already_inferred_from_operands; + absl::flat_hash_set already_inferred_from_users; + bool changed_last_iter = true; + while (changed_last_iter) { + changed_last_iter = false; int64 inferred_from_operand_counter = 0; int64 inferred_from_user_counter = 0; int64 instruction_counter = 0; @@ -1680,17 +1739,14 @@ StatusOr ShardingPropagation::Run(HloModule* module) { for (const HloInstruction* instruction : instructions) { already_sharded_counter += (instruction->has_sharding() ? 1 : 0); } - - instructions.erase( - std::remove_if(instructions.begin(), instructions.end(), - [&](HloInstruction* instruction) { - return !workset.contains(instruction); - }), - instructions.end()); - // First iterate the HLO graph in post order taking shardings from // operands. 
for (HloInstruction* instruction : instructions) { + if (already_inferred_from_operands.contains(instruction) || + provided_shardings.contains(instruction)) { + continue; + } + already_inferred_from_operands.insert(instruction); if (InferShardingFromOperands(instruction, computation_map, is_spmd_, aggressiveness)) { ++inferred_from_operand_counter; @@ -1698,31 +1754,37 @@ StatusOr ShardingPropagation::Run(HloModule* module) { VLOG(2) << "Add sharding (forward-pass): " << instruction->ToString(); maybe_computation_propagation(instruction); - for (auto user : instruction->users()) { - if (!provided_shardings.contains(user)) { - workset.insert(user); - } + for (auto operand : instruction->operands()) { + already_inferred_from_users.erase(operand); } - } else { - workset.erase(instruction); + for (auto user : instruction->users()) { + already_inferred_from_operands.erase(user); + } + changed_last_iter = true; } } // Then iterate the HLO graph in reverse post order taking shardings // from users. for (auto it = instructions.rbegin(); it != instructions.rend(); ++it) { + if (already_inferred_from_users.contains(*it) || + provided_shardings.contains(*it)) { + continue; + } + already_inferred_from_users.insert(*it); if (InferShardingFromUsers(*it, computation_map, aggressiveness, is_spmd_)) { ++inferred_from_user_counter; any_changed = true; VLOG(2) << "Add sharding (backward-pass): " << (*it)->ToString(); maybe_computation_propagation(*it); - workset.insert(*it); for (auto operand : (*it)->operands()) { - if (!provided_shardings.contains(operand)) { - workset.insert(operand); - } + already_inferred_from_users.erase(operand); } + for (auto user : (*it)->users()) { + already_inferred_from_operands.erase(user); + } + changed_last_iter = true; } } } @@ -1733,11 +1795,13 @@ StatusOr ShardingPropagation::Run(HloModule* module) { << inferred_from_operand_counter; VLOG(1) << " shardings inferred from users: " << inferred_from_user_counter; + VLOG(1) << " aggressiveness: " << aggressiveness; ++iterations; } }; - run_to_fix_point(0); - run_to_fix_point(1); + for (int64 aggressiveness = 0; aggressiveness < 4; ++aggressiveness) { + run_to_fix_point(aggressiveness); + } VLOG(1) << "Sharding propagation completed after " << iterations << " iterations"; diff --git a/tensorflow/compiler/xla/service/sharding_propagation_test.cc b/tensorflow/compiler/xla/service/sharding_propagation_test.cc index 03c77c2038c..8c4d8fc24ff 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation_test.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation_test.cc @@ -65,22 +65,6 @@ ENTRY %elementwise { op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); } -TEST_F(ShardingPropagationTest, BroadcastForwardPassNoSharding) { - const char* const hlo_string = R"( -HloModule module -ENTRY %broadcast { - %param0 = f32[7,11]{1,0} parameter(0), - sharding={devices=[2,2]0,1,2,3} - %broadcast = f32[5,7,11,13]{3,2,1,0} broadcast(%param0), dimensions={1,2} - ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%broadcast) -})"; - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(hlo_string)); - TF_ASSERT_OK_AND_ASSIGN(bool changed, - ShardingPropagation().Run(module.get())); - EXPECT_FALSE(changed); -} - // Regression Test for b/129569657. 
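A self-contained sketch of the fixed-point bookkeeping introduced in run_to_fix_point above: one already-inferred set per direction, invalidated for an instruction's neighbours whenever its sharding changes, with the outer loop repeating until an iteration makes no change. The Node/graph types and the take-any-neighbour's-label rule are invented stand-ins for HloInstruction and the real inference functions.

#include <iostream>
#include <optional>
#include <unordered_set>
#include <vector>

// Toy stand-in for an instruction: a node that may carry a "sharding" label.
struct Node {
  std::optional<int> label;
  std::vector<int> operands;  // edges to producers
  std::vector<int> users;     // edges to consumers
};

// Propagate labels to a fixed point, mirroring the bookkeeping in the hunk
// above: each direction keeps an "already inferred" set, and a change at a
// node re-enables inference for its neighbours in both directions.
void RunToFixPoint(std::vector<Node>& nodes) {
  std::unordered_set<int> done_from_operands, done_from_users;
  bool changed_last_iter = true;
  while (changed_last_iter) {
    changed_last_iter = false;
    // Forward sweep: take a label from any labeled operand.
    for (int i = 0; i < static_cast<int>(nodes.size()); ++i) {
      if (done_from_operands.count(i)) continue;
      done_from_operands.insert(i);
      if (nodes[i].label) continue;
      for (int op : nodes[i].operands) {
        if (nodes[op].label) {
          nodes[i].label = nodes[op].label;
          for (int u : nodes[i].users) done_from_operands.erase(u);
          for (int op2 : nodes[i].operands) done_from_users.erase(op2);
          changed_last_iter = true;
          break;
        }
      }
    }
    // Backward sweep: take a label from any labeled user.
    for (int i = static_cast<int>(nodes.size()) - 1; i >= 0; --i) {
      if (done_from_users.count(i)) continue;
      done_from_users.insert(i);
      if (nodes[i].label) continue;
      for (int u : nodes[i].users) {
        if (nodes[u].label) {
          nodes[i].label = nodes[u].label;
          for (int op : nodes[i].operands) done_from_users.erase(op);
          for (int u2 : nodes[i].users) done_from_operands.erase(u2);
          changed_last_iter = true;
          break;
        }
      }
    }
  }
}

int main() {
  // Chain 0 -> 1 -> 2 with a label only on node 2; backward passes fill 1, then 0.
  std::vector<Node> g(3);
  g[0].users = {1}; g[1].operands = {0}; g[1].users = {2}; g[2].operands = {1};
  g[2].label = 7;
  RunToFixPoint(g);
  std::cout << *g[0].label << " " << *g[1].label << " " << *g[2].label << "\n";  // 7 7 7
}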
TEST_F(ShardingPropagationTest, BroadcastForwardPass) { const char* const hlo_string = R"( @@ -530,6 +514,26 @@ ENTRY %pad { op::Sharding("{devices=[2,2]0,1,2,3}")); } +TEST_F(ShardingPropagationTest, PartialReplicatedPadForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %pad { + %input = f32[11,17]{1,0} parameter(0), + sharding={devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %pad_value = f32[] parameter(1) + %pad = f32[27,51]{1,0} pad(%input, %pad_value), padding=2_4_1x1_1_2 + ROOT %copy = f32[27,51]{1,0} copy(%pad) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "pad"), + op::Sharding("{devices=[2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ShardedPreferredOverReplicated) { const char* const hlo_string = R"( HloModule module @@ -653,6 +657,25 @@ ENTRY %slice { op::Sharding("{devices=[2,1]0,1}")); } +TEST_F(ShardingPropagationTest, PartialReplicatedStridedSlice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %slice { + %param = f32[17,13]{1,0} parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %slice = f32[7,5]{1,0} slice(%param), slice={[1:15:2], [5:10:1]} + ROOT %tuple = (f32[7,5]{1,0}) tuple(%slice) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "slice"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ReduceWindowBackwardPass) { const char* const hlo_string = R"( HloModule module @@ -1263,6 +1286,38 @@ ENTRY %conv { op::Sharding("{replicated}")); } +TEST_F(ShardingPropagationTest, + ConvolutionFilterIFOFPartitionedInputPartialReplicate) { + const char* const hlo_string = R"( + HloModule module + +ENTRY entry { + %lhs = f32[128,112,112,12] parameter(0) + %lhs.copy = f32[128,112,112,12] copy(f32[128,112,112,12] %lhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[7,7,12,64] parameter(1) + %rhs.copy = f32[7,7,12,64] copy(f32[7,7,12,64] %rhs), + sharding={devices=[1,1,2,2]0,1,2,3} + %conv = f32[128,56,56,64] convolution( + f32[128,112,112,12] %lhs.copy, + f32[7,7,12,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f + ROOT %copy = f32[128,56,56,64] copy(conv) +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + VLOG(1) << module->ToString(); + + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,1,1,2,2]0,2,1,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ConcatFromUserUnshardedDim) { const char* const hlo_string = R"( HloModule module @@ -1408,11 +1463,11 @@ ENTRY entry { ShardingPropagation().Run(module.get())); EXPECT_TRUE(changed); EXPECT_THAT(FindInstruction(module.get(), "tp"), - op::Sharding("{{devices=[3,1]0,1,2}}")); + op::Sharding("{{devices=[1,2]0,1}}")); EXPECT_THAT(FindInstruction(module.get(), "tgte"), - op::Sharding("{devices=[3,1]0,1,2}")); + op::Sharding("{devices=[1,2]0,1}")); EXPECT_THAT(FindInstruction(module.get(), "ttr"), - 
op::Sharding("{devices=[1,3]0,1,2}")); + op::Sharding("{devices=[2,1]0,1}")); EXPECT_THAT(FindInstruction(module.get(), "tr"), op::Sharding("{{devices=[1,3]0,1,2}}")); EXPECT_THAT(FindInstruction(module.get(), "fp"), @@ -1774,6 +1829,28 @@ ENTRY entry { op::Sharding("{devices=[2,1]0,1}")); } +TEST_F(ShardingPropagationTest, GatherFromIndex_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), + sharding={devices=[2,2]0,1,2,3 last_tile_dim_replicate} + %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9} + ROOT %copy = f32[3,9] copy(%gather) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gather"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, GatherFromDataOperand) { const char* hlo_string = R"( HloModule module @@ -1795,6 +1872,28 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherFromDataOperand_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9} + ROOT %copy = f32[3,9] copy(%gather) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gather"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, GatherToIndex) { const char* hlo_string = R"( HloModule module @@ -1816,6 +1915,28 @@ ENTRY entry { op::Sharding("{devices=[2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherToIndex_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %p1 = s32[3] parameter(1) + %indices = s32[3] copy(%p1) + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, GatherToIndex2) { const char* hlo_string = R"( HloModule module @@ -1839,6 +1960,30 @@ ENTRY entry { op::Sharding("{devices=[1,2,1]0,1}")); } +TEST_F(ShardingPropagationTest, GatherToIndex2_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %input = bf16[2,4819,4] parameter(0), sharding={replicated} + %p1 = s32[2,1000,2] parameter(1) + %indices = s32[2,1000,2] copy(%p1) + ROOT %gather = bf16[2,1000,4] + 
gather(bf16[2,4819,4] %input, s32[2,1000,2] %indices), + offset_dims={2}, collapsed_slice_dims={0,1}, + start_index_map={0,1}, index_vector_dim=2, slice_sizes={1,1,4}, + sharding={devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, GatherToIndex3) { const char* hlo_string = R"( HloModule module @@ -1883,6 +2028,27 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, GatherToDataOperand_PartialReplicate) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %p0 = f32[2,9] parameter(0) + %input = f32[2,9] copy(%p0) + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, DataOperandToScatter) { const char* const hlo_string = R"( HloModule module @@ -1914,6 +2080,38 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, DataOperandToScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={replicated} + %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + ROOT %copy = f32[2,9] copy(%scatter) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "scatter"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, UpdateOperandToScatter) { const char* const hlo_string = R"( HloModule module @@ -1945,6 +2143,70 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, UpdateOperandToScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + 
update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1 + ROOT %copy = f32[2,9] copy(%scatter) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "scatter"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + +TEST_F(ShardingPropagationTest, ScatterToDataOperand_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %p0 = f32[2,9] parameter(0) + %input = f32[2,9] copy(%p0) + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "input"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ScatterToDataOperand) { const char* const hlo_string = R"( HloModule module @@ -1976,6 +2238,38 @@ ENTRY entry { op::Sharding("{devices=[1,2]0,1}")); } +TEST_F(ShardingPropagationTest, ScatterToUpdateOperand_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0) + %indices = s32[3] parameter(1), sharding={replicated} + %p2 = f32[3,9] parameter(2) + %updates = f32[3,9] copy(%p2) + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "updates"), + op::Sharding("{devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ScatterToUpdateOperand) { const char* const hlo_string = R"( HloModule module @@ -2038,6 +2332,38 @@ ENTRY entry { op::Sharding("{devices=[2]0,1}")); } +TEST_F(ShardingPropagationTest, ScatterUpdateToIndex_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %p1 = s32[3] parameter(1), sharding={replicated} + %indices = s32[3] copy(%p1) + %updates = f32[3,9] parameter(2), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + 
inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "indices"), + op::Sharding("{devices=[2,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, ScatterIndexToUpdate) { const char* const hlo_string = R"( HloModule module @@ -2069,6 +2395,38 @@ ENTRY entry { op::Sharding("{devices=[2,1]0,1}")); } +TEST_F(ShardingPropagationTest, ScatterIndexToUpdate_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={replicated} + %indices = s32[3] parameter(1), + sharding={devices=[2,2]0,1,2,3 last_tile_dim_replicate} + %p2 = f32[3,9] parameter(2), sharding={replicated} + %updates = f32[3,9] copy(%p2) + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "updates"), + op::Sharding("{devices=[2,1,2]0,1,2,3 last_tile_dim_replicate}")); +} + TEST_F(ShardingPropagationTest, PartialShardingOnElementwise) { const char* const hlo_string = R"( HloModule module diff --git a/tensorflow/compiler/xla/service/space_to_batch_converter.cc b/tensorflow/compiler/xla/service/space_to_batch_converter.cc new file mode 100644 index 00000000000..47aee8ed5a8 --- /dev/null +++ b/tensorflow/compiler/xla/service/space_to_batch_converter.cc @@ -0,0 +1,478 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/xla/service/space_to_batch_converter.h" + +#include + +#include "absl/memory/memory.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/bitmap.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +namespace { + +// ConvolutionVisitor traverses the HLO computation and rewrites Convolution +// operations with small batch counts into convolutions with larger batch +// counts by moving space to batch. +class ConvolutionVisitor : public DfsHloVisitorWithDefault { + public: + // Default visitor action is to do nothing and return OK. + Status DefaultAction(HloInstruction* /*hlo_instruction*/) override { + return Status::OK(); + } + + Status HandleConvolution(HloInstruction* convolution) override; + + // Runs the visitor on a computation. + static bool Run(int64 limit_on_batch_size, HloComputation* computation); + + // Returns whether any convolution ops were rewritten. + const bool changed() const { return changed_; } + + ~ConvolutionVisitor() override = default; + + private: + explicit ConvolutionVisitor(int64 limit_on_batch_size, + HloComputation* computation) + : computation_(computation), limit_on_batch_size_(limit_on_batch_size) {} + + // Current HloComputation instance the ConvolutionVisitor is traversing. + HloComputation* computation_; + + // Whether rewrite has occurred. + bool changed_ = false; + + // Limit on batch size to apply this technique on. + int64 limit_on_batch_size_; +}; + +bool ConvolutionVisitor::Run(int64 limit_on_batch_size, + HloComputation* computation) { + ConvolutionVisitor visitor(limit_on_batch_size, computation); + TF_CHECK_OK(computation->Accept(&visitor)); + return visitor.changed_; +} + +Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) { + VLOG(1) << "Handling conv " << convolution->ToString(); + changed_ = false; + + ConvolutionDimensionNumbers dim_numbers = + convolution->convolution_dimension_numbers(); + + // If there are no spatial dims, we return. + if (dim_numbers.input_spatial_dimensions_size() < 1) { + return Status::OK(); + } + + // This is the spatial dimension we choose to spilt. + constexpr int64 kChosenSpatialDim = 0; + constexpr int64 kLowLimitForSplitCount = 4; + constexpr int64 kHighLimitForSplitCount = 24; + + // Batch in batch_group_count has different semantics (it isn't true batch). + // Consider supporting this case in future if needed. + if (convolution->batch_group_count() != 1) { + return Status::OK(); + } + + if (convolution->window().dimensions(kChosenSpatialDim).window_dilation() != + 1) { + return Status::OK(); + } + + // TODO(b/168316428): Support base dilations. 
+ if (convolution->window().dimensions(kChosenSpatialDim).base_dilation() != + 1) { + return Status::OK(); + } + + int64 activations_batch_dim = dim_numbers.input_batch_dimension(); + + const int64 old_batch_size = + convolution->operand(0)->shape().dimensions(activations_batch_dim); + + if (old_batch_size > limit_on_batch_size_) { + return Status::OK(); + } + + auto kernel = convolution->mutable_operand(1); + const auto& kernel_shape = kernel->shape(); + const int64 kernel_spatial_dim_size = kernel_shape.dimensions( + dim_numbers.kernel_spatial_dimensions(kChosenSpatialDim)); + + auto activations = convolution->mutable_operand(0); + + int64 spatial_dimension_to_split = + dim_numbers.input_spatial_dimensions(kChosenSpatialDim); + + const int64 input_dim_size = activations->shape().dimensions( + dim_numbers.input_spatial_dimensions(kChosenSpatialDim)); + + const int64 inherent_low_padding = + convolution->window().dimensions(kChosenSpatialDim).padding_low(); + const int64 inherent_high_padding = + convolution->window().dimensions(kChosenSpatialDim).padding_high(); + const bool inherent_padding_needed = + inherent_low_padding != 0 || inherent_high_padding != 0; + + const int64 stride = + convolution->window().dimensions(kChosenSpatialDim).stride(); + + const int64 spatial_size = + input_dim_size + inherent_low_padding + inherent_high_padding; + VLOG(1) << "spatial size " << spatial_size; + + int64 min_pad_size = INT64_MAX; + int64 num_splits; + // Explore several splitting points; choose one that requires least padding. + // This padding is done so that we can evenly reshape. + for (int64 j = kHighLimitForSplitCount; j >= kLowLimitForSplitCount; j--) { + if (input_dim_size / j < kernel_spatial_dim_size) { + continue; + } + + if (spatial_size < j) { + continue; + } + + const int64 output_offsets = convolution->shape().dimensions( + dim_numbers.output_spatial_dimensions(kChosenSpatialDim)); + const int64 output_offsets_per_split = CeilOfRatio(output_offsets, j); + + const int64 spatial_split_size = output_offsets_per_split * stride; + + // Pad spatial dim + const int64 pad_size = spatial_split_size * j - spatial_size; + if (pad_size >= 0 && pad_size < min_pad_size) { + min_pad_size = pad_size; + num_splits = j; + } + } + + // No suitable split found. + if (min_pad_size == INT64_MAX) { + return Status::OK(); + } + + // By now, we are certain that the space-to-batch transormation is going to + // take place. + + // Create the new convolution dim numbers. + auto new_dim_numbers = dim_numbers; + + // We'd need transposition of activations here such that batch and space dim + // that is being split are adjacent (in that order). 
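A standalone sketch of the split-count search above: candidate counts are scanned from the high limit down to the low limit, and the one needing the least padding to make the padded spatial size divide evenly is kept. The numeric inputs in main are made-up examples, not values taken from the pass.

#include <cstdint>
#include <iostream>
#include <limits>

int64_t CeilOfRatio(int64_t a, int64_t b) { return (a + b - 1) / b; }

// Picks the split count in [lo, hi] that minimizes the padding required so
// that spatial_split_size * splits covers the padded spatial size, mirroring
// the search in HandleConvolution above. Returns -1 if no candidate fits.
int64_t ChooseSplitCount(int64_t input_dim_size, int64_t spatial_size,
                         int64_t kernel_spatial_dim_size, int64_t stride,
                         int64_t output_offsets, int64_t lo, int64_t hi) {
  int64_t min_pad_size = std::numeric_limits<int64_t>::max();
  int64_t num_splits = -1;
  for (int64_t j = hi; j >= lo; --j) {
    if (input_dim_size / j < kernel_spatial_dim_size) continue;
    if (spatial_size < j) continue;
    const int64_t output_offsets_per_split = CeilOfRatio(output_offsets, j);
    const int64_t spatial_split_size = output_offsets_per_split * stride;
    const int64_t pad_size = spatial_split_size * j - spatial_size;
    if (pad_size >= 0 && pad_size < min_pad_size) {
      min_pad_size = pad_size;
      num_splits = j;
    }
  }
  return num_splits;
}

int main() {
  // Example: 256 output offsets, stride 1, 3-wide kernel on a 258-wide input
  // with no inherent padding (spatial_size == input_dim_size == 258).
  std::cout << ChooseSplitCount(/*input_dim_size=*/258, /*spatial_size=*/258,
                                /*kernel_spatial_dim_size=*/3, /*stride=*/1,
                                /*output_offsets=*/256, /*lo=*/4, /*hi=*/24)
            << "\n";
}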
+ if (spatial_dimension_to_split != activations_batch_dim + 1) { + int64 pushed_counter = 0; + std::vector transpose_dims; + int64 new_batch_dim, new_spatial_dim; + for (int i = 0; i < activations->shape().rank(); ++i) { + if (i == activations_batch_dim) { + continue; + } + if (i == spatial_dimension_to_split) { + new_dim_numbers.set_input_batch_dimension(pushed_counter); + transpose_dims.push_back(activations_batch_dim); + new_batch_dim = pushed_counter; + pushed_counter++; + new_spatial_dim = pushed_counter; + } + + if (i == dim_numbers.input_feature_dimension()) { + new_dim_numbers.set_input_feature_dimension(pushed_counter); + } else { + for (int j = 0; j < dim_numbers.input_spatial_dimensions_size(); ++j) { + if (i == dim_numbers.input_spatial_dimensions(j)) { + new_dim_numbers.set_input_spatial_dimensions(j, pushed_counter); + break; + } + } + } + transpose_dims.push_back(i); + pushed_counter++; + } + + activations_batch_dim = new_batch_dim; + spatial_dimension_to_split = new_spatial_dim; + TF_ASSIGN_OR_RETURN(activations, + MakeTransposeHlo(activations, transpose_dims)); + } + + const int64 output_offsets = convolution->shape().dimensions( + dim_numbers.output_spatial_dimensions(kChosenSpatialDim)); + const int64 output_offsets_per_split = + CeilOfRatio(output_offsets, num_splits); + + const int64 spatial_split_size = output_offsets_per_split * stride; + const int64 slice_size = + (output_offsets_per_split - 1) * stride + kernel_spatial_dim_size; + + VLOG(1) << "spatial_split_size " << spatial_split_size << " stride " + << stride; + + // Pad spatial dim. + const int64 pad_size = spatial_split_size * num_splits - spatial_size; + + VLOG(1) << "spatial_dimension_to_split " << spatial_dimension_to_split + << " num_splits " << num_splits << " kernel_spatial_dim_size " + << kernel_spatial_dim_size; + + // Because we are splitting the spatial dimension, if convolution needed + // padding in the spatial dimension, we materialize it. + if (pad_size != 0 || inherent_padding_needed) { + PaddingConfig padding_config = + MakeNoPaddingConfig(activations->shape().dimensions_size()); + padding_config.mutable_dimensions(spatial_dimension_to_split) + ->set_edge_padding_high(inherent_high_padding + pad_size); + padding_config.mutable_dimensions(spatial_dimension_to_split) + ->set_edge_padding_low(inherent_low_padding); + HloInstruction* padding = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(activations->shape().element_type()))); + TF_ASSIGN_OR_RETURN(activations, + MakePadHlo(activations, padding, padding_config)); + } + VLOG(1) << "Initial padded activations shape " + << activations->shape().ToString(); + + // Now we reorganize the activations. E.g. if the shape [B, SPACE] was [1, 16] + // and 4 splits were needed, we first create [4, 4]. Next, to deal with halo + // in the spatial dimension, we first pad that dimension. E.g. if halo size + // was 2, we'd create a shape of [4, 6]. We then flatten the shape such that + // A = [1, 24]. Now, we rotate the flattened 24 dimension left by 2 (with + // -2 low padding and +2 high padding) to create shape B. Then, we select + // between A and B such that halo regions are placed into A at the right + // locations. + + // The benefit of the above mentioned scheme is that it allows for batch + // growth. Here are some examples of the size increases it causes for a 3x3 + // kernel. + // with batch=1, [1,16] -> [4,4] -> [4,6] -> [1,24] growth of 8. + // with batch=2, [2,16] -> [8,4] -> [8,6] -> [1,48] growth of 16. 
+ // with batch=3, [3,16] -> [12,4] -> [12,6] -> [1,72] growth of 24. + + std::vector reshape_dimensions( + activations->shape().dimensions().begin(), + activations->shape().dimensions().end()); + + reshape_dimensions[spatial_dimension_to_split] = spatial_split_size; + reshape_dimensions[activations_batch_dim] = num_splits * old_batch_size; + + TF_ASSIGN_OR_RETURN(HloInstruction * batch_increased_reshape, + MakeReshapeHlo(reshape_dimensions, activations)); + convolution->SetupDerivedInstruction(batch_increased_reshape); + + VLOG(1) << "First reshape done " << batch_increased_reshape->ToString(); + + PaddingConfig padding_config = + MakeNoPaddingConfig(batch_increased_reshape->shape().dimensions_size()); + padding_config.mutable_dimensions(spatial_dimension_to_split) + ->set_edge_padding_high(slice_size - spatial_split_size); + HloInstruction* padding = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(batch_increased_reshape->shape().element_type()))); + TF_ASSIGN_OR_RETURN( + HloInstruction * pad_applied, + MakePadHlo(batch_increased_reshape, padding, padding_config)); + + VLOG(1) << "Padding done " << pad_applied->ToString(); + + auto straightened_activations_dims = reshape_dimensions; + straightened_activations_dims[spatial_dimension_to_split] = + num_splits * slice_size; + straightened_activations_dims[activations_batch_dim] = old_batch_size; + + VLOG(1) << "slice_size " << slice_size; + TF_ASSIGN_OR_RETURN( + HloInstruction * straightened_activations, + MakeReshapeHlo(straightened_activations_dims, pad_applied)); + + VLOG(1) << "Straightening done"; + + PaddingConfig rotation_padding_config = + MakeNoPaddingConfig(straightened_activations->shape().dimensions_size()); + rotation_padding_config.mutable_dimensions(spatial_dimension_to_split) + ->set_edge_padding_high(slice_size - spatial_split_size); + rotation_padding_config.mutable_dimensions(spatial_dimension_to_split) + ->set_edge_padding_low(spatial_split_size - slice_size); + HloInstruction* rotation_padding = + computation_->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(straightened_activations->shape().element_type()))); + TF_ASSIGN_OR_RETURN(HloInstruction * rotated_activations, + MakePadHlo(straightened_activations, rotation_padding, + rotation_padding_config)); + convolution->SetupDerivedInstruction(rotated_activations); + + // Build a constant PRED to decide which elements in the split dimension + // are from halo. + tensorflow::core::Bitmap b(num_splits * slice_size); + for (int k = 0; k < num_splits * slice_size; ++k) { + if (k % slice_size < spatial_split_size) { + b.set(k); + } else { + b.clear(k); + } + } + + auto arg_literal = LiteralUtil::CreateR1(b); + HloInstruction* slice_mask = computation_->AddInstruction( + HloInstruction::CreateConstant(std::move(arg_literal))); + + // Broadcast the mask in all dimensions of the activations. + HloInstruction* shape_mask = + MakeBroadcastHlo(slice_mask, {spatial_dimension_to_split}, + straightened_activations->shape().dimensions()); + + VLOG(1) << "Shape mask made " << shape_mask->ToString(); + + TF_ASSIGN_OR_RETURN(HloInstruction * select, + MakeSelectHlo(shape_mask, straightened_activations, + rotated_activations, convolution)); + VLOG(1) << "Select generated" << select->ToString(); + + // Increase batch size for one last time. 
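The comments above describe the halo materialization: reshape [B, SPACE] into splits, pad each split up to slice_size, flatten, build a rotated copy, and select between the two so each split ends with the halo borrowed from its right neighbour. The sketch below replays that sequence on a plain 1-D vector (16 values, 4 splits, halo 2, so slice_size = 6); it is illustrative arithmetic only, not the HLO the pass emits.

#include <iostream>
#include <vector>

int main() {
  const int num_splits = 4, split_size = 4, slice_size = 6;
  std::vector<int> space(16);
  for (int i = 0; i < 16; ++i) space[i] = i;

  // Reshape to [4, 4], pad each split to slice_size with zeros, flatten: tensor A.
  std::vector<int> a(num_splits * slice_size, 0);
  for (int s = 0; s < num_splits; ++s)
    for (int i = 0; i < split_size; ++i)
      a[s * slice_size + i] = space[s * split_size + i];

  // Rotated copy B: shift A left by the halo size (slice_size - split_size),
  // i.e. negative low padding and positive high padding of the same amount.
  const int halo = slice_size - split_size;
  std::vector<int> b(a.size(), 0);
  for (int k = 0; k + halo < static_cast<int>(a.size()); ++k) b[k] = a[k + halo];

  // Select: keep A inside each split, take B (the next split's prefix, i.e.
  // the halo) in the padded tail of each split.
  std::vector<int> out(a.size());
  for (int k = 0; k < static_cast<int>(a.size()); ++k)
    out[k] = (k % slice_size < split_size) ? a[k] : b[k];

  // Prints: 0 1 2 3 4 5 | 4 5 6 7 8 9 | 8 9 10 11 12 13 | 12 13 14 15 0 0
  for (int s = 0; s < num_splits; ++s) {
    for (int i = 0; i < slice_size; ++i) std::cout << out[s * slice_size + i] << ' ';
    std::cout << (s + 1 < num_splits ? "| " : "\n");
  }
}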
+ std::vector combined_batch_dimensions( + pad_applied->shape().dimensions().begin(), + pad_applied->shape().dimensions().end()); + + combined_batch_dimensions[activations_batch_dim] = + old_batch_size * num_splits; + TF_ASSIGN_OR_RETURN(activations, + MakeReshapeHlo(combined_batch_dimensions, select)); + + VLOG(1) << "Batch merge done " << activations->ToString(); + + // Now, we rewrite the convolution with a larger batch. + const auto& activations_shape = activations->shape(); + const int64 rank = activations_shape.dimensions_size(); + + // We will generate output such that batch is followed by the split spatial + // dimension. + std::vector transpose_dims(convolution->shape().rank()); + int dim_count = 0; + std::map dim_map; + + for (int j = 0; j < dim_numbers.output_spatial_dimensions_size(); ++j) { + if (j == kChosenSpatialDim) { + dim_map[dim_numbers.output_batch_dimension()] = dim_count; + new_dim_numbers.set_output_batch_dimension(dim_count++); + } + dim_map[dim_numbers.output_spatial_dimensions(j)] = dim_count; + new_dim_numbers.set_output_spatial_dimensions(j, dim_count); + dim_count++; + } + + dim_map[dim_numbers.output_feature_dimension()] = dim_count; + new_dim_numbers.set_output_feature_dimension(dim_count); + + int p = 0; + for (const auto& entry : dim_map) { + transpose_dims[p] = entry.second; + p++; + } + + auto new_window = convolution->window(); + new_window.mutable_dimensions(kChosenSpatialDim)->set_padding_high(0); + new_window.mutable_dimensions(kChosenSpatialDim)->set_padding_low(0); + TF_ASSIGN_OR_RETURN( + HloInstruction * new_conv, + MakeConvolveHlo(activations, /*rhs=*/convolution->mutable_operand(1), + convolution->feature_group_count(), + convolution->batch_group_count(), new_window, + new_dim_numbers, convolution->precision_config())); + convolution->SetupDerivedInstruction(new_conv); + + VLOG(1) << "new_conv " << new_conv->ToString(); + + const int64 output_split_spatial_dim = + new_dim_numbers.output_spatial_dimensions(kChosenSpatialDim); + const int64 output_batch_dim = new_dim_numbers.output_batch_dimension(); + + Shape new_shape = new_conv->shape(); + const int64 new_batch_size = new_shape.dimensions(output_batch_dim); + const int64 new_spatial_dim_size = + new_shape.dimensions(output_split_spatial_dim); + + CHECK_EQ(new_batch_size % old_batch_size, 0); + + const int64 output_split_batch_size = new_batch_size / old_batch_size; + + std::vector new_dimensions(new_conv->shape().dimensions().begin(), + new_conv->shape().dimensions().end()); + new_dimensions[output_split_spatial_dim] = + output_split_batch_size * new_spatial_dim_size; + new_dimensions[new_dim_numbers.output_batch_dimension()] = old_batch_size; + + // Reshape the output of the new conv into the old convolutions shape. + TF_ASSIGN_OR_RETURN(HloInstruction * reshape, + MakeReshapeHlo(new_dimensions, new_conv)); + convolution->SetupDerivedInstruction(reshape); + + std::vector start_indices(rank, 0), + end_indices(new_dimensions.begin(), new_dimensions.end()), + strides(rank, 1); + end_indices[output_split_spatial_dim] = convolution->shape().dimensions( + dim_numbers.output_spatial_dimensions(kChosenSpatialDim)); + + // This slicing is getting rid of the padding we added to evenly divide space. 
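A small sketch of just the shape arithmetic used above to undo the batch growth: fold the extra batch factor back into the split spatial dimension, then slice that dimension down to the original number of output offsets. The sizes are made-up examples.

#include <cstdint>
#include <iostream>

int main() {
  // Made-up example: old batch 1 split 4 ways; the enlarged-batch convolution
  // produces 4 output offsets per split, while the original convolution only
  // had 14 output offsets on that spatial dimension.
  const int64_t old_batch_size = 1, num_splits = 4;
  const int64_t new_batch_size = old_batch_size * num_splits;   // conv batch dim
  const int64_t new_spatial_dim_size = 4;                       // offsets per split
  const int64_t original_output_offsets = 14;

  // Reshape: fold the split factor back into the spatial dimension.
  const int64_t output_split_batch_size = new_batch_size / old_batch_size;
  const int64_t merged_spatial = output_split_batch_size * new_spatial_dim_size;

  // Slice: drop the offsets that only existed to make the space divide evenly.
  std::cout << "reshaped spatial size: " << merged_spatial       // 16
            << ", sliced back to: " << original_output_offsets   // 14
            << ", batch restored to: " << old_batch_size << "\n";
}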
+ TF_ASSIGN_OR_RETURN( + HloInstruction * output_slice, + MakeSliceHlo(reshape, start_indices, end_indices, strides)); + convolution->SetupDerivedInstruction(output_slice); + + TF_ASSIGN_OR_RETURN(HloInstruction * output_transpose, + MakeTransposeHlo(output_slice, transpose_dims)); + convolution->SetupDerivedInstruction(output_transpose); + + VLOG(1) << "output_transpose " << output_transpose->ToString(); + + changed_ = true; + return computation_->ReplaceInstruction(convolution, output_transpose); +} + +} // namespace + +StatusOr ConvolutionSpaceToBatchConverter::Run(HloModule* module) { + XLA_VLOG_LINES(2, "ConvolutionSpaceToBatchConverter::Run(), before:\n" + + module->ToString()); + bool changed = false; + for (auto* comp : module->MakeNonfusionComputations()) { + if (ConvolutionVisitor::Run(limit_on_batch_size_, comp)) { + changed = true; + } + } + XLA_VLOG_LINES(2, "ConvolutionSpaceToBatchConverter::Run(), after:\n" + + module->ToString()); + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/space_to_batch_converter.h b/tensorflow/compiler/xla/service/space_to_batch_converter.h new file mode 100644 index 00000000000..a92abda0337 --- /dev/null +++ b/tensorflow/compiler/xla/service/space_to_batch_converter.h @@ -0,0 +1,45 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPACE_TO_BATCH_CONVERTER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPACE_TO_BATCH_CONVERTER_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { + +// A pass which rewrites convolutions such that space dimension is turned into +// batch. +class ConvolutionSpaceToBatchConverter : public HloModulePass { + public: + explicit ConvolutionSpaceToBatchConverter(int64 limit_on_batch_size = 1) + : limit_on_batch_size_(limit_on_batch_size) {} + + absl::string_view name() const override { + return "convolution-space-to-batch-converter"; + } + + // Run convolution rewriting on the given computation. Returns whether the + // computation was changed. + StatusOr Run(HloModule* module) override; + + int64 limit_on_batch_size_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPACE_TO_BATCH_CONVERTER_H_ diff --git a/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc b/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc new file mode 100644 index 00000000000..bbc3882cde9 --- /dev/null +++ b/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc @@ -0,0 +1,147 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
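A hedged usage sketch for the new pass, following the pattern in the tests below: construct ConvolutionSpaceToBatchConverter with a batch-size limit and call Run on an HloModule. The RunSpaceToBatch wrapper name is invented for illustration.

#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/space_to_batch_converter.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"

namespace xla {

// Runs the pass directly on an already-built module; returns true if any
// convolution was rewritten.
StatusOr<bool> RunSpaceToBatch(HloModule* module, int64 batch_size_limit) {
  // Convolutions whose batch is at most `batch_size_limit` are candidates.
  ConvolutionSpaceToBatchConverter converter(
      /*limit_on_batch_size=*/batch_size_limit);
  return converter.Run(module);
}

}  // namespace xla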
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/space_to_batch_converter.h" + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { +namespace { + +using ConvolutionSpaceToBatchConverterTest = HloTestBase; +namespace op = testing::opcode_matchers; + +TEST_F(ConvolutionSpaceToBatchConverterTest, SimpleBatch1) { + string hlo_string = R"( + + HloModule module +ENTRY computation { + %p0 = bf16[1,258,258,32] parameter(0) + %p1 = bf16[3,3,32,32] parameter(1) + ROOT %convolution = bf16[1,256,256,32] convolution(%p0, %p1), window={size=3x3}, + dim_labels=b01f_01io->b01f +} + + )"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + + auto computation = module->entry_computation(); + ConvolutionSpaceToBatchConverter converter; + ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Transpose()); + EXPECT_THAT(root->operand(0), op::Slice()); + auto reshape = root->operand(0)->operand(0); + EXPECT_THAT(reshape, op::Reshape()); + EXPECT_THAT(reshape->operand(0), op::Convolution()); + const int64 batch_dim = reshape->operand(0) + ->convolution_dimension_numbers() + .output_batch_dimension(); + // Verify that the transform has increased the batch size. + EXPECT_GT(reshape->operand(0)->shape().dimensions(batch_dim), 1); +} + +TEST_F(ConvolutionSpaceToBatchConverterTest, SimpleBatch2) { + string hlo_string = R"( + HloModule module + ENTRY computation { + %p0 = bf16[2,258,258,32] parameter(0) + %p1 = bf16[3,3,32,32] parameter(1) + ROOT %convolution = bf16[2,256,256,32] convolution(%p0, %p1), window={size=3x3}, + dim_labels=b01f_01io->b01f + } + + )"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + + ConvolutionSpaceToBatchConverter converter(/*limit_on_batch_size=*/2); + ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); + auto computation = module->entry_computation(); + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Transpose()); + EXPECT_THAT(root->operand(0), op::Slice()); + auto reshape = root->operand(0)->operand(0); + EXPECT_THAT(reshape, op::Reshape()); + EXPECT_THAT(reshape->operand(0), op::Convolution()); + const int64 batch_dim = reshape->operand(0) + ->convolution_dimension_numbers() + .output_batch_dimension(); + // Verify that the transform has increased the batch size. 
+ EXPECT_GT(reshape->operand(0)->shape().dimensions(batch_dim), 1); +} + +TEST_F(ConvolutionSpaceToBatchConverterTest, Batch4WithStrideAndPad) { + string hlo_string = R"( + HloModule module + ENTRY computation { + %p0 = bf16[4,224,224,3]{3,2,1,0} parameter(0) + %p1 = bf16[7,7,3,64]{3,2,1,0} parameter(1) + + ROOT %convolution.3 = bf16[4,112,112,64]{3,2,1,0} convolution(%p0, %p1), + window={size=7x7 stride=2x2 pad=3_3x3_3}, dim_labels=b01f_01io->b01f + } + )"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + + auto computation = module->entry_computation(); + ConvolutionSpaceToBatchConverter converter(/*limit_on_batch_size=*/4); + ASSERT_TRUE(converter.Run(module.get()).ValueOrDie()); + HloInstruction* root = computation->root_instruction(); + EXPECT_THAT(root, op::Transpose()); + EXPECT_THAT(root->operand(0), op::Slice()); + auto reshape = root->operand(0)->operand(0); + EXPECT_THAT(reshape, op::Reshape()); + EXPECT_THAT(reshape->operand(0), op::Convolution()); + const int64 batch_dim = reshape->operand(0) + ->convolution_dimension_numbers() + .output_batch_dimension(); + + EXPECT_GT(reshape->operand(0)->shape().dimensions(batch_dim), 4); +} + +TEST_F(ConvolutionSpaceToBatchConverterTest, Batch1WithKernelDilation) { + string hlo_string = R"( + + HloModule module +ENTRY computation { + %p2 = bf16[1,7,7,128]{3,0,2,1} parameter(0) + %p3 = bf16[1,1,512,128]{3,2,1,0} parameter(1) + ROOT %c = bf16[1,14,14,512]{3,0,2,1} convolution(%p2, %p3), + window={size=1x1 pad=0_1x0_1 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01oi->b01f +} + + )"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo_string)); + + ConvolutionSpaceToBatchConverter converter; + ASSERT_FALSE(converter.Run(module.get()).ValueOrDie()); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD index d2243d30adf..9ebaaa8242f 100644 --- a/tensorflow/compiler/xla/service/spmd/BUILD +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -1,5 +1,6 @@ # Description: SPMD partitioning pass. +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( @@ -23,6 +24,7 @@ cc_library( "spmd_partitioner_util.cc", ], hdrs = [ + "convolution_handler.h", "spmd_partitioner.h", "spmd_partitioner_util.h", ], diff --git a/tensorflow/compiler/xla/service/spmd/convolution_handler.cc b/tensorflow/compiler/xla/service/spmd/convolution_handler.cc index 01d7ea2ff14..0d34c5b62e9 100644 --- a/tensorflow/compiler/xla/service/spmd/convolution_handler.cc +++ b/tensorflow/compiler/xla/service/spmd/convolution_handler.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/service/spmd/convolution_handler.h" + #include "absl/algorithm/container.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" @@ -32,24 +34,36 @@ limitations under the License. namespace xla { namespace spmd { + namespace { -// Partition convolution. -StatusOr PartitionConvolution( +// Partition convolution with batch group count. 
+StatusOr PartitionConvolutionWithBatchGroupCount( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, int64 num_partitions, - const SpmdPartitionerOptions& options, HloInstruction* partition_id, - HloModule* module, SpmdBuilder* b); - -// Partition convolution with only paralell dims are tiled -StatusOr PartitionConvolutionWithParallelDimension( - PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, int64 num_partitions, SpmdBuilder* b) { + const HloSharding& output_sharding, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + int64 num_partitions, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + if (original_hlo->batch_group_count() == 1 || + original_hlo->batch_group_count() < num_partitions) { + return nullptr; + } const auto& dnums = original_hlo->convolution_dimension_numbers(); + // Only supports batch_group_size equals input_batch_size case. + const int64 input_batch_size = + lhs.base_shape().dimensions(dnums.input_batch_dimension()); + const int64 kernel_output_feature_size = + rhs.base_shape().dimensions(dnums.kernel_output_feature_dimension()); + if (input_batch_size != kernel_output_feature_size || + original_hlo->batch_group_count() != input_batch_size) { + return nullptr; + } + + // Map RHS indices to LHS indices. std::vector rhs_to_lhs_indices(output_base_shape.rank()); rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = dnums.input_batch_dimension(); @@ -59,73 +73,149 @@ StatusOr PartitionConvolutionWithParallelDimension( rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = dnums.input_spatial_dimensions(i); } + + // Map LHS indices to RHS indices. std::vector lhs_to_rhs_indices(output_base_shape.rank()); for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; } + + // Map LHS indices to output indices. + std::vector lhs_to_output_indices(lhs.base_shape().rank(), -1); + lhs_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_feature_dimension(); + lhs_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_batch_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + lhs_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + + // Align LHS or RHS to other operand if input batch dim or kernel output + // feature dim is partitioned. auto aligned_rhs_sharding = hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); auto aligned_lhs_sharding = hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); - // Handling cases where all the partitioned dimensions are parallel - // dimensions. 
- int64 lhs_parallel_dim_partitions = 1; - int64 rhs_parallel_dim_partitions = 1; - std::vector parallel_spatial_dims; - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dim = dnums.input_spatial_dimensions(i); - int64 lhs_size = lhs.base_shape().dimensions(lhs_dim); - const auto& wd = conv_window.dimensions(i); - int64 rhs_dim = dnums.kernel_spatial_dimensions(i); - if (dot_as_convolution_util::ConvSpatialDimensionIsParallel(wd, lhs_size)) { - parallel_spatial_dims.emplace_back(i); - lhs_parallel_dim_partitions *= ShardCountAtDim(lhs.sharding(), lhs_dim); - rhs_parallel_dim_partitions *= ShardCountAtDim(rhs.sharding(), rhs_dim); - } - } - bool lhs_partition_dims_are_parallel = - (lhs_parallel_dim_partitions == num_partitions); - bool rhs_partition_dims_are_parallel = - (rhs_parallel_dim_partitions == num_partitions); - - // If there is a parallel dim and all the partitioned dimensions are parallel - // dimensions in either LHS or RHS, simply create partitioned convolutions. - if (parallel_spatial_dims.empty() || ((!lhs_partition_dims_are_parallel) && - (!rhs_partition_dims_are_parallel))) { + bool lhs_batch_dim_is_partitioned = + (ShardCountAtDim(lhs.sharding(), dnums.input_batch_dimension()) == + num_partitions); + bool rhs_output_feature_dim_is_partitioned = + (ShardCountAtDim(rhs.sharding(), + dnums.kernel_output_feature_dimension()) == + num_partitions); + if (!lhs_batch_dim_is_partitioned && !rhs_output_feature_dim_is_partitioned) { return nullptr; } - // Reshard LHS or RHS to partition at parallel dimensions as the other - // operand. - if (lhs_partition_dims_are_parallel) { + // Reshard LHS or RHS to partition at batch dimension or output feature + // dimension as the other operand. + if (lhs_batch_dim_is_partitioned) { + rhs = rhs.Reshard(aligned_rhs_sharding); + } else { + lhs = lhs.Reshard(aligned_lhs_sharding); + } + // Align output sharding after LHS and RHS sharding are consistent. + auto aligned_output_sharding = hlo_sharding_util::TransposeSharding( + lhs.sharding(), lhs_to_output_indices); + + // Create partitioned convolution. + TF_ASSIGN_OR_RETURN( + auto sharded_conv, + create_sharded_conv(lhs.hlo(), rhs.hlo(), b, conv_window)); + sharded_conv->set_sharding(aligned_output_sharding); + return PartitionedHlo(sharded_conv, output_base_shape, lhs.state()) + .Reshard(output_sharding) + .hlo(); +} + +// Partition convolution with feature group count. +StatusOr PartitionConvolutionWithFeatureGroupCount( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + int64 num_partitions, SpmdBuilder* b) { + TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); + if (original_hlo->feature_group_count() == 1 || + original_hlo->feature_group_count() < num_partitions) { + return nullptr; + } + + const auto& dnums = original_hlo->convolution_dimension_numbers(); + const int64 input_feature_size = + lhs.base_shape().dimensions(dnums.input_feature_dimension()); + const int64 kernel_output_feature_size = + rhs.base_shape().dimensions(dnums.kernel_output_feature_dimension()); + if (input_feature_size != kernel_output_feature_size || + input_feature_size % original_hlo->feature_group_count() != 0) { + return nullptr; + } + + // Align RHS indices to LHS. 
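A standalone sketch of the operand-alignment bookkeeping shared by the partitioning helpers above: build a dimension map from one operand's layout to the other's (as in rhs_to_lhs_indices), invert it (lhs_to_rhs_indices), and permute per-dimension shard counts along it. It stands in for hlo_sharding_util::TransposeSharding, which operates on full tile assignments; the 4-D layout in main is made up.

#include <cstdint>
#include <iostream>
#include <vector>

// Permutes per-dimension tile counts so that result[d] = src[perm[d]]
// (tile assignments reduced to per-dimension shard counts for illustration).
std::vector<int64_t> PermuteDims(const std::vector<int64_t>& src,
                                 const std::vector<int64_t>& perm) {
  std::vector<int64_t> out(perm.size());
  for (int d = 0; d < static_cast<int>(perm.size()); ++d) out[d] = src[perm[d]];
  return out;
}

// Inverts a dimension map: if perm[a] == b then inverse[b] == a, matching the
// lhs_to_rhs_indices construction above.
std::vector<int64_t> InvertMap(const std::vector<int64_t>& perm) {
  std::vector<int64_t> inv(perm.size());
  for (int i = 0; i < static_cast<int>(perm.size()); ++i) inv[perm[i]] = i;
  return inv;
}

int main() {
  // Toy 4-D example: LHS is tiled 2 ways on its dim 0.
  std::vector<int64_t> lhs_tiles = {2, 1, 1, 1};
  // rhs_to_lhs: RHS dim -> corresponding LHS dim (made-up layout).
  std::vector<int64_t> rhs_to_lhs = {3, 2, 1, 0};
  std::vector<int64_t> aligned_rhs = PermuteDims(lhs_tiles, rhs_to_lhs);
  std::vector<int64_t> lhs_to_rhs = InvertMap(rhs_to_lhs);
  for (int64_t t : aligned_rhs) std::cout << t << ' ';  // 1 1 1 2
  std::cout << "| ";
  for (int64_t t : lhs_to_rhs) std::cout << t << ' ';   // 3 2 1 0
  std::cout << "\n";
}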
+ std::vector rhs_to_lhs_indices(output_base_shape.rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_feature_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_batch_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + + // Align LHS indices to RHS. + std::vector lhs_to_rhs_indices(output_base_shape.rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + + // Align LHS indices to output. + std::vector lhs_to_output_indices(output_base_shape.rank()); + lhs_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + lhs_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + lhs_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + + // Align LHS or RHS if input_feature_dim or kernel_output_feature_dim is + // partitioned. + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + bool lhs_feature_dim_is_partitioned = + (ShardCountAtDim(lhs.sharding(), dnums.input_feature_dimension()) == + num_partitions); + bool rhs_output_feature_dim_is_partitioned = + (ShardCountAtDim(rhs.sharding(), + dnums.kernel_output_feature_dimension()) == + num_partitions); + if (!lhs_feature_dim_is_partitioned && + !rhs_output_feature_dim_is_partitioned) { + return nullptr; + } + // Reshard LHS or RHS to partition at input feature dimension or output + // feature dimension as the other operand. + if (lhs_feature_dim_is_partitioned) { rhs = rhs.Reshard(aligned_rhs_sharding); } else { lhs = lhs.Reshard(aligned_lhs_sharding); } - // Get LHS and RHS sharded shape. - auto lhs_shard_shape = MakePartitionedShape(lhs.base_shape(), lhs.sharding()); - auto rhs_shard_shape = MakePartitionedShape(rhs.base_shape(), rhs.sharding()); + // Align output sharding after LHS and RHS sharding are consistent. + auto aligned_output_sharding = hlo_sharding_util::TransposeSharding( + lhs.sharding(), lhs_to_output_indices); - // Update convolution window. 
- auto new_window = conv_window; - for (const auto& spatial_dim : parallel_spatial_dims) { - auto wd = new_window.mutable_dimensions(spatial_dim); - wd->set_size(lhs_shard_shape.dimensions( - dnums.input_spatial_dimensions(spatial_dim))); - wd->set_stride(std::max(1, wd->size() - 1)); - wd->set_base_dilation(wd->size()); - } TF_ASSIGN_OR_RETURN( - Shape sharded_conv_shape, - ShapeInference::InferConvolveShape( - lhs_shard_shape, rhs_shard_shape, original_hlo->feature_group_count(), - original_hlo->batch_group_count(), new_window, dnums)); - auto sharded_conv = b->AddInstruction(HloInstruction::CreateConvolve( - sharded_conv_shape, lhs.hlo(), rhs.hlo(), - original_hlo->feature_group_count(), original_hlo->batch_group_count(), - new_window, dnums, original_hlo->precision_config())); - sharded_conv->set_sharding(original_hlo->sharding()); + auto sharded_conv, + create_sharded_conv(lhs.hlo(), rhs.hlo(), b, conv_window)); + sharded_conv->set_sharding(aligned_output_sharding); return PartitionedHlo(sharded_conv, output_base_shape, lhs.state()) .Reshard(output_sharding) .hlo(); @@ -136,9 +226,12 @@ StatusOr PartitionConvolutionWithParallelDimension( StatusOr PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, HloInstruction* partition_id, - HloModule* module, SpmdBuilder* b) { + const HloSharding& output_sharding, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()); @@ -188,6 +281,22 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); } + if (original_hlo->feature_group_count() > 1 && + (lhs.sharding().tile_assignment().dim(dnums.input_feature_dimension()) > + 1 || + rhs.sharding().tile_assignment().dim( + dnums.kernel_output_feature_dimension()) > 1)) { + return nullptr; + } + + if (original_hlo->batch_group_count() > 1 && + (lhs.sharding().tile_assignment().dim(dnums.input_batch_dimension()) > + 1 || + rhs.sharding().tile_assignment().dim( + dnums.kernel_output_feature_dimension()) > 1)) { + return nullptr; + } + // Reshard RHS so that each shard computes the partial sum of the full // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() // that reshards LHS. @@ -214,7 +323,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( int64 lhs_dimension = dnums.input_spatial_dimensions(i); int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); - auto wd = conv_window.dimensions(i); + const auto& wd = conv_window.dimensions(i); if (wd.base_dilation() != 1 || wd.window_reversal()) { return nullptr; } @@ -260,7 +369,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( // Calculate the left and right halo sizes as described in the comments // above. It calculcates the halo sizes with dilation, so we apply // CeilOfRatio({left,right}_halo_size, window_dilation). 
- auto wd = conv_window.dimensions(i); + const auto& wd = conv_window.dimensions(i); int64 padding_low = wd.padding_low(); int64 padding_high = wd.padding_high(); int64 base = lhs.base_shape().dimensions(lhs_dimension); @@ -387,10 +496,9 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( rhs_with_halo = *concat; } - auto conv = b->AddInstruction(HloInstruction::CreateConvolve( - output_base_shape, conv_lhs, rhs_with_halo, - original_hlo->feature_group_count(), original_hlo->batch_group_count(), - new_window, dnums, original_hlo->precision_config())); + TF_ASSIGN_OR_RETURN( + auto conv, create_sharded_conv(conv_lhs, rhs_with_halo, b, new_window)); + auto ar = collective_ops_creator.create_cross_partition_all_reduce( b, conv, MakeBinaryAdd(original_hlo->shape().element_type(), module), {}, (*lhs.state().next_channel_id)++); @@ -405,9 +513,12 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( StatusOr PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, HloInstruction* partition_id, - HloModule* module, SpmdBuilder* b) { + const HloSharding& output_sharding, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()); @@ -430,7 +541,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; } - Window window = conv_window; + const Window& window = conv_window; std::vector reversed_rhs_dims; for (int64 i = 0; i < window.dimensions_size(); ++i) { if (window.dimensions(i).window_reversal()) { @@ -480,6 +591,21 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero, reversed_rhs_dims); } + if (original_hlo->feature_group_count() > 1 && + (lhs.sharding().tile_assignment().dim(dnums.input_feature_dimension()) > + 1 || + rhs.sharding().tile_assignment().dim( + dnums.kernel_output_feature_dimension()) > 1)) { + return nullptr; + } + + if (original_hlo->batch_group_count() > 1 && + (lhs.sharding().tile_assignment().dim(dnums.input_batch_dimension()) > + 1 || + rhs.sharding().tile_assignment().dim( + dnums.kernel_output_feature_dimension()) > 1)) { + return nullptr; + } // Reshard LHS by exchanging halo such that each shard computes the partial // sum of the full shape result, and add AllReduce. // @@ -505,7 +631,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( int64 lhs_dimension = dnums.input_spatial_dimensions(i); int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); - auto wd = window.dimensions(i); + const auto& wd = window.dimensions(i); if (wd.base_dilation() != 1) { // TODO(wangtao): support parallel dim if it is replicate here. return nullptr; @@ -540,7 +666,7 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( // Calculate the left and right halo sizes as described in the comments // above. 
- auto wd = window.dimensions(i); + const auto& wd = window.dimensions(i); int64 padding_low = wd.padding_low(); int64 padding_high = wd.padding_high(); int64 base = lhs.base_shape().dimensions(lhs_dimension); @@ -597,11 +723,8 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( lhs_with_halo = *concat; } - auto conv = b->AddInstruction(HloInstruction::CreateConvolve( - output_base_shape, lhs_with_halo, rhs.hlo(), - original_hlo->feature_group_count(), original_hlo->batch_group_count(), - new_window, original_hlo->convolution_dimension_numbers(), - original_hlo->precision_config())); + TF_ASSIGN_OR_RETURN( + auto conv, create_sharded_conv(lhs_with_halo, rhs.hlo(), b, new_window)); auto ar = lhs.state().collective_ops_creator.create_cross_partition_all_reduce( b, conv, MakeBinaryAdd(output_base_shape.element_type(), module), {}, @@ -616,8 +739,11 @@ PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( // RHS. StatusOr PartitionConvolutionTiledOutput( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, SpmdBuilder* b) { + const HloSharding& output_sharding, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); const auto& dnums = original_hlo->convolution_dimension_numbers(); TF_RET_CHECK(!output_sharding.IsTileMaximal()); @@ -668,19 +794,13 @@ StatusOr PartitionConvolutionTiledOutput( resharded_operand_and_window->shard_window.dimensions( dnums.input_spatial_dimensions(i)); } + TF_ASSIGN_OR_RETURN( - Shape sharded_conv_shape, - ShapeInference::InferConvolveShape( - resharded_operand_and_window->sharded_input->shape(), - rhs.hlo()->shape(), original_hlo->feature_group_count(), - original_hlo->batch_group_count(), new_window, dnums)); + auto sharded_conv, + create_sharded_conv(resharded_operand_and_window->sharded_input, + rhs.hlo(), b, new_window)); + auto shard_shape = MakePartitionedShape(output_base_shape, output_sharding); - *sharded_conv_shape.mutable_layout() = shard_shape.layout(); - auto sharded_conv = b->AddInstruction(HloInstruction::CreateConvolve( - sharded_conv_shape, resharded_operand_and_window->sharded_input, - rhs.hlo(), original_hlo->feature_group_count(), - original_hlo->batch_group_count(), new_window, dnums, - original_hlo->precision_config())); if (!resharded_operand_and_window->dynamic_slice_index_on_output .has_value()) { CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); @@ -692,132 +812,40 @@ StatusOr PartitionConvolutionTiledOutput( shard_shape.dimensions())); } -StatusOr PartitionConvolutionGroupOnParallelDim( - PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, const ConvolutionDimsMapping& dims_mapping, - int64 num_partitions, const SpmdPartitionerOptions& options, - HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { - std::vector lhs_dims; - std::vector rhs_dims; - std::vector output_dims; - auto lhs_sharding_dims_adjusted_to_output = - lhs.sharding().IsReplicated() - ? std::vector(lhs.base_shape().rank(), 1) - : lhs.sharding().tile_assignment().dimensions(); - auto rhs_sharding_dims_adjusted_to_output = - rhs.sharding().IsReplicated() - ? 
std::vector(rhs.base_shape().rank(), 1) - : rhs.sharding().tile_assignment().dimensions(); - auto output_sharding_dims_adjusted_to_lhs = - output_sharding.tile_assignment().dimensions(); - bool lhs_rhs_dims_matching = true; - for (const auto& dim : dims_mapping.parallel_spatial_dims) { - lhs_dims.push_back(dim.lhs); - rhs_dims.push_back(dim.rhs); - output_dims.push_back(dim.output); - if (lhs_sharding_dims_adjusted_to_output[dim.lhs] != - rhs_sharding_dims_adjusted_to_output[dim.rhs]) { - lhs_rhs_dims_matching = false; - } - lhs_sharding_dims_adjusted_to_output[dim.lhs] = - output_sharding.tile_assignment().dim(dim.output); - rhs_sharding_dims_adjusted_to_output[dim.rhs] = - output_sharding.tile_assignment().dim(dim.output); - output_sharding_dims_adjusted_to_lhs[dim.output] = - lhs.sharding().tile_assignment().dim(dim.lhs); - } - auto lhs_grouped = GroupShardingOnDims(lhs.sharding(), lhs_dims); - auto rhs_grouped = GroupShardingOnDims(rhs.sharding(), rhs_dims); - auto output_grouped = GroupShardingOnDims(output_sharding, output_dims); - if (lhs_rhs_dims_matching) { - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) > - ShapeUtil::ByteSizeOf(rhs.base_shape())) { - rhs_grouped = AlignGroupsWith(std::move(rhs_grouped), lhs_grouped); - rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); - } else { - lhs_grouped = AlignGroupsWith(std::move(lhs_grouped), rhs_grouped); - lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); - } - auto reshaped_output_tiling = output_sharding.tile_assignment(); - reshaped_output_tiling.Reshape(output_sharding_dims_adjusted_to_lhs); - output_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_output_tiling), - output_dims), - lhs_grouped); - } else { - auto reshaped_lhs_tiling = lhs.sharding().tile_assignment(); - reshaped_lhs_tiling.Reshape(lhs_sharding_dims_adjusted_to_output); - lhs_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_lhs_tiling), lhs_dims), - output_grouped); - lhs = lhs.Reshard(UngroupSharding(lhs_grouped)); - auto reshaped_rhs_tiling = rhs.sharding().tile_assignment(); - reshaped_rhs_tiling.Reshape(rhs_sharding_dims_adjusted_to_output); - rhs_grouped = AlignGroupsWith( - GroupShardingOnDims(HloSharding::Tile(reshaped_rhs_tiling), rhs_dims), - output_grouped); - rhs = rhs.Reshard(UngroupSharding(rhs_grouped)); - } - - // Update LHS and RHS sharding and shape. 
- lhs.hlo()->set_sharding(lhs_grouped.sharding); - rhs.hlo()->set_sharding(rhs_grouped.sharding); - CHECK(lhs.hlo() != rhs.hlo() || lhs_grouped.sharding == rhs_grouped.sharding); - auto per_group_partitioner_state = CreatePerGroupPartitioningState( - lhs.state(), lhs_grouped.device_groups, b); - auto grouped_lhs_base_shape = - GetPerGroupBaseShape(lhs_grouped, lhs.base_shape()); - auto grouped_lhs_shard_shape = - MakePartitionedShape(grouped_lhs_base_shape, lhs.sharding()); - // Update convolution window with the new shape - auto new_window = conv_window; - for (const auto& dim : dims_mapping.parallel_spatial_dims) { - auto wd = new_window.mutable_dimensions(dim.spatial); - wd->set_size(grouped_lhs_shard_shape.dimensions(dim.lhs)); - wd->set_stride(std::max(1, wd->size() - 1)); - wd->set_base_dilation(wd->size()); - } - - auto new_partition_id = - lhs.state().collective_ops_creator.create_partition_id(b); - TF_ASSIGN_OR_RETURN( - auto conv, - PartitionConvolution( - PartitionedHlo(lhs.hlo(), grouped_lhs_base_shape, - per_group_partitioner_state), - PartitionedHlo(rhs.hlo(), - GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), - per_group_partitioner_state), - GetPerGroupBaseShape(output_grouped, output_base_shape), - output_grouped.sharding, new_window, original_hlo, - num_partitions / output_grouped.device_groups.size(), options, - new_partition_id, module, b)); - // Reset the LHS sharding to the ungrouped one. - lhs.hlo()->set_sharding(UngroupSharding(lhs_grouped)); - rhs.hlo()->set_sharding(UngroupSharding(rhs_grouped)); - conv->set_sharding(UngroupSharding(output_grouped)); - return PartitionedHlo(conv, output_base_shape, lhs.state()) - .Reshard(output_sharding) - .hlo(); -} - // Partition convolution with only one kind of dims partitioned. StatusOr PartitionConvolutionBaseCase( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, int64 num_partitions, - const SpmdPartitionerOptions& options, HloInstruction* partition_id, - HloModule* module, SpmdBuilder* b) { + const HloSharding& output_sharding, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + int64 num_partitions, const SpmdPartitionerOptions& options, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); - // Case 1: Either RHS or LHS is only partitioned at parallel dimensions. - TF_ASSIGN_OR_RETURN(auto parallel_partitioned_conv, - PartitionConvolutionWithParallelDimension( - lhs, rhs, output_base_shape, output_sharding, - conv_window, original_hlo, num_partitions, b)); - if (parallel_partitioned_conv) { - return parallel_partitioned_conv; + // Case 1: Handle depthwise convolution with batch group count or + // feature group count. 
+ if (original_hlo->batch_group_count() > 1) { + TF_ASSIGN_OR_RETURN( + auto parallel_partitioned_conv, + PartitionConvolutionWithBatchGroupCount( + lhs, rhs, output_base_shape, output_sharding, create_sharded_conv, + conv_window, original_hlo, num_partitions, b)); + if (parallel_partitioned_conv) { + return parallel_partitioned_conv; + } + } + + if (original_hlo->feature_group_count() > 1) { + TF_ASSIGN_OR_RETURN( + auto parallel_partitioned_conv, + PartitionConvolutionWithFeatureGroupCount( + lhs, rhs, output_base_shape, output_sharding, create_sharded_conv, + conv_window, original_hlo, num_partitions, b)); + if (parallel_partitioned_conv) { + return parallel_partitioned_conv; + } } // Case 2: both RHS and LHS are tiled. @@ -830,8 +858,8 @@ StatusOr PartitionConvolutionBaseCase( TF_ASSIGN_OR_RETURN( auto partitioned_conv, PartitionConvolutionWithSpatialDimensionHaloExchangeOnLHS( - lhs, rhs, output_base_shape, output_sharding, conv_window, - original_hlo, partition_id, module, b)); + lhs, rhs, output_base_shape, output_sharding, create_sharded_conv, + conv_window, original_hlo, partition_id, module, b)); if (partitioned_conv) { return partitioned_conv; } @@ -839,8 +867,8 @@ StatusOr PartitionConvolutionBaseCase( TF_ASSIGN_OR_RETURN( auto partitioned_conv, PartitionConvolutionWithSpatialDimensionHaloExchangeOnRHS( - lhs, rhs, output_base_shape, output_sharding, conv_window, - original_hlo, partition_id, module, b)); + lhs, rhs, output_base_shape, output_sharding, create_sharded_conv, + conv_window, original_hlo, partition_id, module, b)); if (partitioned_conv) { return partitioned_conv; @@ -853,7 +881,7 @@ StatusOr PartitionConvolutionBaseCase( TF_ASSIGN_OR_RETURN(auto partitioned_conv, PartitionConvolutionTiledOutput( lhs, rhs, output_base_shape, output_sharding, - conv_window, original_hlo, b)); + create_sharded_conv, conv_window, original_hlo, b)); if (partitioned_conv) { return partitioned_conv; @@ -862,151 +890,156 @@ StatusOr PartitionConvolutionBaseCase( return nullptr; } +StatusOr> CreateShardedConvConvolution( + const HloInstruction& conv, + const dot_as_convolution_util::DotConvolutionDimsInfo& dot_dnums, + HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo, + const Window& conv_window) { + CHECK_EQ(conv.opcode(), HloOpcode::kConvolution); + const auto& conv_dnums = conv.convolution_dimension_numbers(); + auto window = conv.window(); + for (const auto& dim : dot_dnums.batch_dims) { + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial_dim))); + wd->set_stride(std::max(1, wd->size() - 1)); + wd->set_base_dilation(wd->size()); + } + for (const auto& dim : dot_dnums.contracting_dims) { + if (dim.spatial_dim < 0) { + continue; + } + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial_dim))); + } + for (const auto& dim : dot_dnums.rhs_non_contracting_dims) { + if (dim.spatial_dim < 0) { + continue; + } + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_rhs_hlo->shape().dimensions( + conv_dnums.kernel_spatial_dimensions(dim.spatial_dim))); + wd->set_padding_high(wd->size() - 1); + wd->set_padding_low(wd->size() - 1); + } + + for (const auto& dim : dot_dnums.conv_spatial_dims) { + auto wd = window.mutable_dimensions(dim.spatial_dim); + const auto& new_window_dimension = conv_window.dimensions(dim.spatial_dim); + 
wd->set_size(new_window_dimension.size()); + wd->set_padding_high(new_window_dimension.padding_high()); + wd->set_padding_low(new_window_dimension.padding_low()); + } + + int64 feature_group_count = conv.feature_group_count(); + if (feature_group_count > 1) { + feature_group_count = sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_feature_dimension()) / + sharded_rhs_hlo->shape().dimensions( + conv_dnums.kernel_input_feature_dimension()); + } + + int64 batch_group_count = conv.batch_group_count(); + if (batch_group_count > 1) { + batch_group_count = + sharded_lhs_hlo->shape().dimensions(conv_dnums.input_batch_dimension()); + } + + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), + feature_group_count, batch_group_count, window, conv_dnums)); + *sharded_conv_shape.mutable_layout() = conv.shape().layout(); + return HloInstruction::CreateConvolve( + sharded_conv_shape, sharded_lhs_hlo, sharded_rhs_hlo, feature_group_count, + batch_group_count, window, conv_dnums, conv.precision_config()); +} + +} // namespace + // Partition convolution. StatusOr PartitionConvolution( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, const Window& conv_window, - HloInstruction* original_hlo, int64 num_partitions, - const SpmdPartitionerOptions& options, HloInstruction* partition_id, - HloModule* module, SpmdBuilder* b) { + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + int64 num_partitions, const SpmdPartitionerOptions& options, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b) { TF_RET_CHECK(original_hlo->opcode() == HloOpcode::kConvolution); - TF_ASSIGN_OR_RETURN( - auto try_partitioned_conv, - PartitionConvolutionBaseCase(lhs, rhs, output_base_shape, output_sharding, - conv_window, original_hlo, num_partitions, - options, partition_id, module, b)); + TF_ASSIGN_OR_RETURN(auto try_partitioned_conv, + PartitionConvolutionBaseCase( + lhs, rhs, output_base_shape, output_sharding, + create_sharded_conv, conv_window, original_hlo, + num_partitions, options, partition_id, module, b)); if (try_partitioned_conv) { return try_partitioned_conv; } - const auto& dnums = original_hlo->convolution_dimension_numbers(); - spmd::ConvolutionDimsMapping mapping; - for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { - int64 lhs_dim = dnums.input_spatial_dimensions(i); - int64 lhs_size = lhs.base_shape().dimensions(lhs_dim); - const auto& wd = original_hlo->window().dimensions(i); - int64 rhs_dim = dnums.kernel_spatial_dimensions(i); - int64 output_dim = dnums.output_spatial_dimensions(i); - if (dot_as_convolution_util::ConvSpatialDimensionIsParallel(wd, lhs_size)) { - mapping.parallel_spatial_dims.emplace_back(); - mapping.parallel_spatial_dims.back().lhs = lhs_dim; - mapping.parallel_spatial_dims.back().rhs = rhs_dim; - mapping.parallel_spatial_dims.back().output = output_dim; - mapping.parallel_spatial_dims.back().spatial = i; - } else { - mapping.non_parallel_spatial_dims.emplace_back(); - mapping.non_parallel_spatial_dims.back().lhs = lhs_dim; - mapping.non_parallel_spatial_dims.back().rhs = rhs_dim; - mapping.non_parallel_spatial_dims.back().output = output_dim; - mapping.non_parallel_spatial_dims.back().spatial = i; - } - } 
- - // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. - auto get_partitions_for_dims = - [&](const HloSharding& sharding, - absl::Span dims, - int lhs_rhs_or_output) { - int64 partitions = 1; - if (sharding.IsTileMaximal()) { - return partitions; - } - for (const auto& dim : dims) { - if (lhs_rhs_or_output == 0) { - partitions *= sharding.tile_assignment().dim(dim.lhs); - } else if (lhs_rhs_or_output == 1) { - partitions *= sharding.tile_assignment().dim(dim.rhs); - } else { - CHECK_EQ(lhs_rhs_or_output, 2); - partitions *= sharding.tile_assignment().dim(dim.output); - } - } - return partitions; - }; - - const int64 lhs_parallel_spatial_partitions = - get_partitions_for_dims(lhs.sharding(), mapping.parallel_spatial_dims, 0); - const int64 rhs_parallel_spatial_partitions = - get_partitions_for_dims(rhs.sharding(), mapping.parallel_spatial_dims, 1); - const int64 output_parallel_spatial_partitions = get_partitions_for_dims( - original_hlo->sharding(), mapping.parallel_spatial_dims, 2); - - // Recursively partition on different types of dimensions. - // - // Case 1: Group partitions by parallel spatial dims. - if (lhs_parallel_spatial_partitions == rhs_parallel_spatial_partitions && - lhs_parallel_spatial_partitions == output_parallel_spatial_partitions && - lhs_parallel_spatial_partitions > 1) { - TF_ASSIGN_OR_RETURN(auto try_partitioned_conv, - PartitionConvolutionGroupOnParallelDim( - lhs, rhs, output_base_shape, output_sharding, - conv_window, original_hlo, mapping, num_partitions, - options, partition_id, module, b)); - if (try_partitioned_conv) { - return try_partitioned_conv; - } - } - return nullptr; } -} // namespace - Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { - auto dot_dnums = dot_as_convolution_util::ParseDotGeneralFromConvolution(hlo); - if (dot_dnums) { - // Use HandleDotHelper() for convs that are actually einsums. 
- spmd::DotGeneralDimsMapping mapping; - for (const auto& dims : dot_dnums->batch_dims) { - mapping.batch_dims.emplace_back(); - mapping.batch_dims.back().lhs = dims.lhs; - mapping.batch_dims.back().rhs = dims.rhs; - mapping.batch_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->contracting_dims) { - mapping.contracting_dims.emplace_back(); - mapping.contracting_dims.back().lhs = dims.lhs; - mapping.contracting_dims.back().rhs = dims.rhs; - mapping.contracting_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->lhs_non_contracting_dims) { - mapping.lhs_non_contracting_dims.emplace_back(); - mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; - mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; - mapping.lhs_non_contracting_dims.back().output = dims.output; - } - for (const auto& dims : dot_dnums->rhs_non_contracting_dims) { - mapping.rhs_non_contracting_dims.emplace_back(); - mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; - mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; - mapping.rhs_non_contracting_dims.back().output = dims.output; - } - auto create_sharded_conv = - [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, - spmd::SpmdBuilder* b) -> StatusOr { + auto dims_info = dot_as_convolution_util::ParseConvolutionDimsInfo(hlo); + spmd::DotConvDimsMapping mapping; + for (const auto& dims : dims_info.batch_dims) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dims.lhs; + mapping.batch_dims.back().rhs = dims.rhs; + mapping.batch_dims.back().output = dims.output; + mapping.batch_dims.back().spatial = dims.spatial_dim; + } + for (const auto& dims : dims_info.contracting_dims) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dims.lhs; + mapping.contracting_dims.back().rhs = dims.rhs; + mapping.contracting_dims.back().output = dims.output; + mapping.contracting_dims.back().spatial = dims.spatial_dim; + } + for (const auto& dims : dims_info.lhs_non_contracting_dims) { + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.lhs_non_contracting_dims.back().output = dims.output; + mapping.lhs_non_contracting_dims.back().spatial = dims.spatial_dim; + } + for (const auto& dims : dims_info.rhs_non_contracting_dims) { + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.rhs_non_contracting_dims.back().output = dims.output; + mapping.rhs_non_contracting_dims.back().spatial = dims.spatial_dim; + } + for (const auto& dims : dims_info.conv_spatial_dims) { + mapping.conv_spatial_dims.emplace_back(); + mapping.conv_spatial_dims.back().lhs = dims.lhs; + mapping.conv_spatial_dims.back().rhs = dims.rhs; + mapping.conv_spatial_dims.back().output = dims.output; + mapping.conv_spatial_dims.back().spatial = dims.spatial_dim; + } + auto create_sharded_conv = + [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, + spmd::SpmdBuilder* b, + const Window& conv_window) -> StatusOr { + if (dims_info.conv_spatial_dims.empty()) { TF_ASSIGN_OR_RETURN( auto sharded_conv, dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( - *hlo, *dot_dnums, lhs_hlo, rhs_hlo)); + *hlo, dims_info, lhs_hlo, rhs_hlo)); return b->AddInstruction(std::move(sharded_conv)); - }; - return HandleDotHelper(hlo, mapping, create_sharded_conv); - } + } else { + 
TF_ASSIGN_OR_RETURN(auto sharded_conv, + CreateShardedConvConvolution(*hlo, dims_info, lhs_hlo, + rhs_hlo, conv_window)); + return b->AddInstruction(std::move(sharded_conv)); + } + }; - auto lhs = GetPartitionedHlo(hlo->operand(0)); - auto rhs = GetPartitionedHlo(hlo->operand(1)); - TF_ASSIGN_OR_RETURN( - auto partitioned_conv, - PartitionConvolution(lhs, rhs, hlo->shape(), hlo->sharding(), - hlo->window(), hlo, num_partitions_, options_, - partition_id_, module_, &b_)); - - if (partitioned_conv) { - SetPartitionedHlo(hlo, [&] { return partitioned_conv; }); - return Status::OK(); - } - return DefaultAction(hlo); + return HandleDotHelper(hlo, mapping, create_sharded_conv); } } // namespace spmd diff --git a/tensorflow/compiler/xla/service/spmd/convolution_handler.h b/tensorflow/compiler/xla/service/spmd/convolution_handler.h new file mode 100644 index 00000000000..2d929da54e7 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/convolution_handler.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_CONVOLUTION_HANDLER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_CONVOLUTION_HANDLER_H_ + +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +namespace xla { +namespace spmd { + +// Partition convolution. +StatusOr PartitionConvolution( + PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_conv, + const Window& conv_window, HloInstruction* original_hlo, + int64 num_partitions, const SpmdPartitionerOptions& options, + HloInstruction* partition_id, HloModule* module, SpmdBuilder* b); + +} // namespace spmd +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_CONVOLUTION_HANDLER_H_ diff --git a/tensorflow/compiler/xla/service/spmd/dot_handler.cc b/tensorflow/compiler/xla/service/spmd/dot_handler.cc index da432965497..45bd79bfc75 100644 --- a/tensorflow/compiler/xla/service/spmd/dot_handler.cc +++ b/tensorflow/compiler/xla/service/spmd/dot_handler.cc @@ -19,15 +19,19 @@ limitations under the License. 
#include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/service/hlo_sharding_util.h" #include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/service/spmd/convolution_handler.h" #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/platform/numbers.h" @@ -36,7 +40,7 @@ namespace xla { namespace spmd { Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { - DotGeneralDimsMapping mapping; + DotConvDimsMapping mapping; const auto& dnums = hlo->dot_dimension_numbers(); int64 next_output_dim = 0; for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { @@ -71,8 +75,9 @@ Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { mapping.rhs_non_contracting_dims.back().rhs = i; mapping.rhs_non_contracting_dims.back().output = next_output_dim++; } - auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r, - SpmdBuilder* b) -> StatusOr { + auto create_sharded_dot = + [&](HloInstruction* l, HloInstruction* r, SpmdBuilder* b, + const Window& conv_window) -> StatusOr { TF_ASSIGN_OR_RETURN( auto sharded_dot_shape, ShapeInference::InferDotOpShape(l->shape(), r->shape(), @@ -86,19 +91,32 @@ Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { namespace { +std::vector GetAllDevicesInOrder(const HloSharding& sharding) { + CHECK(!sharding.IsTileMaximal()); + std::vector results; + results.reserve(sharding.tile_assignment().num_elements()); + sharding.tile_assignment().Each( + [&](absl::Span /* indices */, int64 device) { + results.push_back(device); + }); + return results; +} + StatusOr PartitionBaseCase( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, int64 lhs_batch_partitions, - int64 rhs_batch_partitions, int64 output_batch_partitions, - int64 lhs_contracting_partitions, int64 rhs_contracting_partitions, - int64 lhs_non_contracting_partitions, int64 rhs_non_contracting_partitions, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, + int64 lhs_batch_partitions, int64 rhs_batch_partitions, + int64 output_batch_partitions, int64 lhs_contracting_partitions, + int64 rhs_contracting_partitions, int64 lhs_non_contracting_partitions, + int64 rhs_non_contracting_partitions, int64 output_lhs_non_contracting_partitions, int64 output_rhs_non_contracting_partitions, - 
int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops, bool may_reshard_without_detecting_match) { @@ -116,7 +134,7 @@ StatusOr PartitionBaseCase( std::vector output_to_lhs_indices(output_base_shape.rank(), -1); std::vector output_to_rhs_indices(output_base_shape.rank(), -1); auto populate_indices_mapping = - [&](const DotGeneralDimsMapping::DimsMapping& mapping) { + [&](const DotConvDimsMapping::DimsMapping& mapping) { if (mapping.lhs >= 0) { lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; lhs_to_output_indices[mapping.lhs] = mapping.output; @@ -142,6 +160,9 @@ StatusOr PartitionBaseCase( for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { populate_indices_mapping(mapping); } + for (const auto& mapping : dims_mapping.conv_spatial_dims) { + populate_indices_mapping(mapping); + } auto lhs_sharding_transposed_to_match_rhs = hlo_sharding_util::TransposeShardingWithCollapsedDims( lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); @@ -166,7 +187,8 @@ StatusOr PartitionBaseCase( if (lhs_batch_partitions == rhs_batch_partitions && rhs_batch_partitions == num_partitions && lhs_sharding_transposed_to_match_rhs == rhs_sharding) { - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b, conv_window)); dot->set_sharding(*lhs_sharding_transposed_to_match_output); return PartitionedHlo(dot, output_base_shape, lhs.state()) .Reshard(output_sharding) @@ -192,7 +214,8 @@ StatusOr PartitionBaseCase( } auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); TF_ASSIGN_OR_RETURN( - auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), b)); + auto dot, + create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), b, conv_window)); return dot; } // RHS and output are batch partitioned in the same way. @@ -208,7 +231,8 @@ StatusOr PartitionBaseCase( } auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); TF_ASSIGN_OR_RETURN( - auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), b)); + auto dot, + create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), b, conv_window)); return dot; } return nullptr; @@ -306,8 +330,8 @@ StatusOr PartitionBaseCase( dot_rhs = slice; } } - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(dot_lhs, dot_rhs, &body_b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(dot_lhs, dot_rhs, &body_b, conv_window)); if (windowed_at_contracting_dims) { // Accumulate the partial output to the result buffer. 
o = body_b.AddInstruction( @@ -408,7 +432,7 @@ StatusOr PartitionBaseCase( if (output_lhs_non_contracting_partitions == num_partitions && output_sharding_transposed_to_match_lhs == lhs_sharding && ShapeSizeInBytes(rhs.base_shape()) >= - threshold_for_windowed_einsum_mib * 1024 * 1024) { + options.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (rhs_contracting_partitions == num_partitions) { return emit_windowed_dot_general(0, 1, true, false); } @@ -422,7 +446,7 @@ StatusOr PartitionBaseCase( if (output_rhs_non_contracting_partitions == num_partitions && output_sharding_transposed_to_match_rhs == rhs_sharding && ShapeSizeInBytes(lhs.base_shape()) >= - threshold_for_windowed_einsum_mib * 1024 * 1024) { + options.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (lhs_contracting_partitions == num_partitions) { return emit_windowed_dot_general(1, 0, true, false); } @@ -461,10 +485,12 @@ StatusOr PartitionBaseCase( rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); } - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b, conv_window)); auto ar = lhs.state().collective_ops_creator.create_cross_partition_all_reduce( - b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), {}, + b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), + {GetAllDevicesInOrder(lhs.sharding())}, (*lhs.state().next_channel_id)++); ar->set_sharding(HloSharding::Replicate()); return PartitionedHlo(ar, output_base_shape, lhs.state()) @@ -477,8 +503,8 @@ StatusOr PartitionBaseCase( output_lhs_non_contracting_partitions == num_partitions && lhs_sharding_transposed_to_match_output == output_sharding) { auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs.hlo(), rhs_replicated, b)); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs_replicated, + b, conv_window)); return dot; } @@ -487,8 +513,8 @@ StatusOr PartitionBaseCase( output_rhs_non_contracting_partitions == num_partitions && rhs_sharding_transposed_to_match_output == output_sharding) { auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); - TF_ASSIGN_OR_RETURN(auto dot, - create_sharded_dot(lhs_replicated, rhs.hlo(), b)); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs_replicated, rhs.hlo(), + b, conv_window)); return dot; } @@ -499,8 +525,9 @@ StatusOr PartitionBaseCase( lhs.Reshard(*output_sharding_transposed_to_match_lhs); auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), - resharded_rhs.hlo(), b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(resharded_lhs.hlo(), resharded_rhs.hlo(), + b, conv_window)); return dot; } // Output is partitioned along LHS non-contracting dimensions. @@ -509,8 +536,8 @@ StatusOr PartitionBaseCase( lhs.Reshard(*output_sharding_transposed_to_match_lhs); auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); TF_ASSIGN_OR_RETURN( - auto dot, - create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), b)); + auto dot, create_sharded_dot(resharded_lhs.hlo(), + replicated_rhs.hlo(), b, conv_window)); return dot; } // Output is partitioned along RHS non-contracting dimensions. 
@@ -518,8 +545,9 @@ StatusOr PartitionBaseCase( auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), - resharded_rhs.hlo(), b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), b, conv_window)); return dot; } } @@ -562,9 +590,11 @@ StatusOr PartitionBaseCase( rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); } - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), rhs.hlo(), b, conv_window)); return lhs.state().collective_ops_creator.create_cross_partition_all_reduce( - b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), {}, + b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), + {GetAllDevicesInOrder(lhs.sharding())}, (*lhs.state().next_channel_id)++); } return nullptr; @@ -572,26 +602,28 @@ StatusOr PartitionBaseCase( StatusOr PartitionDot( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops); StatusOr PartitionDotGroupOnBatch( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, - int64 lhs_contracting_partitions, int64 rhs_contracting_partitions, - int64 lhs_non_contracting_partitions, int64 rhs_non_contracting_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, int64 lhs_contracting_partitions, + int64 rhs_contracting_partitions, int64 lhs_non_contracting_partitions, + int64 rhs_non_contracting_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, bool require_matching_devices_to_group, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { std::vector> @@ -804,9 +836,8 @@ StatusOr PartitionDotGroupOnBatch( GetPerGroupBaseShape(output_grouped, output_base_shape), output_grouped.sharding, dims_mapping, num_partitions / output_grouped.device_groups.size(), - create_sharded_dot, module, original_hlo, - threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + create_sharded_dot, conv_window, module, original_hlo, + options, b, windowed_dot_general_loops)); dot->set_sharding(UngroupSharding(output_grouped)); return PartitionedHlo(dot, output_base_shape, lhs.state()) 
.Reshard(output_sharding) @@ -816,17 +847,18 @@ StatusOr PartitionDotGroupOnBatch( StatusOr PartitionDotGroupOnNonContracting( bool lhs_matching, PartitionedHlo matching, PartitionedHlo other, int64 matching_contracting_partitions, int64 other_contracting_partitions, - absl::Span + absl::Span partitioned_non_contractin_dims, int64 other_non_contracting_partitions, int64 output_other_non_contracting_partitions, const Shape& output_base_shape, const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const DotConvDimsMapping& dims_mapping, int64 num_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, bool require_matching_devices_to_group, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { std::vector> @@ -921,7 +953,7 @@ StatusOr PartitionDotGroupOnNonContracting( other.sharding(), {other_group_dims[0]}, {other.sharding().tile_assignment().dimensions().back() / group_count}), - output_grouped); + output_grouped, /*ignore_group_order=*/true); other = other.Reshard(UngroupSharding(grouped)); partially_replicated_other = other.hlo(); top_level_sharding_to_reset.emplace_back(other.hlo(), other.sharding()); @@ -949,25 +981,25 @@ StatusOr PartitionDotGroupOnNonContracting( GetPerGroupBaseShape(output_grouped, output_base_shape), output_grouped.sharding, dims_mapping, num_partitions / matching_grouped.device_groups.size(), - create_sharded_dot, module, original_hlo, - threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + create_sharded_dot, conv_window, module, original_hlo, + options, b, windowed_dot_general_loops)); return dot; } StatusOr PartitionDotGroupOnContracting( PartitionedHlo lhs, PartitionedHlo rhs, - absl::Span + absl::Span partitioned_contractin_dims, int64 output_batch_partitions, int64 output_lhs_non_contracting_partitions, int64 output_rhs_non_contracting_partitions, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, bool require_matching_devices_to_group, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { std::vector> @@ -1043,7 +1075,8 @@ StatusOr PartitionDotGroupOnContracting( {output_sharding.tile_assignment().num_dimensions() - 1}, {output_sharding.tile_assignment().dimensions().back() / group_count}), - lhs_grouped); + lhs_grouped, + /*ignore_group_order=*/true); outer_output_tmp_sharding = UngroupSharding(grouped); inner_output_sharding = std::move(grouped.sharding); } else { @@ -1088,10 +1121,9 @@ StatusOr PartitionDotGroupOnContracting( PartitionedHlo(rhs.hlo(), GetPerGroupBaseShape(rhs_grouped, rhs.base_shape()), 
inner_state), - MakePartitionedShape(output_base_shape, outer_output_tmp_sharding), - inner_output_sharding, dims_mapping, num_partitions / group_count, - create_sharded_dot, module, original_hlo, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + output_base_shape, inner_output_sharding, dims_mapping, + num_partitions / group_count, create_sharded_dot, conv_window, module, + original_hlo, options, b, windowed_dot_general_loops)); if (!dot) { return nullptr; } @@ -1107,31 +1139,73 @@ StatusOr PartitionDotGroupOnContracting( inverse_grouped.device_groups, b) .collective_ops_creator.create_cross_partition_all_reduce( b, dot, MakeBinaryAdd(output_base_shape.element_type(), module), - {}, (*lhs.state().next_channel_id)++); + {GetAllDevicesInOrder(inverse_grouped.sharding)}, + (*lhs.state().next_channel_id)++); ar->set_sharding(outer_output_tmp_sharding); return PartitionedHlo(ar, output_base_shape, lhs.state()) .Reshard(output_sharding) .hlo(); } +DotConvDimsMapping ConvertDimsMappingWithFeatureGroupCount( + const DotConvDimsMapping& dims_mapping, HloInstruction* original_hlo) { + const auto& dnums = original_hlo->convolution_dimension_numbers(); + DotConvDimsMapping new_dims_mapping; + new_dims_mapping.batch_dims = dims_mapping.batch_dims; + new_dims_mapping.conv_spatial_dims = dims_mapping.conv_spatial_dims; + // Append batch dims. + new_dims_mapping.batch_dims.emplace_back(); + new_dims_mapping.batch_dims.back().lhs = dnums.input_feature_dimension(); + new_dims_mapping.batch_dims.back().rhs = + dnums.kernel_output_feature_dimension(); + new_dims_mapping.batch_dims.back().output = dnums.output_feature_dimension(); + new_dims_mapping.batch_dims.back().spatial = -1; + // Setup non contracting dims. + new_dims_mapping.lhs_non_contracting_dims.emplace_back(); + new_dims_mapping.lhs_non_contracting_dims.back().lhs = + dnums.input_batch_dimension(); + new_dims_mapping.rhs_non_contracting_dims.emplace_back(); + new_dims_mapping.rhs_non_contracting_dims.back().rhs = + dnums.kernel_input_feature_dimension(); + return new_dims_mapping; +} + +DotConvDimsMapping ConvertDimsMappingWithBatchGroupCount( + const DotConvDimsMapping& dims_mapping, HloInstruction* original_hlo) { + const auto& dnums = original_hlo->convolution_dimension_numbers(); + DotConvDimsMapping new_dims_mapping; + new_dims_mapping.batch_dims = dims_mapping.batch_dims; + new_dims_mapping.conv_spatial_dims = dims_mapping.conv_spatial_dims; + new_dims_mapping.contracting_dims = dims_mapping.contracting_dims; + // Append batch dims. + new_dims_mapping.batch_dims.emplace_back(); + new_dims_mapping.batch_dims.back().lhs = dnums.input_batch_dimension(); + new_dims_mapping.batch_dims.back().rhs = + dnums.kernel_output_feature_dimension(); + new_dims_mapping.batch_dims.back().output = dnums.output_feature_dimension(); + new_dims_mapping.batch_dims.back().spatial = -1; + return new_dims_mapping; +} + // Recursive partitioning function. If there are partial dimensions matching in // the operands and output, group the devices and recursively partition the // in-group dot. 
StatusOr PartitionDot( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, bool require_matching_devices_to_group, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. auto get_partitions_for_dims = [&](const HloSharding& sharding, - absl::Span dims, + absl::Span dims, int lhs_rhs_or_output) { int64 partitions = 1; if (sharding.IsTileMaximal()) { @@ -1167,19 +1241,112 @@ StatusOr PartitionDot( output_sharding, dims_mapping.lhs_non_contracting_dims, 2); const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( output_sharding, dims_mapping.rhs_non_contracting_dims, 2); + const int64 lhs_conv_spatial_partitions = get_partitions_for_dims( + lhs.sharding(), dims_mapping.conv_spatial_dims, 0); + const int64 rhs_conv_spatial_partitions = get_partitions_for_dims( + rhs.sharding(), dims_mapping.conv_spatial_dims, 1); + const int64 output_conv_spatial_partitions = get_partitions_for_dims( + output_sharding, dims_mapping.conv_spatial_dims, 2); // Before we find partial matches along the dimensions, invoke base case again // without may_reshard_without_detecting_match. + + // Try partition the purely spatially-partitioned convolution with convolution + // spatial dimension partitioned or depthwise parallel dimension partitioned. + bool is_conv_spatial_dim_partitioned = + (lhs_conv_spatial_partitions > 1 || rhs_conv_spatial_partitions > 1 || + output_conv_spatial_partitions > 1); + bool is_conv_batch_or_contracting_dim_partitioned = + (lhs_batch_partitions > 1 || rhs_batch_partitions > 1 || + output_batch_partitions > 1 || + (lhs_contracting_partitions > 1 && rhs_contracting_partitions > 1)); + if ((!dims_mapping.conv_spatial_dims.empty() && + is_conv_spatial_dim_partitioned && + !is_conv_batch_or_contracting_dim_partitioned) || + (original_hlo->opcode() == HloOpcode::kConvolution && + (original_hlo->batch_group_count() > 1 || + original_hlo->feature_group_count() > 1))) { + // Partition with kernel_input_feature_dim > 1 and feature_group_count > 1 + // is not supported. + const auto& dnums = original_hlo->convolution_dimension_numbers(); + if (original_hlo->feature_group_count() > 1 && + rhs.hlo()->shape().dimensions(dnums.kernel_input_feature_dimension()) > + 1) { + return nullptr; + } + + TF_ASSIGN_OR_RETURN( + auto partitioned_conv, + PartitionConvolution(lhs, rhs, output_base_shape, output_sharding, + dims_mapping, create_sharded_dot, conv_window, + original_hlo, num_partitions, options, + lhs.state().partition_id, module, b)); + + if (partitioned_conv) { + return partitioned_conv; + } + + // Recursively partition on different types of dimensions for convolution. + // Case 0.a: Group partitions by feature group count. 
+ if (original_hlo->feature_group_count() > 1 || + original_hlo->batch_group_count() > 1) { + DotConvDimsMapping new_dims_mapping; + if (original_hlo->feature_group_count() > 1) { + new_dims_mapping = + ConvertDimsMappingWithFeatureGroupCount(dims_mapping, original_hlo); + } + + if (original_hlo->batch_group_count() > 1) { + new_dims_mapping = + ConvertDimsMappingWithBatchGroupCount(dims_mapping, original_hlo); + } + + const int64 conv_lhs_contracting_partitions = get_partitions_for_dims( + lhs.sharding(), new_dims_mapping.contracting_dims, 0); + const int64 conv_rhs_contracting_partitions = get_partitions_for_dims( + rhs.sharding(), new_dims_mapping.contracting_dims, 1); + const int64 conv_lhs_non_contracting_partitions = get_partitions_for_dims( + lhs.sharding(), new_dims_mapping.lhs_non_contracting_dims, 0); + const int64 conv_rhs_non_contracting_partitions = get_partitions_for_dims( + rhs.sharding(), new_dims_mapping.rhs_non_contracting_dims, 1); + const int64 conv_lhs_batch_partitions = get_partitions_for_dims( + lhs.sharding(), new_dims_mapping.batch_dims, 0); + const int64 conv_rhs_batch_partitions = get_partitions_for_dims( + rhs.sharding(), new_dims_mapping.batch_dims, 1); + const int64 conv_output_batch_partitions = get_partitions_for_dims( + output_sharding, new_dims_mapping.batch_dims, 2); + if ((conv_lhs_batch_partitions == conv_output_batch_partitions || + conv_rhs_batch_partitions == conv_output_batch_partitions) && + conv_output_batch_partitions > 1) { + TF_ASSIGN_OR_RETURN( + auto try_partitioned_conv, + PartitionDotGroupOnBatch( + lhs, rhs, output_base_shape, output_sharding, new_dims_mapping, + num_partitions, conv_lhs_contracting_partitions, + conv_rhs_contracting_partitions, + conv_lhs_non_contracting_partitions, + conv_rhs_non_contracting_partitions, create_sharded_dot, + conv_window, module, original_hlo, + require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); + if (try_partitioned_conv) { + return try_partitioned_conv; + } + } + return nullptr; + } + } + TF_ASSIGN_OR_RETURN( auto try_partitioned_dot, PartitionBaseCase( lhs, rhs, output_base_shape, output_sharding, dims_mapping, - num_partitions, create_sharded_dot, module, original_hlo, + num_partitions, create_sharded_dot, conv_window, module, original_hlo, lhs_batch_partitions, rhs_batch_partitions, output_batch_partitions, lhs_contracting_partitions, rhs_contracting_partitions, lhs_non_contracting_partitions, rhs_non_contracting_partitions, output_lhs_non_contracting_partitions, - output_rhs_non_contracting_partitions, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops, + output_rhs_non_contracting_partitions, options, b, + windowed_dot_general_loops, /*may_reshard_without_detecting_match=*/false)); if (try_partitioned_dot) { return try_partitioned_dot; @@ -1197,9 +1364,9 @@ StatusOr PartitionDot( lhs, rhs, output_base_shape, output_sharding, dims_mapping, num_partitions, lhs_contracting_partitions, rhs_contracting_partitions, lhs_non_contracting_partitions, - rhs_non_contracting_partitions, create_sharded_dot, module, - original_hlo, require_matching_devices_to_group, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + rhs_non_contracting_partitions, create_sharded_dot, conv_window, + module, original_hlo, require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); if (dot) { return dot; } @@ -1222,7 +1389,6 @@ StatusOr PartitionDot( ShapeUtil::ByteSizeOf(rhs.hlo()->shape()) <= rhs_non_contracting_partitions * 
ShapeUtil::ByteSizeOf(lhs.hlo()->shape())); - TF_ASSIGN_OR_RETURN( auto dot, PartitionDotGroupOnNonContracting( @@ -1238,9 +1404,9 @@ StatusOr PartitionDot( lhs_matching ? output_rhs_non_contracting_partitions : output_lhs_non_contracting_partitions, output_base_shape, output_sharding, dims_mapping, num_partitions, - create_sharded_dot, module, original_hlo, - require_matching_devices_to_group, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + create_sharded_dot, conv_window, module, original_hlo, + require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); if (dot) { return dot; } @@ -1248,7 +1414,7 @@ StatusOr PartitionDot( if (lhs_non_contracting_partitions > 1 && output_lhs_non_contracting_partitions > 1) { // If part of LHS non-contracting dims match output, try them. - std::vector matching_dims; + std::vector matching_dims; for (const auto& dim : dims_mapping.lhs_non_contracting_dims) { int64 lhs_partitions = lhs.sharding().tile_assignment().dim(dim.lhs); if (lhs_partitions > 1 && @@ -1258,16 +1424,15 @@ StatusOr PartitionDot( } if (!matching_dims.empty()) { TF_ASSIGN_OR_RETURN( - auto dot, - PartitionDotGroupOnNonContracting( - /*lhs_matching=*/true, lhs, rhs, lhs_contracting_partitions, - rhs_contracting_partitions, matching_dims, - rhs_non_contracting_partitions, - output_rhs_non_contracting_partitions, output_base_shape, - output_sharding, dims_mapping, num_partitions, create_sharded_dot, - module, original_hlo, require_matching_devices_to_group, - threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + auto dot, PartitionDotGroupOnNonContracting( + /*lhs_matching=*/true, lhs, rhs, + lhs_contracting_partitions, rhs_contracting_partitions, + matching_dims, rhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, + output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, conv_window, module, + original_hlo, require_matching_devices_to_group, + options, b, windowed_dot_general_loops)); if (dot) { return dot; } @@ -1276,7 +1441,7 @@ StatusOr PartitionDot( if (rhs_non_contracting_partitions > 1 && output_rhs_non_contracting_partitions > 1) { // If part of RHS non-contracting dims match output, try them. 
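A small illustrative sketch of the partial-match selection used in these non-contracting branches: a dimension is kept in matching_dims only when it is partitioned and its tile count agrees with the output sharding on the mapped output dimension. The flat tile-count vectors are an assumed simplification of the real tile assignments.

#include <cstdint>
#include <vector>

struct DimPair { int64_t operand_dim; int64_t output_dim; };

std::vector<DimPair> MatchingNonContractingDims(
    const std::vector<DimPair>& non_contracting_dims,
    const std::vector<int64_t>& operand_tiles_per_dim,
    const std::vector<int64_t>& output_tiles_per_dim) {
  std::vector<DimPair> matching;
  for (const DimPair& dim : non_contracting_dims) {
    const int64_t partitions = operand_tiles_per_dim[dim.operand_dim];
    if (partitions > 1 && partitions == output_tiles_per_dim[dim.output_dim]) {
      matching.push_back(dim);  // grouped partitioning is attempted on these
    }
  }
  return matching;
}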
- std::vector matching_dims; + std::vector matching_dims; for (const auto& dim : dims_mapping.rhs_non_contracting_dims) { int64 rhs_partitions = rhs.sharding().tile_assignment().dim(dim.rhs); if (rhs_partitions > 1 && @@ -1286,16 +1451,15 @@ StatusOr PartitionDot( } if (!matching_dims.empty()) { TF_ASSIGN_OR_RETURN( - auto dot, - PartitionDotGroupOnNonContracting( - /*lhs_matching=*/false, rhs, lhs, rhs_contracting_partitions, - lhs_contracting_partitions, matching_dims, - lhs_non_contracting_partitions, - output_lhs_non_contracting_partitions, output_base_shape, - output_sharding, dims_mapping, num_partitions, create_sharded_dot, - module, original_hlo, require_matching_devices_to_group, - threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + auto dot, PartitionDotGroupOnNonContracting( + /*lhs_matching=*/false, rhs, lhs, + rhs_contracting_partitions, lhs_contracting_partitions, + matching_dims, lhs_non_contracting_partitions, + output_lhs_non_contracting_partitions, + output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, conv_window, module, + original_hlo, require_matching_devices_to_group, + options, b, windowed_dot_general_loops)); if (dot) { return dot; } @@ -1312,15 +1476,16 @@ StatusOr PartitionDot( output_lhs_non_contracting_partitions, output_rhs_non_contracting_partitions, output_base_shape, output_sharding, dims_mapping, num_partitions, create_sharded_dot, - module, original_hlo, require_matching_devices_to_group, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops)); + conv_window, module, original_hlo, + require_matching_devices_to_group, options, b, + windowed_dot_general_loops)); if (dot) { return dot; } } if (lhs_contracting_partitions > 1 && rhs_contracting_partitions > 1) { // If part of contracting dims match, try them. 
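To make the contracting-dimension branches concrete, here is a standalone sketch (plain C++, not XLA code) of why grouping on contracting dimensions ends with a cross-partition reduction: each partition multiplies only its slice of the contracting dimension, and the partial results must be summed element-wise across partitions, which is the all-reduce the grouped path emits.

// C_partial = A[:, k_begin:k_end] x B[k_begin:k_end, :]; summing C_partial
// element-wise across all partitions yields the full product.
void PartialMatmulOverContractingSlice(const float* a, const float* b,
                                       float* c_partial, int m, int n,
                                       int k_total, int k_begin, int k_end) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int k = k_begin; k < k_end; ++k) {
        acc += a[i * k_total + k] * b[k * n + j];
      }
      c_partial[i * n + j] = acc;
    }
  }
}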
- std::vector matching_dims; + std::vector matching_dims; for (const auto& dim : dims_mapping.contracting_dims) { int64 lhs_partitions = lhs.sharding().tile_assignment().dim(dim.lhs); if (lhs_partitions > 1 && @@ -1330,15 +1495,14 @@ StatusOr PartitionDot( } if (!matching_dims.empty()) { TF_ASSIGN_OR_RETURN( - auto dot, - PartitionDotGroupOnContracting( - lhs, rhs, matching_dims, output_batch_partitions, - output_lhs_non_contracting_partitions, - output_rhs_non_contracting_partitions, output_base_shape, - output_sharding, dims_mapping, num_partitions, create_sharded_dot, - module, original_hlo, require_matching_devices_to_group, - threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + auto dot, PartitionDotGroupOnContracting( + lhs, rhs, matching_dims, output_batch_partitions, + output_lhs_non_contracting_partitions, + output_rhs_non_contracting_partitions, + output_base_shape, output_sharding, dims_mapping, + num_partitions, create_sharded_dot, conv_window, module, + original_hlo, require_matching_devices_to_group, + options, b, windowed_dot_general_loops)); if (dot) { return dot; } @@ -1358,8 +1522,8 @@ StatusOr PartitionDot( PartitionDot(PartitionedHlo(lhs.hlo(), lhs.base_shape(), inner_state), PartitionedHlo(rhs.hlo(), rhs.base_shape(), inner_state), output_base_shape, grouped_output.sharding, dims_mapping, - output_sharding.NumTiles(), create_sharded_dot, module, - original_hlo, threshold_for_windowed_einsum_mib, b, + output_sharding.NumTiles(), create_sharded_dot, + conv_window, module, original_hlo, options, b, windowed_dot_general_loops)); if (dot) { return dot; @@ -1372,13 +1536,13 @@ StatusOr PartitionDot( auto dot, PartitionBaseCase( lhs, rhs, output_base_shape, output_sharding, dims_mapping, - num_partitions, create_sharded_dot, module, original_hlo, + num_partitions, create_sharded_dot, conv_window, module, original_hlo, lhs_batch_partitions, rhs_batch_partitions, output_batch_partitions, lhs_contracting_partitions, rhs_contracting_partitions, lhs_non_contracting_partitions, rhs_non_contracting_partitions, output_lhs_non_contracting_partitions, - output_rhs_non_contracting_partitions, - threshold_for_windowed_einsum_mib, b, windowed_dot_general_loops, + output_rhs_non_contracting_partitions, options, b, + windowed_dot_general_loops, /*may_reshard_without_detecting_match=*/true)); if (dot) { return dot; @@ -1388,12 +1552,13 @@ StatusOr PartitionDot( StatusOr PartitionDot( PartitionedHlo lhs, PartitionedHlo rhs, const Shape& output_base_shape, - const HloSharding& output_sharding, - const DotGeneralDimsMapping& dims_mapping, int64 num_partitions, + const HloSharding& output_sharding, const DotConvDimsMapping& dims_mapping, + int64 num_partitions, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot, - HloModule* module, HloInstruction* original_hlo, - int64 threshold_for_windowed_einsum_mib, SpmdBuilder* b, + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot, + const Window& conv_window, HloModule* module, HloInstruction* original_hlo, + const SpmdPartitionerOptions& options, SpmdBuilder* b, std::vector* windowed_dot_general_loops) { // First try partitioning without resharding the groups, then try allow @@ -1402,18 +1567,18 @@ StatusOr PartitionDot( TF_ASSIGN_OR_RETURN( auto try_partition, PartitionDot(lhs, rhs, output_base_shape, output_sharding, dims_mapping, - num_partitions, create_sharded_dot, module, original_hlo, - require_matching_devices_to_group, - 
threshold_for_windowed_einsum_mib, b, - windowed_dot_general_loops)); + num_partitions, create_sharded_dot, conv_window, module, + original_hlo, require_matching_devices_to_group, options, + b, windowed_dot_general_loops)); if (try_partition) { return try_partition; } } // Default action. - TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(lhs.Replicate().hlo(), - rhs.Replicate().hlo(), b)); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.Replicate().hlo(), rhs.Replicate().hlo(), + b, conv_window)); dot->set_sharding(HloSharding::Replicate()); return PartitionedHlo(dot, output_base_shape, lhs.state()) .Reshard(output_sharding) @@ -1423,17 +1588,22 @@ StatusOr PartitionDot( } // namespace Status SpmdPartitioningVisitor::HandleDotHelper( - HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + HloInstruction* hlo, const DotConvDimsMapping& dims_mapping, const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot) { auto& lhs = GetPartitionedHlo(hlo->operand(0)); auto& rhs = GetPartitionedHlo(hlo->operand(1)); + Window conv_window; + if (hlo->opcode() == HloOpcode::kConvolution) { + conv_window = hlo->window(); + } + TF_ASSIGN_OR_RETURN( auto partitioned_dot, PartitionDot(lhs, rhs, hlo->shape(), hlo->sharding(), dims_mapping, - num_partitions_, create_sharded_dot, module_, hlo, - options_.threshold_for_windowed_einsum_mib, &b_, - &windowed_dot_general_loops_)); + num_partitions_, create_sharded_dot, conv_window, module_, + hlo, options_, &b_, &windowed_dot_general_loops_)); SetPartitionedHlo(hlo, [&] { return partitioned_dot; }); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc index cc97d5ebda7..bdc96afba88 100644 --- a/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc +++ b/tensorflow/compiler/xla/service/spmd/schedule_aware_all_gather_cse.cc @@ -88,7 +88,7 @@ StatusOr RunOnComputation(HloComputation* comp, bool for_replicas, auto& earlier_ags = operand_to_ag[ag->operand(0)]; bool found = false; - int64 lowest_user_h = lowest_user_height(ag); + int64 ag_height = height[ag]; for (auto& eag : earlier_ags) { auto old_channel_id = ag->channel_id(); if (eag->channel_id() && ag->channel_id()) { @@ -100,7 +100,7 @@ StatusOr RunOnComputation(HloComputation* comp, bool for_replicas, } found = true; ag->set_channel_id(old_channel_id); - if (lowest_user_height(eag) > lowest_user_h + distance_threshold) { + if (lowest_user_height(eag) > ag_height + distance_threshold) { eag = ag; continue; } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index f16b7bacda3..ceb81330639 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -23,6 +23,7 @@ limitations under the License. 
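The schedule_aware_all_gather_cse.cc change above moves the baseline of the distance check from the new all-gather's lowest user to the new all-gather itself. A hedged standalone restatement of the resulting condition, with the height values assumed to come from the pass's precomputed height map:

#include <cstdint>

// Reuse an earlier, equivalent all-gather only while the height of its lowest
// user stays within distance_threshold of the new all-gather's own height;
// otherwise keep the new all-gather as a separate instruction and record it
// as the candidate for future reuse.
bool ShouldReuseEarlierAllGather(int64_t earlier_lowest_user_height,
                                 int64_t new_all_gather_height,
                                 int64_t distance_threshold) {
  return earlier_lowest_user_height <=
         new_all_gather_height + distance_threshold;
}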
#include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/types/optional.h" @@ -216,6 +217,125 @@ HloInstruction* SpmdBuilder::AddInstruction( if (visiting_hlo_) { instructions_[visiting_hlo_].push_back(hlo); } + if (hlo->opcode() == HloOpcode::kBroadcast) { + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i)) { + broadcast_dims_[hlo].insert(i); + } + } + } + if (hlo->IsElementwise() && hlo->operand_count() > 0) { + absl::flat_hash_set broadcast_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + broadcast_dims.insert(i); + } + for (int64 i = 0; i < hlo->operand_count(); ++i) { + auto it = broadcast_dims_.find(hlo->operand(i)); + if (it == broadcast_dims_.end()) { + broadcast_dims.clear(); + break; + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!it->second.contains(i)) { + broadcast_dims.erase(i); + } + } + } + if (!broadcast_dims.empty()) { + broadcast_dims_[hlo] = std::move(broadcast_dims); + } + } + if (hlo->opcode() == HloOpcode::kTranspose) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + absl::flat_hash_set xpose_broadcast_dims; + std::vector reverse_map(hlo->shape().rank()); + for (int64 i = 0; i < reverse_map.size(); ++i) { + reverse_map[hlo->dimensions(i)] = i; + } + for (int64 dim : it->second) { + xpose_broadcast_dims.insert(reverse_map[dim]); + } + broadcast_dims_[hlo] = std::move(xpose_broadcast_dims); + } + } + if (hlo->opcode() == HloOpcode::kReshape && + Product(hlo->shape().dimensions()) > 0) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + absl::flat_hash_set reshape_broadcast_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + reshape_broadcast_dims.insert(i); + } + std::vector before_dim_size_stack; + std::vector after_dim_size_stack; + for (int64 i = hlo->operand(0)->shape().rank() - 1; i >= 0; --i) { + before_dim_size_stack.push_back(hlo->operand(0)->shape().dimensions(i)); + } + for (int64 i = hlo->shape().rank() - 1; i >= 0; --i) { + after_dim_size_stack.push_back(hlo->shape().dimensions(i)); + } + while (!before_dim_size_stack.empty() && !after_dim_size_stack.empty()) { + int64 before_size = before_dim_size_stack.back(); + int64 after_size = after_dim_size_stack.back(); + int64 current_before_dim = + hlo->operand(0)->shape().rank() - before_dim_size_stack.size(); + int64 current_after_dim = + hlo->shape().rank() - after_dim_size_stack.size(); + before_dim_size_stack.pop_back(); + after_dim_size_stack.pop_back(); + if (!it->second.contains(current_before_dim)) { + reshape_broadcast_dims.erase(current_after_dim); + } + if (before_size == after_size) { + continue; + } + if (before_size % after_size == 0) { + // Split dim. + before_dim_size_stack.push_back(before_size / after_size); + } else if (after_size % before_size == 0) { + // Merge dim. + after_dim_size_stack.push_back(after_size / before_size); + } else { + // Other cases, mark all remaining dims as non-broadcast. 
+ for (int64 i = current_after_dim; i < hlo->shape().rank(); ++i) { + reshape_broadcast_dims.erase(i); + } + break; + } + } + if (!before_dim_size_stack.empty() || !after_dim_size_stack.empty()) { + reshape_broadcast_dims.clear(); + } + if (!reshape_broadcast_dims.empty()) { + broadcast_dims_[hlo] = std::move(reshape_broadcast_dims); + } + } + } + if (hlo->opcode() == HloOpcode::kSlice || + hlo->opcode() == HloOpcode::kDynamicSlice) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + auto dims = it->second; + broadcast_dims_[hlo] = std::move(dims); + } + } + if (hlo->opcode() == HloOpcode::kPad) { + auto it = broadcast_dims_.find(hlo->operand(0)); + if (it != broadcast_dims_.end()) { + absl::flat_hash_set pad_broadcast_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + const auto& dim = hlo->padding_config().dimensions(i); + if (dim.edge_padding_low() == 0 && dim.edge_padding_high() == 0 && + dim.interior_padding() == 0 && it->second.contains(i)) { + pad_broadcast_dims.insert(i); + } + } + if (!pad_broadcast_dims.empty()) { + broadcast_dims_[hlo] = std::move(pad_broadcast_dims); + } + } + } return hlo; } @@ -1099,23 +1219,25 @@ PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( const HloSharding& target) const { CHECK(CanReshardWithCollectivePermute(sharding(), target)) << sharding().ToString() << " to " << target.ToString(); - if (hlo()->opcode() == HloOpcode::kBroadcast) { - // If hlo() is a broadcast, check if data is already the same between - // source/destination pairs. - std::vector new_dims; - for (int64 i = 0; i < hlo()->shape().rank(); ++i) { - if (!absl::c_linear_search(hlo()->dimensions(), i)) { - new_dims.push_back(i); + if (auto broadcast_dims = state_.b->BroadcastDimsForCreatedHlo(hlo())) { + if (!(*broadcast_dims)->empty()) { + // If hlo() has broadcast dims, check if data is already the same between + // source/destination pairs. + std::vector broadcast_dims_vector; + for (int64 i = 0; i < hlo()->shape().rank(); ++i) { + if ((*broadcast_dims)->contains(i)) { + broadcast_dims_vector.push_back(i); + } + } + if (hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + sharding(), broadcast_dims_vector) == + hlo_sharding_util::PartiallyReplicateTiledShardingOnDims( + target, broadcast_dims_vector)) { + auto copy = state_.b->AddInstruction(HloInstruction::CreateUnary( + hlo()->shape(), HloOpcode::kCopy, hlo())); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); } - } - if (hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(sharding(), - new_dims) == - hlo_sharding_util::PartiallyReplicateTiledShardingOnDims(target, - new_dims)) { - auto copy = state_.b->AddInstruction( - HloInstruction::CreateUnary(hlo()->shape(), HloOpcode::kCopy, hlo())); - copy->set_sharding(target); - return PartitionedHlo(copy, base_shape_, state_); } } std::vector> src_dst_pairs; @@ -1289,7 +1411,7 @@ namespace { // gather/scatter slice size 1. 
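Two of the broadcast-dimension propagation rules above are worth restating in isolation. First, the elementwise rule: a dimension remains a broadcast dimension of the result only if it is a broadcast dimension of every operand, since only then are all operands constant along it. This sketch uses std containers for self-containment; the real code uses absl::flat_hash_set, and an operand with no recorded entry clears the whole set.

#include <cstdint>
#include <unordered_set>
#include <vector>

std::unordered_set<int64_t> ElementwiseBroadcastDims(
    const std::vector<std::unordered_set<int64_t>>& operand_broadcast_dims,
    int64_t rank) {
  std::unordered_set<int64_t> result;
  for (int64_t i = 0; i < rank; ++i) result.insert(i);
  for (const auto& dims : operand_broadcast_dims) {
    for (auto it = result.begin(); it != result.end();) {
      if (dims.count(*it)) {
        ++it;
      } else {
        it = result.erase(it);  // not constant along this dim for this operand
      }
    }
  }
  return result;
}

Second, the reshape rule, which is the subtlest: walk the old and new dimension sizes with two stacks, matching, splitting, or merging dimensions as the sizes divide, and keep a new dimension marked as broadcast only while every old dimension feeding it was. A self-contained sketch of the same walk:

#include <cstdint>
#include <set>
#include <vector>

std::set<int64_t> ReshapeBroadcastDims(const std::vector<int64_t>& before,
                                       const std::vector<int64_t>& after,
                                       const std::set<int64_t>& before_broadcast) {
  std::set<int64_t> result;
  for (int64_t i = 0; i < static_cast<int64_t>(after.size()); ++i) result.insert(i);
  std::vector<int64_t> bstack(before.rbegin(), before.rend());
  std::vector<int64_t> astack(after.rbegin(), after.rend());
  while (!bstack.empty() && !astack.empty()) {
    const int64_t bsize = bstack.back();
    const int64_t asize = astack.back();
    const int64_t bdim = static_cast<int64_t>(before.size() - bstack.size());
    const int64_t adim = static_cast<int64_t>(after.size() - astack.size());
    bstack.pop_back();
    astack.pop_back();
    if (!before_broadcast.count(bdim)) result.erase(adim);
    if (bsize == asize) continue;
    if (bsize % asize == 0) {
      bstack.push_back(bsize / asize);  // old dim split across several new dims
    } else if (asize % bsize == 0) {
      astack.push_back(asize / bsize);  // several old dims merge into a new dim
    } else {
      // Sizes do not divide: give up on all remaining new dims.
      for (int64_t i = adim; i < static_cast<int64_t>(after.size()); ++i)
        result.erase(i);
      break;
    }
  }
  if (!bstack.empty() || !astack.empty()) result.clear();
  return result;
}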
bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( const PartitionedHlo& operand, absl::Span index_map, - absl::Span slice_size, int64 num_partitions) { + absl::Span slice_size) { if (operand.sharding().IsTileMaximal()) { return false; } @@ -1300,7 +1422,7 @@ bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( operand.sharding().tile_assignment().dim(dim); } } - return trivial_slice_dims_partitions == num_partitions; + return trivial_slice_dims_partitions == operand.sharding().NumTiles(); } // Returns the min and max for the indices (replicated) in a scatter/gather @@ -1451,10 +1573,23 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { update_dim_to_index_dim); CHECK(new_updates_sharding.has_value()); updates = updates.Reshard(*new_updates_sharding); + // Update collective_ops_creator and partition_id for partial replicate. + auto collective_ops_creator = collective_ops_creator_; + auto partition_id = partition_id_; + if (indices.sharding().ReplicateOnLastTileDim()) { + auto sharding_grouped = GroupShardingOnDims( + indices.sharding(), + {indices.sharding().tile_assignment().num_dimensions() - 1}); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + indices.state(), sharding_grouped.device_groups, &b_); + collective_ops_creator = + per_group_partitioner_state.collective_ops_creator; + partition_id = per_group_partitioner_state.partition_id; + } // To avoid accumulating the initial operand multiple times during // all-reduce, we use identity operands for all non-zero partitions. auto not_partition_zero = b_.AddInstruction(HloInstruction::CreateConvert( - ShapeUtil::MakeScalarShape(PRED), partition_id_)); + ShapeUtil::MakeScalarShape(PRED), partition_id)); not_partition_zero = b_.AddInstruction(HloInstruction::CreateBroadcast( ShapeUtil::ChangeElementType(identity->shape(), PRED), not_partition_zero, {})); @@ -1465,7 +1600,7 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { auto pscatter = b_.AddInstruction(scatter->CloneWithNewOperands( scatter->shape(), {select_operand, indices.hlo(), updates.hlo()})); auto all_reduce = - collective_ops_creator_.create_cross_partition_all_reduce( + collective_ops_creator.create_cross_partition_all_reduce( &b_, pscatter, scatter->to_apply(), {}, NewChannel()); all_reduce->set_sharding(HloSharding::Replicate()); SetPartitionedHlo(hlo, [&]() { @@ -1495,8 +1630,7 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { return Status::OK(); } if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( - operand, scatter_dims_to_operand_dims, slice_size, - num_partitions_) && + operand, scatter_dims_to_operand_dims, slice_size) && ShapeSizeInBytes(updates.base_shape()) < ShapeSizeInBytes(scatter->shape())) { // Operand is sharded on trivial slice dims (update slice size 1). We can @@ -2371,8 +2505,7 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { return Status::OK(); } if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( - operand, start_index_map, gather->gather_slice_sizes(), - num_partitions_) && + operand, start_index_map, gather->gather_slice_sizes()) && ShapeSizeInBytes(gather->shape()) < ShapeSizeInBytes(gather->operand(0)->shape())) { indices = indices.Reshard(HloSharding::Replicate()); @@ -2434,7 +2567,17 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { pgather->shape(), HloOpcode::kSelect, broadcast_filter, CreateZero(pgather->shape(), &b_), pgather)); // Combine from different partitions. 
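A plain-C++ sketch of the combine step named in the comment above: each partition gathers from its local operand shard, zeroes out rows whose indices it does not own, and a cross-partition all-reduce (sum) assembles the complete result. Ownership of a contiguous row range is an assumed simplification of the real trivial-slice-dim offsets.

#include <cstdint>
#include <vector>

std::vector<std::vector<float>> LocalMaskedGather(
    const std::vector<std::vector<float>>& local_rows,  // this partition's shard
    const std::vector<int64_t>& indices, int64_t row_begin, int64_t row_end) {
  const size_t width = local_rows.empty() ? 0 : local_rows[0].size();
  std::vector<std::vector<float>> out;
  out.reserve(indices.size());
  for (int64_t idx : indices) {
    if (idx >= row_begin && idx < row_end) {
      out.push_back(local_rows[idx - row_begin]);
    } else {
      out.emplace_back(width, 0.0f);  // masked: another partition owns this row
    }
  }
  return out;  // summing element-wise across partitions yields the full gather
}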
- auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + auto collective_ops_creator = collective_ops_creator_; + if (operand.sharding().ReplicateOnLastTileDim()) { + auto sharding_grouped = GroupShardingOnDims( + operand.sharding(), + {operand.sharding().tile_assignment().num_dimensions() - 1}); + auto per_group_partitioner_state = CreatePerGroupPartitioningState( + operand.state(), sharding_grouped.device_groups, &b_); + collective_ops_creator = + per_group_partitioner_state.collective_ops_creator; + } + auto ar = collective_ops_creator.create_cross_partition_all_reduce( &b_, filtered, MakeBinaryAdd(filtered->shape().element_type(), module_), {}, NewChannel()); @@ -2874,18 +3017,37 @@ Status SpmdPartitioningVisitor::HandleRng(HloInstruction* hlo) { } TF_RET_CHECK(!hlo->sharding().IsTileMaximal()); - SetPartitionedHlo(hlo, [&] { - // Replicate the operands and run partitioned Rng on all devices. - std::vector new_operands; - for (int64 i = 0; i < hlo->operand_count(); ++i) { - new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) - .Reshard(HloSharding::Replicate()) - .hlo()); - } - return b_.AddInstruction(HloInstruction::CreateRng( + // Replicate the operands and run partitioned Rng on all devices. + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::Replicate()) + .hlo()); + } + + if (!hlo->sharding().ReplicateOnLastTileDim()) { + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateRng( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + hlo->random_distribution(), new_operands)); + }); + } else { + std::vector group_dims( + hlo->sharding().tile_assignment().num_dimensions() - 1); + std::iota(group_dims.begin(), group_dims.end(), 0); + auto sharding_grouped = GroupShardingOnDims(hlo->sharding(), group_dims); + auto per_group_state = CreatePerGroupPartitioningState( + MakePartitioningState(), sharding_grouped.device_groups, &b_); + auto rng = b_.AddInstruction(HloInstruction::CreateRng( MakePartitionedShape(hlo->shape(), hlo->sharding()), hlo->random_distribution(), new_operands)); - }); + rng->set_sharding(HloSharding::AssignDevice(0)); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(rng, rng->shape(), per_group_state) + .Replicate() + .hlo(); + }); + } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index 6447d08be41..86c1a97b0d2 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -74,6 +75,16 @@ class SpmdBuilder : public HloComputation::Builder { HloInstruction* visiting_hlo() const { return visiting_hlo_; } + // Wrapper of queries to broadcast_dims_. + absl::optional*> BroadcastDimsForCreatedHlo( + const HloInstruction* hlo) { + auto it = broadcast_dims_.find(hlo); + if (it == broadcast_dims_.end()) { + return absl::nullopt; + } + return &it->second; + } + private: // Currently visiting instruction. 
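The HandleRng branch above makes a partially replicated Rng consistent within each replication group by generating on one partition per group and replicating the result, the lowering exercised by the PartialReplicatedRng test later in this diff. A standalone sketch of that pattern, with the in-group all-reduce passed in as an assumed helper:

#include <cstdint>
#include <functional>
#include <vector>

// Only the first partition of each replication group contributes its random
// values; an in-group all-reduce (sum with zeros elsewhere) then gives every
// member of the group the same result.
std::vector<float> ReplicateRngWithinGroup(
    const std::vector<float>& local_rng, int64_t partition_id_in_group,
    const std::function<std::vector<float>(const std::vector<float>&)>&
        all_reduce_in_group) {
  std::vector<float> contribution(local_rng.size(), 0.0f);
  if (partition_id_in_group == 0) contribution = local_rng;
  return all_reduce_in_group(contribution);
}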
HloInstruction* visiting_hlo_; @@ -81,6 +92,12 @@ class SpmdBuilder : public HloComputation::Builder { // Map from the currently visiting (old) instruction to new instructions // created during SPMD partitioning. HloInstructionMap> instructions_; + + // Maps from each created instruction to a set of dimensions that are from + // broadcasts or elementwise ops over broadcasts. This means elements along + // these dimensions have the same value. + absl::flat_hash_map> + broadcast_dims_; }; // A set of functions that create the cross-partition collective ops. @@ -330,27 +347,11 @@ class PartitionedHlo { PartitioningState state_; }; -struct DotGeneralDimsMapping { +struct DotConvDimsMapping { // The dimension numbers for the operands and output corresponding to a // logical dimension (e.g., batch, contracting, non-contracting). If an // operand or the output doesn't have the logical dimension, it is set to // -1. - struct DimsMapping { - int64 lhs; - int64 rhs; - int64 output; - }; - std::vector batch_dims; - std::vector contracting_dims; - std::vector lhs_non_contracting_dims; - std::vector rhs_non_contracting_dims; -}; - -struct ConvolutionDimsMapping { - // The dimension numbers for the operands and output corresponding to a - // logical dimension (e.g., batch, parallel, non-parallel). If an - // operand or the output doesn't have the logical dimension, it is set to - // -1. struct DimsMapping { int64 lhs; int64 rhs; @@ -358,8 +359,11 @@ struct ConvolutionDimsMapping { // input mapped to index in input_spatial_dimensions(). int64 spatial; }; - std::vector parallel_spatial_dims; - std::vector non_parallel_spatial_dims; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; + std::vector conv_spatial_dims; }; class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { @@ -403,10 +407,11 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { Status HandlePartitionId(HloInstruction* hlo) override; // Implementation of dot partitioning given DotGeneralDimsMapping. - Status HandleDotHelper( - HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, - const std::function( - HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot); + Status HandleDotHelper(HloInstruction* hlo, + const DotConvDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*, + const Window& conv_window)>& create_sharded_dot); // Common handle for elementwise HLOs. 
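A self-contained sketch of the bookkeeping the new SpmdBuilder members implement: a map from created instructions to their broadcast dimensions, with an optional accessor so callers can distinguish "not tracked" from "tracked but empty". The types here are standard-library stand-ins for the absl containers and HloInstruction pointers used in the header.

#include <cstdint>
#include <optional>
#include <unordered_map>
#include <unordered_set>
#include <utility>

class BroadcastDimTracker {
 public:
  void Record(const void* instruction, std::unordered_set<int64_t> dims) {
    broadcast_dims_[instruction] = std::move(dims);
  }

  // Mirrors BroadcastDimsForCreatedHlo: nullopt when nothing was recorded.
  std::optional<const std::unordered_set<int64_t>*> DimsFor(
      const void* instruction) const {
    auto it = broadcast_dims_.find(instruction);
    if (it == broadcast_dims_.end()) return std::nullopt;
    return &it->second;
  }

 private:
  std::unordered_map<const void*, std::unordered_set<int64_t>> broadcast_dims_;
};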
Status HandleElementwise(HloInstruction* hlo); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 089c4c339a4..a4dd0e5441b 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -2003,6 +2003,36 @@ ENTRY entry { EXPECT_THAT(root, op::DynamicSlice(pad, _)); } +TEST_F(SpmdPartitioningTest, PartialReplicatePad) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[11,7] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %param1 = f32[] parameter(1), sharding={replicated} + ROOT %pad = f32[27,22] pad(%param0, %param1), padding=2_4_1x2_1_2, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto param0 = AllOf(op::Parameter(), op::Shape("f32[11,4]")); + auto after_halo_exchange = + AllOf(op::Shape("f32[11,4]"), + op::DynamicSlice( + AllOf(op::Shape("f32[11,5]"), + op::Concatenate(op::CollectivePermute(op::Slice(param0)), + param0)), + op::Constant(), _)); + auto pad = op::Pad(after_halo_exchange, op::Parameter(1)); + EXPECT_THAT(root, AllOf(op::DynamicSlice(pad, op::Constant(), _), + op::Shape("f32[27,11]"))); +} + TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module @@ -2060,6 +2090,61 @@ ENTRY entry { op::Shape("f32[63,14,126]"))); } +TEST_F(SpmdPartitioningTest, + PartialReplicateSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0), sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %slice = f32[128,11,257] slice(%param0), + slice={[0:128:1], [2:13:1], [0:257:1]}, sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Slice(param0), op::Shape("f32[128,11,129]"))); +} + +TEST_F(SpmdPartitioningTest, PartialReplicateSliceAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0), sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %slice = f32[63,14,251] slice(%param0), + slice={[2:128:2], [0:14:1], [5:256:1]}, sharding={devices=[1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[128,14,129]")); + EXPECT_THAT( + root, + AllOf( + op::Slice(AllOf( + op::DynamicSlice( + AllOf(op::Concatenate( + param0, + AllOf(op::CollectivePermute(op::Slice(param0)), + op::Shape("f32[128,14,2]"))), + op::Shape("f32[128,14,131]")), + op::Constant(), op::Constant(), + op::Add(op::Multiply(op::Reshape(op::DynamicSlice( + op::Constant(), op::PartitionId())), + op::Constant()), + op::Constant())), + op::Shape("f32[128,14,126]"))), + 
op::Shape("f32[63,14,126]"))); +} + TEST_F(SpmdPartitioningTest, SortAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module @@ -3293,6 +3378,30 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[24,19648]"))); } +TEST_F(SpmdPartitioningTest, DotPartialDeviceOrder) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,256,4096] parameter(0), sharding={devices=[1,1,2,2]1,3,0,2 last_tile_dim_replicate} + %rhs = f32[4096,2048] parameter(1), sharding={devices=[2,2]3,1,2,0} + ROOT %dot = f32[16,256,2048] dot(%lhs, %rhs), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={2}, rhs_contracting_dims={0}, + sharding={devices=[1,1,2,2]2,3,0,1 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Parameter(0), op::Shape("f32[16,256,2048]")); + auto rhs = AllOf(op::Parameter(1), op::Shape("f32[2048,1024]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Dot(lhs, rhs)), + op::Shape("f32[16,256,1024]"))); +} + TEST_F(SpmdPartitioningTest, EinsumBatchPartitioned) { const char* const hlo_string = R"( HloModule module @@ -3843,6 +3952,35 @@ ENTRY entry { op::Shape("s32[2]"))); } +TEST_F(SpmdPartitioningTest, PartialReplicatedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0), sharding={replicated} + %rhs = s32[] parameter(1), sharding={replicated} + ROOT %rng = s32[8]{0} rng(%lhs, %rhs), + distribution=rng_uniform, + sharding={devices=[2,4]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Parameter(0), op::Shape("s32[]")); + auto rhs = AllOf(op::Parameter(1), op::Shape("s32[]")); + auto partition_id = + AllOf(op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())), + op::Shape("u32[]")); + EXPECT_THAT( + root, AllOf(op::AllReduce(op::Select( + op::Broadcast(op::Compare(partition_id, op::Constant())), + op::Rng(lhs, rhs), op::Broadcast(op::Constant()))), + op::Shape("s32[4]"))); +} + TEST_F(SpmdPartitioningTest, DynamicSliceAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module @@ -3920,6 +4058,26 @@ ENTRY entry { op::Shape("f32[3,5]"))); } +TEST_F(SpmdPartitioningTest, PassthroughGather_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[3,5]"))); +} + TEST_F(SpmdPartitioningTest, IndexPassthroughGather) { const char* const hlo_string = R"( HloModule module @@ -3939,6 +4097,27 @@ ENTRY entry { op::Shape("f32[8,2,2]"))); } 
+TEST_F(SpmdPartitioningTest, IndexPassthroughGather_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %gather = f32[8,4,4] gather(%input, %indices), offset_dims={0}, + collapsed_slice_dims={0,1}, start_index_map={0,1}, index_vector_dim=1, + slice_sizes={1,1,8}, + sharding={devices=[1,2,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[8,2,2]"))); +} + TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { const char* const hlo_string = R"( HloModule module @@ -3968,6 +4147,37 @@ ENTRY entry { EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); } +TEST_F(SpmdPartitioningTest, + GatherPartitionedOnTrivialSliceDims_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[17,9] parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[2,3] parameter(1), sharding={replicated} + ROOT %gather = f32[2,3,9] gather(%input, %indices), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, + slice_sizes={1,9}, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); + auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); + auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), + op::Shape("s32[2,3]")); + auto clamp = op::Clamp(min, op::Parameter(1), max); + auto gather = op::Gather(op::Parameter(0), op::Subtract(clamp, min)); + auto mask = + op::Or(op::Lt(op::Parameter(1), min), op::Gt(op::Parameter(1), max)); + auto masked = + op::Select(op::Broadcast(mask), op::Broadcast(op::Constant()), gather); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); +} + TEST_F(SpmdPartitioningTest, PassthroughScatter) { const char* const hlo_string = R"( HloModule module @@ -3998,6 +4208,39 @@ ENTRY entry { op::Shape("f32[2,5]"))); } +TEST_F(SpmdPartitioningTest, PassthroughScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] parameter(0), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, + sharding={devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = 
module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Scatter(op::Parameter(0), op::Parameter(1), + op::Parameter(2)), + op::Shape("f32[2,5]"))); +} + TEST_F(SpmdPartitioningTest, IndexPassthroughScatter) { const char* const hlo_string = R"( HloModule module @@ -4032,6 +4275,42 @@ ENTRY entry { op::Shape("f32[2,9,8]"))); } +TEST_F(SpmdPartitioningTest, IndexPassthroughScatter_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9,8] parameter(0), sharding={replicated} + %indices = s32[4,2,4] parameter(1), + sharding={devices=[2,1,2,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + %updates = f32[4,4,8] parameter(2), + sharding={devices=[2,2,1,2]0,1,2,3,4,5,6,7 last_tile_dim_replicate} + ROOT %scatter = f32[2,9,8] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0,1}, + scatter_dims_to_operand_dims={0,1}, + index_vector_dim=1, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Scatter( + op::Select(op::Broadcast(op::Convert(op::Reshape())), + op::Broadcast(op::Constant()), op::Parameter(0)), + op::Parameter(1), op::Parameter(2))), + op::Shape("f32[2,9,8]"))); +} + TEST_F(SpmdPartitioningTest, IndexPassthroughScatter_Min) { const char* const hlo_string = R"( HloModule module @@ -4100,6 +4379,43 @@ ENTRY entry { op::Shape("f32[9,9]"))); } +TEST_F(SpmdPartitioningTest, + ScatterPartitionedOnTrivialSliceDims_PartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[17,9] parameter(0), + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} + %indices = s32[2,3] parameter(1), sharding={replicated} + %updates = f32[2,3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[17,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2, + sharding={devices=[2,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto offset = + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId())); + auto indices = op::Subtract( + op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Scatter(op::Parameter(0), indices, op::Parameter(2)), + op::Shape("f32[9,9]"))); +} + TEST_F(SpmdPartitioningTest, TiledReversePassthrough) { const char* const hlo_string = R"( HloModule module @@ -5091,6 +5407,733 @@ ENTRY entry { EXPECT_THAT(root, partially_replicated); } +TEST_F(SpmdPartitioningTest, PartitionConvWithBathGroupCount) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), 
+ sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithBathGroupCountRHSAlignWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto resharded_rhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(rhs))))), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(lhs, resharded_rhs), + op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithBathGroupCountLHSAlignWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, 
AllOf(op::Convolution(resharded_lhs, rhs), + op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithBathGroupCountOutputAlignWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto conv = AllOf(op::Convolution(lhs, rhs), op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::Reshape(op::Transpose(op::AllToAll( + op::Reshape(op::Pad(conv, op::Constant()))))), + op::Shape("f32[3,1,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithBathGroupCountOutputAlignWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + auto conv = + AllOf(op::Convolution(resharded_lhs, rhs), op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::Reshape(op::Transpose(op::AllToAll( + op::Reshape(op::Pad(conv, op::Constant()))))), + op::Shape("f32[3,1,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithFeatureGroupCount) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + 
TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountRHSAlignWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Reshape(), + op::Constant(), op::Constant(), op::Constant())), + op::Shape("f32[3,1,1,1024]")); + auto resharded_rhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(rhs))))), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(lhs, resharded_rhs), + op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountLHSAlignWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(resharded_lhs, rhs), + op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountAlignOuputWithLHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = 
f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + auto conv = AllOf(op::Convolution(lhs, rhs), op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, + AllOf(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(conv)))), + op::Shape("f32[8,801,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvGroupOnFeatureGroupCount_RHSPartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2,2]0,2,1,3 last_tile_dim_replicate} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,2,1,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Reshape())), + op::Shape("f32[16,401,1,512]")); + auto left_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(lhs))); + auto right_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(lhs))); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(_, op::Concatenate(left_halo, lhs, right_halo), _), + rhs), + op::Shape("f32[16, 401, 1, 512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvGroupOnFeatureGroupCount_RHSAlignWithOutput) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[5,1,1,1024] parameter(1), sharding={replicated} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,2,1,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + 
op::Reshape(), op::Constant(), op::Reshape())), + op::Shape("f32[16,401,1,512]")); + auto left_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(lhs))); + auto right_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(lhs))); + auto rhs = + AllOf(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape()), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(_, op::Concatenate(left_halo, lhs, right_halo), _), + rhs), + op::Shape("f32[16, 401, 1, 512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvGroupOnFeatureGroupCount_LHSAlignWithOutput) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[2,1,1,1,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2,2]0,2,1,3 last_tile_dim_replicate} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[1,2,1,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[8,801,1,1024]")); + auto resharded_lhs = + AllOf(op::Reshape(op::Transpose(op::AllToAll(op::Reshape( + op::Pad(op::DynamicSlice(lhs, op::Constant(), op::Constant(), + op::Constant(), op::Reshape()), + op::Constant()))))), + op::Shape("f32[16,401,1,512]")); + auto left_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(resharded_lhs))); + auto right_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(resharded_lhs))); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT( + root, + AllOf( + op::Convolution( + op::Select( + _, op::Concatenate(left_halo, resharded_lhs, right_halo), _), + rhs), + op::Shape("f32[16, 401, 1, 512]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvGroupOnBatchGroupCount) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[16,801,1,1024] parameter(1) + %rhs.copy = f32[16,801,1,1024] copy(%rhs), + sharding={devices=[1,2,1,2]0,1,2,3} + ROOT %conv = f32[5,1,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=f01b_i01o->01bf,batch_group_count=1024, + window={size=801x1 pad=2_2x0_0}, + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Select(_, + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Reshape())), + _), + op::Shape("f32[16,401,1,512]")); + auto left_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(lhs))); + auto 
right_halo = AllOf(op::Shape("f32[16,2, 1, 512]"), + op::CollectivePermute(op::Slice(lhs))); + auto rhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Reshape())), + op::Shape("f32[16,401,1,512]")); + auto conv = AllOf(op::Convolution(op::Concatenate(left_halo, lhs, right_halo), + op::Select(_, rhs, _)), + op::Shape("f32[5,1,1,512]")); + EXPECT_THAT(root, AllOf(op::CollectivePermute(op::AllReduce(conv)), + op::Shape("f32[5,1,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, + PartitionConvWithFeatureGroupCountAlignOuputWithRHS) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[5,1,1,1024] parameter(1) + %rhs.copy = f32[5,1,1,1024] copy(%rhs), + sharding={devices=[1,1,1,2]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01io->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0}, + sharding={devices=[2,1,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[16,401,1,1024]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[5,1,1,512]")); + auto resharded_lhs = AllOf( + op::Slice(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs))))), + op::Shape("f32[16,801,1,512]")); + auto conv = AllOf(op::Convolution(resharded_lhs, rhs), + op::Shape("f32[16,801,1,512]")); + EXPECT_THAT(root, + AllOf(op::Reshape(op::Transpose(op::AllToAll(op::Reshape(conv)))), + op::Shape("f32[8,801,1,1024]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionConvWithFeatureGroupCountBackProp) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[16,801,1,1024] parameter(0) + %lhs.copy = f32[16,801,1,1024] copy(%lhs), + sharding={devices=[1,1,1,2]0,1} + %rhs = f32[5,1,1024,1] parameter(1) + %rhs.copy = f32[5,1,1024,1] copy(%rhs), + sharding={devices=[1,1,2,1]0,1} + ROOT %conv = f32[16,801,1,1024] convolution(%lhs.copy, %rhs.copy), + dim_labels=b01f_01oi->b01f,feature_group_count=1024, + window={size=5x1 pad=2_2x0_0 rhs_reversal=1x1}, + sharding={devices=[1,1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[16,801,1,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[5,1,512,1]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[16,801,1,512]"))); +} + +TEST_F(SpmdPartitioningTest, NoReshardOnBroadcastDims) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[2,3] parameter(0) + %param1 = f32[2,3,20] parameter(1) + %br0 = f32[20,2,20,3,20] broadcast(%param0), dimensions={1,3}, 
sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %br1 = f32[20,2,20,3,20] broadcast(%param1), dimensions={1,3,4}, sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %add = f32[20,2,20,3,20] add(%br0, %br1), sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %reshape = f32[10,4,10,6,20] reshape(%br0), sharding={devices=[2,1,2,1,2]0,1,2,3,4,5,6,7} + %transpose = f32[2,3,20,20,20] transpose(%br0), dimensions={1,3,0,2,4}, sharding={devices=[1,1,2,2,2]0,1,2,3,4,5,6,7} + %copy_add0 = f32[20,2,20,3,20] copy(%add), sharding={devices=[2,1,2,1,2]6,7,2,3,4,5,0,1} + %copy_add1 = f32[20,2,20,3,20] copy(%add), sharding={devices=[2,1,2,1,2]7,6,3,2,5,4,0,1} + %copy_reshape = f32[10,4,10,6,20] copy(%reshape), sharding={devices=[2,1,2,1,2]7,6,3,2,5,4,0,1} + %copy_transpose = f32[2,3,20,20,20] copy(%transpose), sharding={devices=[1,1,2,2,2]7,6,3,2,5,4,0,1} + ROOT %tuple = (f32[20,2,20,3,20], f32[20,2,20,3,20], f32[10,4,10,6,20], f32[2,3,20,20,20]) + tuple(%copy_add0, %copy_add1, %copy_reshape, %copy_transpose), + sharding={{devices=[2,1,2,1,2]6,7,2,3,4,5,0,1},{devices=[2,1,2,1,2]7,6,3,2,5,4,0,1},{devices=[2,1,2,1,2]7,6,3,2,5,4,0,1},{devices=[1,1,2,2,2]7,6,3,2,5,4,0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + // Reshard on copy_add0 only happens on broadcast dims, can be skipped. + auto copy_add0 = + op::Copy(op::Copy(op::Add(op::Broadcast(_), op::Broadcast(_)))); + // Reshard on copy_add1 also happens on non-broadcast dims. + auto copy_add1 = op::Copy( + op::CollectivePermute(op::Add(op::Broadcast(_), op::Broadcast(_)))); + // Reshard on copy_reshape only happens on broadcast dims, can be skipped. + auto copy_reshape = op::Copy(op::Copy(op::Reshape(op::Broadcast(_)))); + // Reshard on copy_transpose only happens on broadcast dims, can be skipped. 
+ auto copy_transpose = op::Copy(op::Copy(op::Transpose(op::Broadcast(_)))); + EXPECT_THAT(root, + op::Tuple(copy_add0, copy_add1, copy_reshape, copy_transpose)); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionFilterIFOFPartitionedInputPartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,112,112,12] parameter(0) + %lhs.copy = f32[128,112,112,12] copy(f32[128,112,112,12] %lhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[7,7,12,64] parameter(1) + %rhs.copy = f32[7,7,12,64] copy(f32[7,7,12,64] %rhs), + sharding={devices=[1,1,2,2]0,1,2,3} + ROOT %conv = f32[128,56,56,64] convolution( + f32[128,112,112,12] %lhs.copy, + f32[7,7,12,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[128,112,112,6]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Reshape(), op::Reshape())), + op::Shape("f32[7,7,6,32]")); + + EXPECT_THAT( + root, + AllOf(op::CollectivePermute(op::AllReduce(op::Convolution(lhs, rhs))), + op::Shape("f32[128,56,56,32]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionInputKernelNonContractingDimPartialReplicate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), + sharding={devices=[1,1,1,2,2]0,1,2,3 last_tile_dim_replicate} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[128,56,56,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[128,28,28,256]")); + + EXPECT_THAT(root, AllOf(op::Convolution(lhs, op::CollectivePermute(rhs)), + op::Shape("f32[1,1,128,256]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionInputSpatialDimAndFeatureDimParttiioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[8,210,210,12] parameter(0) + %lhs.copy = f32[8,210,210,12] copy(f32[8,210,210,12] %lhs), + sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[3,3,12,32] parameter(1) + %rhs.copy = f32[3,3,12,32] copy(f32[3,3,12,32] %rhs), + sharding={devices=[1,1,2,1,2]0,1,2,3 last_tile_dim_replicate} + ROOT %conv = f32[8,210,210,32] convolution( + f32[8,210,210,12] %lhs.copy, + f32[3,3,12,32] %rhs.copy), + window={size=3x3 pad=1_1x1_1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1,2]0,1,2,3 last_tile_dim_replicate} +})"; + 
TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[8,105,210,6]")); + auto left_halo = + AllOf(op::CollectivePermute(op::Slice(lhs)), op::Shape("f32[8,1,210,6]")); + auto right_halo = + AllOf(op::CollectivePermute(op::Slice(lhs)), op::Shape("f32[8,1,210,6]")); + auto exchanged_lhs = AllOf( + op::Select(op::And(_, _), op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast(_)), + op::Shape("f32[8,107,210,6]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[3,3,6,32]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + exchanged_lhs, op::CollectivePermute(rhs))), + op::Shape("f32[8,105,210,32]"))); +} + } // namespace } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 0edbd4f2b8d..f3f3a95ea0a 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -576,8 +576,8 @@ absl::optional PadFromPartialReplicateShape( int64 max_right_halo_size = right_halo_size_function.MaxInRange(0, src_shard_count - 1); pad_config.mutable_dimensions(dim)->set_edge_padding_high(std::max( - 0LL, padded_dst_shape.dimensions(dim) - - padded_src_shape.dimensions(dim) - max_right_halo_size)); + int64{0}, padded_dst_shape.dimensions(dim) - + padded_src_shape.dimensions(dim) - max_right_halo_size)); auto padded_concat_shape = ShapeInference::InferPadShape( concat->shape(), zero->shape(), pad_config) .ValueOrDie(); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index f6f15481b55..4fc193d9622 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -362,8 +362,8 @@ absl::optional PadFromPartialReplicateShape( // dimensions by dynamic slice. // For example, if partial_sharding is // {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate} -// Target tile dims is {2, 2}, the returned compatible sharding will be -// sharding={devices=[1,2,2]0,2,1,3 last_tile_dim_replicate}. +// Target sharding is {devices=[2,2]0,1,2,3}, the returned compatible sharding +// will be sharding={devices=[2,2]0,2,1,3}. // If patial replicate sharding is not partial replicate or can't reshard to // target_tile_dims by dynamic slice, return absl::nullopt. // If target_sharding is already compatible, returns it. 
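Editor's illustrative note on the spmd_partitioner_util.h comment change above (not part of the patch): the sketch below is a minimal, standalone C++ check of why the compatible sharding for the documented example is {devices=[2,2]0,2,1,3} rather than the target's own device order {devices=[2,2]0,1,2,3}. The reordered device list lets every device keep the dim-1 shard it already holds under the partial sharding, so resharding only needs a local dynamic-slice along dim 0. All names in the sketch are hypothetical.

#include <array>
#include <cassert>

int main() {
  // Partial sharding {devices=[1,2,2]0,1,2,3 last_tile_dim_replicate}:
  // data dim 1 is split in two and the last (size-2) tile dim is replication.
  // Reading the row-major tile assignment gives each device's dim-1 shard.
  std::array<int, 4> dim1_shard_before = {/*dev0=*/0, /*dev1=*/0,
                                          /*dev2=*/1, /*dev3=*/1};

  // Compatible sharding {devices=[2,2]0,2,1,3}: a (2,2) tile assignment
  // stored row-major as [0,2,1,3]; the column index is the dim-1 shard.
  const std::array<int, 4> compatible = {0, 2, 1, 3};
  std::array<int, 4> dim1_shard_after = {0, 0, 0, 0};
  for (int pos = 0; pos < 4; ++pos) {
    dim1_shard_after[compatible[pos]] = pos % 2;
  }

  // Every device keeps the dim-1 shard it already owns, so only a local
  // dynamic-slice on dim 0 is needed. With the target's own order [0,1,2,3],
  // devices 1 and 2 would have to exchange dim-1 shards across devices.
  for (int device = 0; device < 4; ++device) {
    assert(dim1_shard_after[device] == dim1_shard_before[device]);
  }
  return 0;
}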
diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 0fd64209152..913bfed926a 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -169,8 +169,7 @@ Status TransferManager::TransferArrayToDeviceAsync( "%d < %d", dest.size(), GetByteSizeRequirement(on_device_shape)); } - ShapedBuffer shaped_buffer(/*on_host_shape=*/literal.shape(), on_device_shape, - stream->parent()->platform(), + ShapedBuffer shaped_buffer(on_device_shape, stream->parent()->platform(), stream->parent()->device_ordinal()); shaped_buffer.set_buffer(dest, /*index=*/{}); return TransferLiteralToDevice(stream, literal, shaped_buffer, @@ -194,8 +193,7 @@ void TransferManager::TransferArrayFromDevice( "%d < %d", source.size(), GetByteSizeRequirement(shape))); } - ShapedBuffer shaped_buffer(/*on_host_shape=*/shape, shape, - stream->parent()->platform(), + ShapedBuffer shaped_buffer(shape, stream->parent()->platform(), stream->parent()->device_ordinal()); shaped_buffer.set_buffer(source, /*index=*/{}); return TransferLiteralFromDevice(stream, shaped_buffer, literal, @@ -406,8 +404,8 @@ StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer( Shape on_device_shape = HostShapeToDeviceShape(on_host_shape); TF_RET_CHECK(LayoutUtil::HasLayout(on_device_shape)); - ScopedShapedBuffer shaped_buffer(on_host_shape, std::move(on_device_shape), - allocator, device_ordinal); + ScopedShapedBuffer shaped_buffer(std::move(on_device_shape), allocator, + device_ordinal); // Allocate an appropriate sized buffer for each element in the shape // including the tuple pointer arrays. diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index c0670d26eee..c49d7d899e7 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -51,7 +51,11 @@ class TransferManager { // pre-allocated by the host, e.g. TransferLiteralToDevice, without the user // needing to consider device-specific behaviors. virtual Shape HostShapeToDeviceShape(const Shape& host_shape) const { - return host_shape; + // Strips off any preexisting tiling or memory space information. + // TODO(phawkins): fix clients not to include tiling or memory space + // information in shapes passed to this function and turn this into an + // assertion. + return ShapeUtil::DeviceShapeToHostShape(host_shape); } // Base class for specifying platform specific transfer metadata that can be @@ -189,6 +193,7 @@ class TransferManager { // shapes, and returns static shapes with dynamic shapes updated. // The shape of the buffer also has to be compatible with the host shape and // device shape. + // TODO(b/170310047): remove host_shape.
virtual Status ReadDynamicShapes(se::Stream* stream, ShapedBuffer* device_buffer, Shape* host_shape, Shape* device_shape); diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc index c9799453939..614dfc4ffe6 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.cc +++ b/tensorflow/compiler/xla/service/transpose_folding.cc @@ -35,17 +35,46 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot( const HloInstruction& dot, const TransposeFolding::TransposableGemmOperandsFn& transposable_gemm_operands) { - if (HloOpcode::kDot != dot.opcode() || - dot.dot_dimension_numbers().lhs_batch_dimensions_size() != 0) { + if (HloOpcode::kDot != dot.opcode()) { return {}; } + if (!absl::c_equal(dot.dot_dimension_numbers().lhs_batch_dimensions(), + dot.dot_dimension_numbers().rhs_batch_dimensions())) { + return {}; + } + + int64 num_batch_dims = + dot.dot_dimension_numbers().lhs_batch_dimensions_size(); + int64 expected_rank = 2 + num_batch_dims; + auto is_r2_transpose = [&](const HloInstruction& transpose) { + if (transpose.opcode() != HloOpcode::kTranspose) { + return false; + } + const auto& transpose_dims = transpose.dimensions(); + if (transpose_dims.size() != expected_rank) { + return false; + } + + // Check that the transpose doesn't touch any batch dimensions, but does + // transpose the non-batch ones. + for (int64 i = 0; i != expected_rank; ++i) { + bool is_batch = absl::c_linear_search( + dot.dot_dimension_numbers().lhs_batch_dimensions(), + transpose_dims[i]); + if ((transpose_dims[i] == i) != is_batch) { + return false; + } + } + return true; + }; + TransposeFolding::OperandIndices operand_set; for (int64 i = 0; i < dot.operand_count(); ++i) { auto& operand = *dot.operand(i); - if (operand.IsRank2Transpose()) { + if (is_r2_transpose(operand)) { operand_set.push_back(i); - } else if (operand.shape().rank() != 2) { + } else if (operand.shape().rank() != expected_rank) { return {}; } } @@ -84,25 +113,25 @@ Status FoldTransposeIntoDot(InstructionOperandsPair pair) { HloInstruction* new_lhs = dot->mutable_operand(0); HloInstruction* new_rhs = dot->mutable_operand(1); - CHECK_EQ(new_dim_numbers.lhs_batch_dimensions_size(), 0); - CHECK_EQ(new_dim_numbers.rhs_batch_dimensions_size(), 0); CHECK_EQ(new_dim_numbers.lhs_contracting_dimensions_size(), 1); CHECK_EQ(new_dim_numbers.rhs_contracting_dimensions_size(), 1); for (int64 operand_index : pair.second) { - // We've checked that there aren't any batch dimensions and that the inputs - // are rank 2, and shape inference guarantees that there is exactly one - // contracting dimension. + // We checked that the batch dimensions are not touched by the transpose, + // and shape inference guarantees that there is exactly one contracting + // dimension. 
if (operand_index == 0) { CHECK_EQ(new_lhs->opcode(), HloOpcode::kTranspose); new_dim_numbers.set_lhs_contracting_dimensions( - 0, 1 - new_dim_numbers.lhs_contracting_dimensions(0)); + 0, + new_lhs->dimensions(new_dim_numbers.lhs_contracting_dimensions(0))); new_lhs = new_lhs->mutable_operand(0); } else { CHECK_EQ(operand_index, 1); CHECK_EQ(new_rhs->opcode(), HloOpcode::kTranspose); new_dim_numbers.set_rhs_contracting_dimensions( - 0, 1 - new_dim_numbers.rhs_contracting_dimensions(0)); + 0, + new_rhs->dimensions(new_dim_numbers.rhs_contracting_dimensions(0))); new_rhs = new_rhs->mutable_operand(0); } } diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc index 8a2112c87dc..3fe69d22e9c 100644 --- a/tensorflow/compiler/xla/service/transpose_folding_test.cc +++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc @@ -42,7 +42,7 @@ namespace { class TransposeFoldingTest : public HloTestBase { protected: - void FoldTranspose(HloModule* module) { + bool FoldTranspose(HloModule* module) { TransposeFolding transpose_folding( [](const HloInstruction& dot, const TransposeFolding::OperandIndices& candidate_operands) { @@ -52,7 +52,9 @@ class TransposeFoldingTest : public HloTestBase { const TransposeFolding::OperandIndices& candidate_operands) { return candidate_operands; }); - EXPECT_IS_OK(transpose_folding.Run(module).status()); + auto folded = transpose_folding.Run(module); + EXPECT_IS_OK(folded.status()); + return *folded; } }; @@ -465,5 +467,81 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) { new_conv->convolution_dimension_numbers().output_spatial_dimensions(1)); } +TEST_F(TransposeFoldingTest, FoldBatchDotTranspose) { + string hlo_string = R"( +HloModule FoldBatchDotTranspose + +ENTRY entry_computation { + x = f32[7,7,2,3]{3,2,1,0} parameter(0) + y = f32[7,7,2,3]{3,2,1,0} parameter(1) + transpose = f32[7,7,3,2]{3,2,1,0} transpose(y), dimensions={0,1,3,2} + ROOT dot = f32[7,7,2,2]{3,2,1,0} dot(x, transpose), lhs_contracting_dims={3}, + rhs_contracting_dims={2}, lhs_batch_dims={0,1}, rhs_batch_dims={0,1} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + ASSERT_TRUE(FoldTranspose(module.get())); + + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Dot(op::Parameter(0), op::Parameter(1), + /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/3)); +} + +TEST_F(TransposeFoldingTest, NoFoldBatchDotTransposeBatch) { + string hlo_string = R"( +HloModule NoFoldBatchDotTransposeBatch + +ENTRY entry_computation { + x = f32[7,7,2,3]{3,2,1,0} parameter(0) + y = f32[7,7,2,3]{3,2,1,0} parameter(1) + transpose = f32[7,7,3,2]{3,2,1,0} transpose(y), dimensions={1,0,3,2} + ROOT dot = f32[7,7,2,2]{3,2,1,0} dot(x, transpose), lhs_contracting_dims={3}, + rhs_contracting_dims={2}, lhs_batch_dims={0,1}, rhs_batch_dims={0,1} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + EXPECT_FALSE(FoldTranspose(module.get())); +} + +TEST_F(TransposeFoldingTest, FoldBatchDotTransposeNonContiguousBatch) { + string hlo_string = R"( +HloModule FoldBatchDotTransposeNonContiguousBatch + +ENTRY entry_computation { + x = f32[7,2,7,3]{3,2,1,0} parameter(0) + y = f32[7,2,7,3]{3,2,1,0} parameter(1) + transpose = f32[7,3,7,2]{3,2,1,0} transpose(y), dimensions={0,3,2,1} + ROOT dot = f32[7,7,2,2]{3,2,1,0} dot(x, transpose), lhs_contracting_dims={3}, + rhs_contracting_dims={1}, lhs_batch_dims={0,2}, rhs_batch_dims={0,2} +} +)"; + 
TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + ASSERT_TRUE(FoldTranspose(module.get())); + + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Dot(op::Parameter(0), op::Parameter(1), + /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/3)); +} + +TEST_F(TransposeFoldingTest, NoFoldBatchDotTransposeIdentity) { + string hlo_string = R"( +HloModule NoFoldBatchDotTransposeIdentity + +ENTRY entry_computation { + x = f32[7,7,2,3]{3,2,1,0} parameter(0) + y = f32[7,7,3,2]{3,2,1,0} parameter(1) + transpose = f32[7,7,3,2]{3,2,1,0} transpose(y), dimensions={0,1,2,3} + ROOT dot = f32[7,7,2,2]{3,2,1,0} dot(x, transpose), lhs_contracting_dims={3}, + rhs_contracting_dims={2}, lhs_batch_dims={0,1}, rhs_batch_dims={0,1} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + EXPECT_FALSE(FoldTranspose(module.get())); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.cc b/tensorflow/compiler/xla/service/triangular_solve_expander.cc index d54eb9e78c3..4015c69e3e2 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.cc +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.cc @@ -89,16 +89,23 @@ XlaOp DiagonalBlocks(XlaOp a, int64 block_size) { // The last block might be smaller than the block size, // so we will need to pad it if (n % block_size != 0) { - // Pad with zeros + // Pad with identity matrix. auto last_blocks = SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n}); PaddingConfig config = MakeNoPaddingConfig(ndims); int64 padding = block_size - n % block_size; - config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding); config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding); last_blocks = Pad(last_blocks, Zero(builder, shape.element_type()), config); + auto eye = + IdentityMatrix(builder, shape.element_type(), padding, padding); + config = MakeNoPaddingConfig(ndims); + config.mutable_dimensions(ndims - 2)->set_edge_padding_low(n % + block_size); + eye = Pad(eye, Zero(builder, shape.element_type()), config); + last_blocks = ConcatInDim(builder, {last_blocks, eye}, ndims - 1); + // Add a singleton dimension // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size] TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(last_blocks)); @@ -121,134 +128,6 @@ XlaOp DiagonalBlocks(XlaOp a, int64 block_size) { }); } -XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a, - bool conjugate_a, - PrecisionConfig::Precision precision) { - XlaBuilder* builder = diag_blocks.builder(); - return builder->ReportErrorOrReturn([&]() -> StatusOr { - // Input is a batch of square lower triangular square matrices. Its shape is - // (..., size, size). We resize this to (num_blocks, size, size). 
- TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks)); - int64 block_size = ShapeUtil::GetDimension(shape, -1); - int64 num_blocks = ShapeUtil::ElementsIn(shape) / - tensorflow::MathUtil::IPow(block_size, 2); - diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size}); - - // The input must be triangular because we rely on that when doing - // multiplications later on - diag_blocks = Triangle(diag_blocks, /*lower=*/lower); - - // Rescale blocks to be unit triangular, but avoid dividing by - // zero (which can happen if the last block was padded) otherwise it will - // introduce nans which will propagate - auto diags = GetMatrixDiagonal(diag_blocks); - auto ones = FullLike(diags, 1); - diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags); - auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2}); - - // We can now use the fact that for an upper triangular matrix - // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have - // L22' = -L22' * L21 * L11'. In our case, L21 is a vector and our blocks - // have been rescaled to be unit triangular, so L22 = L22' = 1. - - // Initialize the output matrix with -1s on the diagonal. We use -1 instead - // of 1 because we cannot do matrix-vector multiplies with variable shapes - // inside of a loop, or do irregularly shaped in-place updates. Hence, - // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the - // entire row i.e. we calculate - // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I]) - // which means [L21 L22 0] <- [-L21 * L11', L22, 0]. - auto identity = - IdentityMatrix(builder, shape.element_type(), block_size, block_size); - auto neg_identity = -identity; - - // The first or last diagonal element should be set to 1 instead of -1 - // though, since we never update it - auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1}); - auto start_index = ConstantR0(builder, (lower) ? 0 : block_size - 1); - auto output_block = - DynamicUpdateSlice(neg_identity, pos_one, - /*start_indices=*/{start_index, start_index}); - - // Broadcast diag([1, -1, -1, ...]) to every block - XlaOp output = Broadcast(output_block, - /*broadcast_sizes=*/{num_blocks}); - - // Now we construct a loop that performs matrix-vector multiplications - // inverting the blocks one row at a time - std::vector tuple_shapes = { - // The loop iteration counter is a scalar, incremented each iteration. - ShapeUtil::MakeShape(S32, {}), - // The output has the shape of A, with one row updated each iteration. - ShapeUtil::MakeShape(shape.element_type(), - {num_blocks, block_size, block_size}), - // The input is a loop invariant. - ShapeUtil::MakeShape(shape.element_type(), - {num_blocks, block_size, block_size})}; - Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes); - - auto init_i = One(builder, S32); - auto init = Tuple(builder, {init_i, output, scaled_diag_blocks}); - - // Construct the loop condition function. - std::unique_ptr condb = - builder->CreateSubBuilder("InvertDiagCond"); - { - auto i = GetTupleElement( - Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0); - Lt(i, ConstantR0(condb.get(), block_size)); - } - TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); - - // Construct the loop body function. 
- std::unique_ptr bodyb = - builder->CreateSubBuilder("InvertDiagBody"); - { - auto input_tuple = - Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple"); - - auto i = GetTupleElement(input_tuple, 0); - auto body_out = GetTupleElement(input_tuple, 1); - auto body_input = GetTupleElement(input_tuple, 2); - - auto zero = ConstantR0(bodyb.get(), 0); - auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i; - auto input_row = - DynamicSlice(body_input, {zero, j, zero}, - /*slice_sizes=*/{num_blocks, 1, block_size}); - - // We want -L21 L11^{-1} - DotDimensionNumbers dnums; - dnums.add_lhs_batch_dimensions(0); - dnums.add_rhs_batch_dimensions(0); - dnums.add_lhs_contracting_dimensions(2); - dnums.add_rhs_contracting_dimensions(1); - PrecisionConfig precision_proto; - precision_proto.add_operand_precision(precision); - precision_proto.add_operand_precision(precision); - auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto); - - body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero}); - - auto next_i = i + ScalarLike(i, 1); - Tuple(bodyb.get(), {next_i, body_out, body_input}); - } - TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); - - // Construct the While loop and return the result, - // return while_loop(cond_fun, body_fun, init)[1] - auto invert_while = While(cond, body, init); - auto inv_diag_blocks = GetTupleElement(invert_while, 1); - - // Undo the scaling - inv_diag_blocks = Div(inv_diag_blocks, diags, - /*broadcast_dimensions=*/{0, 1}); - - // Reshape back to original batch major dimensions - return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions())); - }); -} - XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, bool left_side, bool lower, bool transpose_a, bool conjugate_a, @@ -357,10 +236,140 @@ XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks, }); } -XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, - bool transpose_a, bool conjugate_a, - bool unit_diagonal, int64 block_size, - PrecisionConfig::Precision precision) { +} // namespace + +XlaOp TriangularSolveExpander::InvertDiagonalBlocks( + XlaOp diag_blocks, bool lower_triangular, + PrecisionConfig::Precision precision) { + XlaBuilder* builder = diag_blocks.builder(); + return builder->ReportErrorOrReturn([&]() -> StatusOr { + // Input is a batch of square lower triangular square matrices. Its shape is + // (..., size, size). We resize this to (num_blocks, size, size). + TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks)); + int64 block_size = ShapeUtil::GetDimension(shape, -1); + int64 num_blocks = ShapeUtil::ElementsIn(shape) / + tensorflow::MathUtil::IPow(block_size, 2); + diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size}); + + // The input must be triangular because we rely on that when doing + // multiplications later on + diag_blocks = Triangle(diag_blocks, /*lower=*/lower_triangular); + + // Rescale blocks to be unit triangular, but avoid dividing by + // zero (which can happen if the last block was padded) otherwise it will + // introduce nans which will propagate + auto diags = GetMatrixDiagonal(diag_blocks); + auto ones = FullLike(diags, 1); + diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags); + auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2}); + + // We can now use the fact that for an upper triangular matrix + // [[L11, 0], [L21, L22]], given the inverses L11' and L22', we have + // L22' = -L22' * L21 * L11'. 
In our case, L21 is a vector and our blocks + // have been rescaled to be unit triangular, so L22 = L22' = 1. + + // Initialize the output matrix with -1s on the diagonal. We use -1 instead + // of 1 because we cannot do matrix-vector multiplies with variable shapes + // inside of a loop, or do irregularly shaped in-place updates. Hence, + // L21 <- -L22 * L21 * L11 cannot be done naively. Instead, we update the + // entire row i.e. we calculate + // [L21 L22 0] <- -[L21 L22 0] @ diag_blocks([L11', -I, -I]) + // which means [L21 L22 0] <- [-L21 * L11', L22, 0]. + auto identity = + IdentityMatrix(builder, shape.element_type(), block_size, block_size); + auto neg_identity = -identity; + + // The first or last diagonal element should be set to 1 instead of -1 + // though, since we never update it + auto pos_one = Reshape(One(builder, shape.element_type()), {1, 1}); + auto start_index = + ConstantR0(builder, lower_triangular ? 0 : block_size - 1); + auto output_block = + DynamicUpdateSlice(neg_identity, pos_one, + /*start_indices=*/{start_index, start_index}); + + // Broadcast diag([1, -1, -1, ...]) to every block + XlaOp output = Broadcast(output_block, + /*broadcast_sizes=*/{num_blocks}); + + // Now we construct a loop that performs matrix-vector multiplications + // inverting the blocks one row at a time + std::vector tuple_shapes = { + // The loop iteration counter is a scalar, incremented each iteration. + ShapeUtil::MakeShape(S32, {}), + // The output has the shape of A, with one row updated each iteration. + ShapeUtil::MakeShape(shape.element_type(), + {num_blocks, block_size, block_size}), + // The input is a loop invariant. + ShapeUtil::MakeShape(shape.element_type(), + {num_blocks, block_size, block_size})}; + Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes); + + auto init_i = One(builder, S32); + auto init = Tuple(builder, {init_i, output, scaled_diag_blocks}); + + // Construct the loop condition function. + std::unique_ptr condb = + builder->CreateSubBuilder("InvertDiagCond"); + { + auto i = GetTupleElement( + Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0); + Lt(i, ConstantR0(condb.get(), block_size)); + } + TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); + + // Construct the loop body function. + std::unique_ptr bodyb = + builder->CreateSubBuilder("InvertDiagBody"); + { + auto input_tuple = + Parameter(bodyb.get(), 0, tuple_shape, "InvertDiagBodyTuple"); + + auto i = GetTupleElement(input_tuple, 0); + auto body_out = GetTupleElement(input_tuple, 1); + auto body_input = GetTupleElement(input_tuple, 2); + + auto zero = ConstantR0(bodyb.get(), 0); + auto j = lower_triangular ? 
i : ScalarLike(i, block_size - 1) - i; + auto input_row = + DynamicSlice(body_input, {zero, j, zero}, + /*slice_sizes=*/{num_blocks, 1, block_size}); + + // We want -L21 L11^{-1} + DotDimensionNumbers dnums; + dnums.add_lhs_batch_dimensions(0); + dnums.add_rhs_batch_dimensions(0); + dnums.add_lhs_contracting_dimensions(2); + dnums.add_rhs_contracting_dimensions(1); + PrecisionConfig precision_proto; + precision_proto.add_operand_precision(precision); + precision_proto.add_operand_precision(precision); + auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto); + + body_out = DynamicUpdateSlice(body_out, update, {zero, j, zero}); + + auto next_i = i + ScalarLike(i, 1); + Tuple(bodyb.get(), {next_i, body_out, body_input}); + } + TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); + + // Construct the While loop and return the result, + // return while_loop(cond_fun, body_fun, init)[1] + auto invert_while = While(cond, body, init); + auto inv_diag_blocks = GetTupleElement(invert_while, 1); + // Undo the scaling + inv_diag_blocks = Div(inv_diag_blocks, diags, + /*broadcast_dimensions=*/{0, 1}); + + // Reshape back to original batch major dimensions + return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions())); + }); +} + +XlaOp TriangularSolveExpander::BuildTriangularSolve( + XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a, + bool conjugate_a, bool unit_diagonal, int64 block_size, + PrecisionConfig::Precision precision) { XlaBuilder* builder = a.builder(); return builder->ReportErrorOrReturn([&]() -> StatusOr { TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a)); @@ -422,6 +431,11 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, return b; } + // Degenerate case: 1x1 matrices. + if (ShapeUtil::GetDimension(a_shape, -1) == 1) { + return unit_diagonal ? b : Div(b, MaybeConjugate(a, conjugate_a)); + } + // TODO(phawkins): consider pushing triangle masking into // InvertDiagonalBlocks. if (unit_diagonal) { @@ -440,8 +454,7 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, auto diag_blocks = DiagonalBlocks(a, block_size); // We invert these blocks in parallel using batched matrix-vector products - auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, transpose_a, - conjugate_a, precision); + auto inv_diag_blocks = InvertDiagonalBlocks(diag_blocks, lower, precision); // We now find the solution using GEMMs auto x = @@ -452,8 +465,6 @@ XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, }); } -} // namespace - TriangularSolveExpander::TriangularSolveExpander(int64 block_size) : block_size_(block_size) {} diff --git a/tensorflow/compiler/xla/service/triangular_solve_expander.h b/tensorflow/compiler/xla/service/triangular_solve_expander.h index 362e8557229..3f9e58a3246 100644 --- a/tensorflow/compiler/xla/service/triangular_solve_expander.h +++ b/tensorflow/compiler/xla/service/triangular_solve_expander.h @@ -17,6 +17,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_TRIANGULAR_SOLVE_EXPANDER_H_ #include "absl/container/flat_hash_map.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/service/op_expander_pass.h" namespace xla { @@ -35,6 +36,14 @@ class TriangularSolveExpander : public OpExpanderPass { StatusOr ExpandInstruction( HloInstruction* instruction) override; + virtual XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower_triangular, + PrecisionConfig::Precision precision); + + XlaOp BuildTriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower, + bool transpose_a, bool conjugate_a, + bool unit_diagonal, int64 block_size, + PrecisionConfig::Precision precision); + private: // Block size for BuildTriangularSolve const int64 block_size_; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc index c80123bcd50..785fdecbfa0 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc @@ -37,23 +37,15 @@ namespace m = match; using absl::optional; using hlo_query::ContainsInstrWithOpcode; -// Tries to remove elements in a while loop's tuple that aren't used within the -// loop. -// -// Specifically, if a loop is tuple-shaped, and there exists some element of -// that tuple that is not used by the loop condition and is not used by the loop -// body except to pass it to the next iteration of the loop, then we can remove -// that element from the loop's tuples. -static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { - CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); - - // Don't try this transformation if the while loop isn't removable, since if - // it succeeds ultimately we're going to have to replace the old while loop - // with a new one. - if (!while_op->parent()->IsSafelyRemovable(while_op)) { - VLOG(2) << "Can't remove dead parameters from non-removable while op."; - return false; - } +// This is a utility function that removes the given tuple indices from the +// while loop init, body, and condition. The final shape returned is still the +// same as before. +static StatusOr RemoveDeadTupleIndices( + HloInstruction* while_op, absl::flat_hash_set& used_tuple_indices) { + // Build up maps from the old/new to the new/old tuple indices. + std::vector new_to_old_tuple_idx(used_tuple_indices.begin(), + used_tuple_indices.end()); + absl::c_sort(new_to_old_tuple_idx); HloModule* module = while_op->GetModule(); HloComputation* computation = while_op->parent(); @@ -62,107 +54,8 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { HloComputation* while_body = while_op->while_body(); HloInstruction* while_body_root = while_body->root_instruction(); - if (!while_init->shape().IsTuple()) { - VLOG(2) << "While op's carried value isn't tuple shaped."; - return false; - } - - if (while_body_root->opcode() != HloOpcode::kTuple) { - VLOG(2) << "While body's root is not a tuple(...) instruction."; - return false; - } - auto print_no_metadata = HloPrintOptions().set_print_metadata(false); - // Bail if param0 of while_cond or while_body has users which aren't of type - // get-tuple-element. 
- for (const HloInstruction* instr : {while_body->parameter_instruction(0), - while_cond->parameter_instruction(0)}) { - for (const HloInstruction* user : instr->users()) { - if (user->opcode() != HloOpcode::kGetTupleElement) { - VLOG(2) << "Cowardly refusing to analyze while loop with " - << instr->ToString(print_no_metadata) - << " used by non-GTE instruction " - << user->ToString(print_no_metadata) << " in computation " - << instr->parent()->name(); - return false; - } - } - } - - const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape()); - if (tuple_size == 0) { - VLOG(2) << "Can't remove elements from while loop's tuple -- it's already " - "empty."; - return false; - } - - absl::flat_hash_set used_tuple_indices; - for (HloComputation* comp : {while_body, while_cond}) { - // The HLO verifier ensures that while_input's shape matches while_init's - // shape, which we verified above is a tuple. - HloInstruction* while_input = comp->parameter_instruction(0); - - for (const HloInstruction* user : while_input->users()) { - // This user doesn't count if it's only used by the while body's root, and - // the root places the tuple element into the same index of the tuple as - // it came from. That just amounts to us carrying the variable through - // the loop. - // - // Careful: HloInstruction::operand_index returns the first index the - // operand appears in, but it may appear more than once! - if (user->user_count() == 1 && user->users().front() == while_body_root && - while_body_root->operand_index(user) == user->tuple_index() && - absl::c_count(while_body_root->operands(), user) == 1) { - continue; - } - - used_tuple_indices.insert(user->tuple_index()); - if (used_tuple_indices.size() == tuple_size) { - VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) - << " uses all of its inputs; no simplification possible."; - return false; - } - } - } - - // If a tuple element is not passed unmodified from the while body's param0 - // through to the while body's root, count that element as "used", since - // removing that element would be observable. - for (int64 i = 0; i < while_body_root->operand_count(); ++i) { - if (used_tuple_indices.contains(i)) { - continue; - } - - auto* operand = while_body_root->operand(i); - if (operand->opcode() != HloOpcode::kGetTupleElement || - operand->operand(0) != while_body->parameter_instruction(0) || - operand->tuple_index() != i) { - VLOG(2) << "Tuple index " << i - << " is not passed through loop body unmodified."; - used_tuple_indices.insert(i); - - if (used_tuple_indices.size() == tuple_size) { - VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) - << " uses all of its inputs; no simplification possible."; - return false; - } - } - } - - // If we got here, used_tuple_indices.size() < tuple_size, meaning some - // elements of the loop's tuple aren't used by while_body or while_cond. - CHECK_LT(used_tuple_indices.size(), tuple_size); - - VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size() - << " elements from tuple of " - << while_op->ToString(print_no_metadata); - - // Build up maps from the old/new to the new/old tuple indices. 
- std::vector new_to_old_tuple_idx(used_tuple_indices.begin(), - used_tuple_indices.end()); - absl::c_sort(new_to_old_tuple_idx); - absl::flat_hash_map old_to_new_tuple_idx; for (int64 new_idx = 0; new_idx < new_to_old_tuple_idx.size(); ++new_idx) { int64 old_idx = new_to_old_tuple_idx[new_idx]; @@ -288,6 +181,7 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { // The tuple simplifier will then simplify this if possible, removing // new_tuple and while_init. std::vector new_tuple_elems; + const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape()); for (int64 old_idx = 0; old_idx < tuple_size; ++old_idx) { auto new_tuple_idx_it = old_to_new_tuple_idx.find(old_idx); if (new_tuple_idx_it != old_to_new_tuple_idx.end()) { @@ -305,9 +199,293 @@ static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { HloInstruction* new_tuple = computation->AddInstruction(HloInstruction::CreateTuple(new_tuple_elems)); TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, new_tuple)); + + return new_while_op; +} + +// Tries to remove elements in a while loop's tuple that aren't used within the +// loop. +// +// Specifically, if a loop is tuple-shaped, and there exists some element of +// that tuple that is not used by the loop condition and is not used by the loop +// body except to pass it to the next iteration of the loop, then we can remove +// that element from the loop's tuples. +static StatusOr TryRemoveDeadWhileParams(HloInstruction* while_op) { + CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); + + // Don't try this transformation if the while loop isn't removable, since if + // it succeeds ultimately we're going to have to replace the old while loop + // with a new one. + if (!while_op->parent()->IsSafelyRemovable(while_op)) { + VLOG(2) << "Can't remove dead parameters from non-removable while op."; + return false; + } + + HloInstruction* while_init = while_op->mutable_operand(0); + HloComputation* while_cond = while_op->while_condition(); + HloComputation* while_body = while_op->while_body(); + HloInstruction* while_body_root = while_body->root_instruction(); + + if (!while_init->shape().IsTuple()) { + VLOG(2) << "While op's carried value isn't tuple shaped."; + return false; + } + + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While body's root is not a tuple(...) instruction."; + return false; + } + + auto print_no_metadata = HloPrintOptions().set_print_metadata(false); + + // Bail if param0 of while_cond or while_body has users which aren't of type + // get-tuple-element. + for (const HloInstruction* instr : {while_body->parameter_instruction(0), + while_cond->parameter_instruction(0)}) { + for (const HloInstruction* user : instr->users()) { + if (user->opcode() != HloOpcode::kGetTupleElement) { + VLOG(2) << "Cowardly refusing to analyze while loop with " + << instr->ToString(print_no_metadata) + << " used by non-GTE instruction " + << user->ToString(print_no_metadata) << " in computation " + << instr->parent()->name(); + return false; + } + } + } + + const int64 tuple_size = ShapeUtil::TupleElementCount(while_init->shape()); + if (tuple_size == 0) { + VLOG(2) << "Can't remove elements from while loop's tuple -- it's already " + "empty."; + return false; + } + + absl::flat_hash_set used_tuple_indices; + for (HloComputation* comp : {while_body, while_cond}) { + // The HLO verifier ensures that while_input's shape matches while_init's + // shape, which we verified above is a tuple. 
+ HloInstruction* while_input = comp->parameter_instruction(0); + + for (const HloInstruction* user : while_input->users()) { + // This user doesn't count if it's only used by the while body's root, and + // the root places the tuple element into the same index of the tuple as + // it came from. That just amounts to us carrying the variable through + // the loop. + // + // Careful: HloInstruction::operand_index returns the first index the + // operand appears in, but it may appear more than once! + if (user->user_count() == 1 && user->users().front() == while_body_root && + while_body_root->operand_index(user) == user->tuple_index() && + absl::c_count(while_body_root->operands(), user) == 1) { + continue; + } + + used_tuple_indices.insert(user->tuple_index()); + if (used_tuple_indices.size() == tuple_size) { + VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) + << " uses all of its inputs; no simplification possible."; + return false; + } + } + } + + // If a tuple element is not passed unmodified from the while body's param0 + // through to the while body's root, count that element as "used", since + // removing that element would be observable. + for (int64 i = 0; i < while_body_root->operand_count(); ++i) { + if (used_tuple_indices.contains(i)) { + continue; + } + + auto* operand = while_body_root->operand(i); + if (operand->opcode() != HloOpcode::kGetTupleElement || + operand->operand(0) != while_body->parameter_instruction(0) || + operand->tuple_index() != i) { + VLOG(2) << "Tuple index " << i + << " is not passed through loop body unmodified."; + used_tuple_indices.insert(i); + + if (used_tuple_indices.size() == tuple_size) { + VLOG(2) << "Loop " << while_op->ToString(print_no_metadata) + << " uses all of its inputs; no simplification possible."; + return false; + } + } + } + + // If we got here, used_tuple_indices.size() < tuple_size, meaning some + // elements of the loop's tuple aren't used by while_body or while_cond. + CHECK_LT(used_tuple_indices.size(), tuple_size); + + VLOG(1) << "Eliminating " << tuple_size - used_tuple_indices.size() + << " elements from tuple of " + << while_op->ToString(print_no_metadata); + + TF_ASSIGN_OR_RETURN(while_op, + RemoveDeadTupleIndices(while_op, used_tuple_indices)); + return true; } +// This is a helper function for TryRemoveRepeatedWhileTupleIndices. It removes +// duplicates by replacing them with tuple_index, followed by a call to +// RemoveDeadTupleIndices. +static StatusOr TryRemoveRepeatedWhileTupleIndicesHelper( + HloInstruction* while_op, const int64 tuple_index, + absl::flat_hash_set& duplicates) { + HloComputation* while_cond = while_op->while_condition(); + HloComputation* while_body = while_op->while_body(); + HloInstruction* while_init = while_op->mutable_operand(0); + + VLOG(2) << "while_init " << while_init->ToString() << " operands " + << while_init->operand_count(); + VLOG(2) << "while_body_root " << while_body->root_instruction()->ToString() + << " operands " << while_body->root_instruction()->operand_count(); + + // Change the loop body and condition such that uses of the duplicates are + // replaced with the original tuple element. 
+ for (HloComputation* comp : {while_body, while_cond}) { + auto new_get = comp->AddInstruction(HloInstruction::CreateGetTupleElement( + comp->parameter_instruction(0)->shape().tuple_shapes(tuple_index), + comp->parameter_instruction(0), tuple_index)); + + std::vector instrs_to_replace; + for (auto* instr : comp->instructions()) { + if (instr->opcode() == HloOpcode::kGetTupleElement && + duplicates.contains(instr->tuple_index()) && + instr->operand(0) == comp->parameter_instruction(0)) { + instrs_to_replace.push_back(instr); + } + } + + for (auto instr : instrs_to_replace) { + TF_RETURN_IF_ERROR(comp->ReplaceInstruction(instr, new_get)); + } + } + + // We know which tuple indices are useful; i.e, those which aren't duplicates. + absl::flat_hash_set used_tuple_indices; + for (int index = 0; index < while_init->shape().tuple_shapes_size(); + ++index) { + if (!duplicates.count(index)) { + used_tuple_indices.insert(index); + } + } + + // Remove the duplicate tuple elements. + TF_ASSIGN_OR_RETURN(while_op, + RemoveDeadTupleIndices(while_op, used_tuple_indices)); + + return while_op; +} + +// If the while loop init passes the same values to several tuple indices, and +// if the body keeps on passing them through, we can remove the duplicates. +static StatusOr TryRemoveRepeatedWhileTupleIndices( + HloInstruction* while_op) { + CHECK_EQ(while_op->opcode(), HloOpcode::kWhile); + + int index_to_investigate = 0; + // Don't try this transformation if the while loop isn't removable, since if + // it succeeds ultimately we're going to have to replace the old while loop + // with a new one. + if (!while_op->parent()->IsSafelyRemovable(while_op)) { + VLOG(2) << "Can't remove dead parameters from non-removable while op."; + return false; + } + + HloInstruction* while_init = while_op->mutable_operand(0); + HloComputation* while_cond = while_op->while_condition(); + HloComputation* while_body = while_op->while_body(); + HloInstruction* while_body_root = while_body->root_instruction(); + + if (!while_init->shape().IsTuple()) { + VLOG(2) << "While op's carried value isn't tuple shaped."; + return false; + } + + bool changed = false; + while (index_to_investigate < while_init->shape().tuple_shapes_size()) { + if (!while_init->shape().IsTuple() || + while_init->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While op's carried value isn't tuple shaped."; + return false; + } + + if (while_body_root->opcode() != HloOpcode::kTuple) { + VLOG(2) << "While body's root is not a tuple(...) instruction."; + return false; + } + + auto& while_shape = while_init->shape(); + VLOG(2) << "Iterating " << index_to_investigate; + + absl::flat_hash_set duplicates; + auto* pivot_init_elem = while_init->operand(index_to_investigate); + auto* pivot_body_elem = while_body_root->operand(index_to_investigate); + if (pivot_body_elem->opcode() == HloOpcode::kGetTupleElement && + pivot_body_elem->operand(0) == while_body->parameter_instruction(0)) { + if (pivot_body_elem->tuple_index() != index_to_investigate) { + VLOG(2) << "Mismatch between pivot_body_elem->tuple_index() " + << pivot_body_elem->tuple_index() << " index_to_investigate " + << index_to_investigate; + index_to_investigate++; + continue; + } + } else { + index_to_investigate++; + continue; + } + + // Look from index_to_investigate onwards to see if it is repeated. 
+ for (int64 i = index_to_investigate + 1; + i < while_shape.tuple_shapes_size(); ++i) { + auto* init_elem = while_init->operand(i); + auto* body_elem = while_body_root->operand(i); + if (body_elem->opcode() == HloOpcode::kGetTupleElement && + body_elem->operand(0) == while_body->parameter_instruction(0)) { + if (body_elem->tuple_index() != i) { + VLOG(2) << "Mismatch between body_elem->tuple_index() " + << body_elem->tuple_index() << " i " << i; + continue; + } + } else { + continue; + } + + if (pivot_init_elem == init_elem) { + VLOG(2) << "init_elem " << init_elem->ToString() << " pivot_init_elem " + << pivot_init_elem->ToString(); + VLOG(2) << "body_elem " << body_elem->ToString() << " pivot_body_elem " + << pivot_body_elem->ToString(); + duplicates.insert(i); + } + } + + // If duplicates are found, call the helper to remove them. + if (!duplicates.empty()) { + VLOG(2) << "Duplicate found " << duplicates.size() << " pivot_init " + << pivot_init_elem->ToString(); + TF_ASSIGN_OR_RETURN(while_op, + TryRemoveRepeatedWhileTupleIndicesHelper( + while_op, index_to_investigate, duplicates)); + changed = true; + VLOG(2) << "Changed while_op " << while_op->ToString() + << " while_op operand count " << while_op->operand_count(); + // Update the while loop variables so we can continue looking for + // duplicates of a different index. + while_init = while_op->mutable_operand(0); + while_cond = while_op->while_condition(); + while_body = while_op->while_body(); + while_body_root = while_body->root_instruction(); + } + index_to_investigate++; + } + + return changed; +} + // Removes each loop parameter (i.e. member of the while loop tuple) that is a // constant and is the same in the while loop body and the while loop init. static StatusOr TryRemoveConstantParams(HloInstruction* while_op) { @@ -1048,6 +1226,7 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { TF_ASSIGN_OR_RETURN(result, TryRemoveWhileLoop(while_op)); changed |= result; + if (result) { // Don't continue simplifying after successfully removing the while loop // -- that would result in use-after-free nastiness. @@ -1067,6 +1246,12 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { // successful, meaning that `while_op` is no longer valid after one of these // transformations returns true. 
+ TF_ASSIGN_OR_RETURN(result, TryRemoveRepeatedWhileTupleIndices(while_op)); + changed |= result; + if (result) { + continue; + } + TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op)); changed |= result; if (result) { @@ -1074,6 +1259,7 @@ StatusOr WhileLoopSimplifier::Run(HloModule* module) { } TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op)); + changed |= result; if (result) { continue; diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc index d715fb3857a..c93cb5dc347 100644 --- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc @@ -794,5 +794,51 @@ TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) { .ValueOrDie()); } +TEST_F(WhileLoopSimplifierTest, RemoveRepeatedParams) { + const string hlo_string = R"( + HloModule SwappingTupleElements + + SwappingTupleElements.body { + loop_var = (s32[], s32[], s32[]) parameter(0) + get-tuple-element = s32[] get-tuple-element(loop_var), index=0 + get-tuple-element.1 = s32[] get-tuple-element(loop_var), index=1 + get-tuple-element.2 = s32[] get-tuple-element(loop_var), index=2 + y = s32[] add(get-tuple-element.1, get-tuple-element.2) + ROOT tuple = (s32[], s32[], s32[]) tuple(s32[] get-tuple-element, y, + s32[] get-tuple-element.2) + } + + SwappingTupleElements.always_true { + param = (s32[], s32[], s32[]) parameter(0) + get-tuple-element = s32[] get-tuple-element(param), index=0 + get-tuple-element.1 = s32[] get-tuple-element(param), index=1 + ROOT less-than = pred[] compare(get-tuple-element, get-tuple-element.1), direction=LT + } + + ENTRY SwappingTupleElements { + x = s32[] parameter(0) + y = s32[] parameter(1) + tuple.1 = (s32[], s32[], s32[]) tuple(s32[] x, s32[] y, s32[] x) + ROOT while = (s32[], s32[], s32[]) while(tuple.1), + condition=SwappingTupleElements.always_true, + body=SwappingTupleElements.body + } + )"; + + auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie(); + EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie()); + HloInstruction* new_while = FindFirstWhile(m.get()); + Shape new_while_shape = ParseShape("(s32[], s32[])").ValueOrDie(); + EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->root_instruction()->shape(), new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_body()->parameter_instruction(0)->shape(), + new_while_shape)); + EXPECT_TRUE(ShapeUtil::Equal( + new_while->while_condition()->parameter_instruction(0)->shape(), + new_while_shape)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h index 73bb3327784..b1c96e9becf 100644 --- a/tensorflow/compiler/xla/shape_tree.h +++ b/tensorflow/compiler/xla/shape_tree.h @@ -70,6 +70,8 @@ struct IndexTableEntry { template class ShapeTreeIterator; +template +class ShapeTreeLeafIterator; // A ShapeTree is a recursive data structure which mirrors the structure of a // XLA shape and holds a value of type T for each subshape (i.e. 
tuple or array) @@ -158,23 +160,25 @@ class ShapeTree { using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; + using leaf_iterator = + ShapeTreeLeafIterator, + typename std::vector::iterator, + std::pair>; + using const_leaf_iterator = + ShapeTreeLeafIterator, + typename std::vector::const_iterator, + const std::pair>; + using reverse_leaf_iterator = std::reverse_iterator; + using const_reverse_leaf_iterator = + std::reverse_iterator; + // begin/end for iterating over all nodes. - iterator begin() { - return iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/false); - } - iterator end() { - return iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/false); - } + iterator begin() { return iterator(&nodes_, nodes_.begin()); } + iterator end() { return iterator(&nodes_, nodes_.end()); } const_iterator begin() const { - return const_iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/false); - } - const_iterator end() const { - return const_iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/false); + return const_iterator(&nodes_, nodes_.begin()); } + const_iterator end() const { return const_iterator(&nodes_, nodes_.end()); } // rbegin/rend for iterating over all nodes in reverse. reverse_iterator rbegin() { return reverse_iterator(end()); } @@ -188,37 +192,33 @@ class ShapeTree { // leaf_begin()/leaf_end() iterates over all leaf nodes (nodes with no // children). - iterator leaf_begin() { - return iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/true); + leaf_iterator leaf_begin() { return leaf_iterator(&nodes_, nodes_.begin()); } + leaf_iterator leaf_end() { return leaf_iterator(&nodes_, nodes_.end()); } + const_leaf_iterator leaf_begin() const { + return const_leaf_iterator(&nodes_, nodes_.begin()); } - iterator leaf_end() { - return iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/true); - } - const_iterator leaf_begin() const { - return const_iterator(&nodes_, nodes_.begin(), - /*iterate_leaves_only=*/true); - } - const_iterator leaf_end() const { - return const_iterator(&nodes_, nodes_.end(), - /*iterate_leaves_only=*/true); + const_leaf_iterator leaf_end() const { + return const_leaf_iterator(&nodes_, nodes_.end()); } // range-based iterator for leaf_begin()/leaf_end(). - tensorflow::gtl::iterator_range leaves() { + tensorflow::gtl::iterator_range leaves() { return tensorflow::gtl::make_range(leaf_begin(), leaf_end()); } - tensorflow::gtl::iterator_range leaves() const { + tensorflow::gtl::iterator_range leaves() const { return tensorflow::gtl::make_range(leaf_begin(), leaf_end()); } - reverse_iterator leaf_rbegin() { return reverse_iterator(leaf_end()); } - reverse_iterator leaf_rend() { return reverse_iterator(leaf_begin()); } - const_reverse_iterator leaf_rbegin() const { - return const_reverse_iterator(leaf_end()); + reverse_leaf_iterator leaf_rbegin() { + return reverse_leaf_iterator(leaf_end()); } - const_reverse_iterator leaf_rend() const { - return const_reverse_iterator(leaf_begin()); + reverse_leaf_iterator leaf_rend() { + return reverse_leaf_iterator(leaf_begin()); + } + const_reverse_leaf_iterator leaf_rbegin() const { + return const_reverse_leaf_iterator(leaf_end()); + } + const_reverse_leaf_iterator leaf_rend() const { + return const_reverse_leaf_iterator(leaf_begin()); } // Returns an iterator pointing to the given ShapeIndex. 
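
For context on the leaf-iterator change above: the filtering of interior nodes moves out of ShapeTreeIterator's runtime flag and into the dedicated ShapeTreeLeafIterator type, so existing leaves()/leaf_begin()/leaf_end() call sites keep working unchanged. A minimal usage sketch follows (not part of this change; the helper name and the GetLeafCount remark are illustrative only, assuming the ShapeTree value-initializing constructor shown in the tests below):

#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Sums the values stored at the array (leaf) subshapes of a tuple shape.
int SumLeafValues(const xla::Shape& tuple_shape) {
  xla::ShapeTree<int> tree(tuple_shape, /*init_value=*/1);
  int total = 0;
  // leaves() now yields (const_)leaf_iterator; each element is still a
  // std::pair<ShapeIndex, T>, so loop bodies are unchanged.
  for (const auto& index_and_value : tree.leaves()) {
    total += index_and_value.second;
  }
  // With every leaf initialized to 1, total equals the number of leaves,
  // i.e. ShapeUtil::GetLeafCount(tuple_shape).
  return total;
}
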
@@ -226,12 +226,12 @@ class ShapeTree { iterator find(ShapeIndexView index) { Node* element = Lookup(index); auto element_iter = nodes_.begin() + (element - &nodes_[0]); - return iterator(&nodes_, element_iter, /*iterate_leaves_only=*/false); + return iterator(&nodes_, element_iter); } const_iterator find(ShapeIndexView index) const { - Node* element = Lookup(index); + const Node* element = Lookup(index); auto element_iter = nodes_.cbegin() + (element - &nodes_[0]); - return const_iterator(&nodes_, element_iter, /*iterate_leaves_only=*/false); + return const_iterator(&nodes_, element_iter); } // Returns the number of leaf nodes in the tree. @@ -343,21 +343,11 @@ template class ShapeTreeIterator : public std::iterator { public: - ShapeTreeIterator(ContainerType* nodes, IteratorType node, - bool iterate_leaves_only) - : nodes_(nodes), - node_(std::move(node)), - iterate_leaves_only_(iterate_leaves_only) { - while (iterate_leaves_only && node_ != nodes_->end() && !node_->is_leaf) { - ++node_; - } - } + ShapeTreeIterator(ContainerType* nodes, IteratorType node) + : nodes_(nodes), node_(std::move(node)) {} ShapeTreeIterator& operator++() { ++node_; - while (iterate_leaves_only_ && node_ != nodes_->end() && !node_->is_leaf) { - ++node_; - } return *this; } ShapeTreeIterator operator++(int) { @@ -368,9 +358,6 @@ class ShapeTreeIterator ShapeTreeIterator& operator--() { --node_; - while (iterate_leaves_only_ && node_ > nodes_->begin() && !node_->is_leaf) { - --node_; - } return *this; } ShapeTreeIterator operator--(int) { @@ -385,14 +372,66 @@ class ShapeTreeIterator bool operator!=(const ShapeTreeIterator& other) const { return node_ != other.node_; } - ValueType& operator*() { return node_->data; } - ValueType* operator->() { return &node_->data; } + ValueType& operator*() const { return node_->data; } + ValueType* operator->() const { return &node_->data; } + + private: + ContainerType* nodes_; + IteratorType node_; +}; + +// Internal iterator that performs a pre-order walk of the leaves. This is cheap +// to copy. The iterator value_type is equivalent to a std::pair&, +// similar to std::map. +template +class ShapeTreeLeafIterator + : public std::iterator { + public: + ShapeTreeLeafIterator(ContainerType* nodes, IteratorType node) + : nodes_(nodes), node_(std::move(node)) { + while (node_ != nodes_->end() && !node_->is_leaf) { + ++node_; + } + } + + ShapeTreeLeafIterator& operator++() { + ++node_; + while (node_ != nodes_->end() && !node_->is_leaf) { + ++node_; + } + return *this; + } + ShapeTreeLeafIterator operator++(int) { + auto i = *this; + ++(*this); + return i; + } + + ShapeTreeLeafIterator& operator--() { + --node_; + while (node_ > nodes_->begin() && !node_->is_leaf) { + --node_; + } + return *this; + } + ShapeTreeLeafIterator operator--(int) { + auto i = *this; + --(*this); + return i; + } + + bool operator==(const ShapeTreeLeafIterator& other) const { + return node_ == other.node_; + } + bool operator!=(const ShapeTreeLeafIterator& other) const { + return node_ != other.node_; + } + ValueType& operator*() const { return node_->data; } + ValueType* operator->() const { return &node_->data; } private: ContainerType* nodes_; IteratorType node_; - // True if we should not include interior nodes in our walk. 
- const bool iterate_leaves_only_; }; template @@ -648,7 +687,9 @@ void ShapeTree::CopySubtreeFrom(const ShapeTree& other, const ShapeIndex& target_base_index) { CHECK(ShapeUtil::Compatible( ShapeUtil::GetSubshape(shape(), target_base_index), - ShapeUtil::GetSubshape(other.shape(), source_base_index))); + ShapeUtil::GetSubshape(other.shape(), source_base_index))) + << ShapeUtil::GetSubshape(shape(), target_base_index) << " vs " + << ShapeUtil::GetSubshape(other.shape(), source_base_index); ForEachMutableElement([this, &other, &source_base_index, &target_base_index]( const ShapeIndex& index, T* data) { // Copy the data element only if index is in the diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index 2b6c484bc4f..c294355e269 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -485,6 +485,30 @@ TEST_F(ShapeTreeTest, ReverseIterateOrder) { })); } +// Ensures that we can find an element at an index that we know ahead of time to +// be occupied in a 'ShapeTree' via the 'find' API. +TEST_F(ShapeTreeTest, Find) { + ShapeTree t(nested_tuple_shape_, 42); + auto found = t.find({1, 0}); + EXPECT_NE(found, t.end()); + // The found key must be the same key we searched for. + EXPECT_EQ(found->first, ShapeIndex({1, 0})); + // The 'ShapeTree' has 42 at every position. + EXPECT_EQ(found->second, 42); +} + +// Ensures that we can find an element at an index that we know ahead of time to +// be occupied in a 'const ShapeTree' via the 'find' API. +TEST_F(ShapeTreeTest, ConstFind) { + const ShapeTree t(nested_tuple_shape_, 42); + auto found = t.find({1, 0}); + EXPECT_NE(found, t.end()); + // The found key must be the same key we searched for. + EXPECT_EQ(found->first, ShapeIndex({1, 0})); + // The 'ShapeTree' has 42 at every position. + EXPECT_EQ(found->second, 42); +} + TEST_F(ShapeTreeTest, IterateOrderLeaves) { ShapeTree t(nested_tuple_shape_, 42); std::vector v; diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 0833919b124..0c877bf6102 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -1623,4 +1623,14 @@ static Shape MergeDimensions(absl::Span segs, return absl::nullopt; } +Shape ShapeUtil::DeviceShapeToHostShape(Shape s) { + ForEachMutableSubshape(&s, [](Shape* subshape, const ShapeIndex& index) { + if (subshape->IsArray()) { + subshape->mutable_layout()->clear_tiles(); + subshape->mutable_layout()->set_memory_space(Layout::kDefaultMemorySpace); + } + }); + return s; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 3f69a8b0aca..5a5695d32ee 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -783,6 +783,10 @@ class ShapeUtil { static absl::optional> FindTranspose021(const Shape& a, const Shape& b); + // Strips device-specific information, namely tiling and memory-space + // information, from a shape. + static Shape DeviceShapeToHostShape(Shape s); + private: // Validates the shape size is sane. This makes sure it's safe to do // calculations in int64 without overflowing. 
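
The ShapeUtil::DeviceShapeToHostShape helper declared above only touches layout metadata: for every array subshape it clears tiling and resets the memory space, leaving element types, dimensions, and minor-to-major order intact. A hedged sketch of its effect (not part of this change; the memory-space value 1 is arbitrary and purely illustrative):

#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/logging.h"

void DeviceShapeToHostShapeSketch() {
  // Build an f32[128,64] shape and mark it as living in a non-default
  // memory space, roughly as a transfer manager might for a device buffer.
  xla::Shape device_shape = xla::ShapeUtil::MakeShapeWithLayout(
      xla::F32, /*dimensions=*/{128, 64}, /*minor_to_major=*/{1, 0});
  device_shape.mutable_layout()->set_memory_space(1);

  xla::Shape host_shape = xla::ShapeUtil::DeviceShapeToHostShape(device_shape);
  // Tiles are cleared and the memory space is back to the default; the
  // dimensions and minor-to-major order are unchanged.
  CHECK(host_shape.layout().tiles().empty());
  CHECK_EQ(host_shape.layout().memory_space(),
           xla::Layout::kDefaultMemorySpace);
}
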
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 17444c042e7..98ed49ad76a 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -6,7 +6,14 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", ) + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "genrule") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( default_visibility = [":friends"], @@ -64,9 +71,9 @@ cc_library( hdrs = ["manifest_checking_test.h"], deps = [ ":test_macros_header", - "//tensorflow/core:regexp_internal", "//tensorflow/core:test", "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], @@ -164,8 +171,8 @@ cc_library( "//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/memory", @@ -226,8 +233,8 @@ cc_library( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", @@ -305,7 +312,7 @@ cc_library( "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", "//third_party/eigen3", "@com_google_absl//absl/memory", @@ -380,12 +387,7 @@ xla_test( name = "conv_depthwise_backprop_filter_test", timeout = "long", srcs = ["conv_depthwise_backprop_filter_test.cc"], - # these backends do not natively handle batch group counts. 
- disabled_backends = [ - "gpu", - "cpu", - ], - shard_count = 6, + shard_count = 40, deps = [ ":test_macros_header", "//tensorflow/compiler/xla:execution_options_util", @@ -507,8 +509,8 @@ xla_test( "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", "//tensorflow/core:test", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", @@ -553,8 +555,8 @@ xla_test( "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -1456,8 +1458,8 @@ xla_test( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -1913,8 +1915,8 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -1922,9 +1924,14 @@ xla_test( name = "concat_test", srcs = ["concat_test.cc"], deps = [ + ":client_library_test_base", + ":hlo_test_base", + ":literal_test_util", ":test_macros_header", + ":xla_internal_test_main", "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", @@ -1932,9 +1939,6 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", ], ) @@ -1952,8 +1956,8 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base", ], @@ -1982,8 +1986,8 @@ xla_test( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -2043,8 +2047,8 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", ], ) @@ -2398,8 +2402,8 @@ xla_test( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + 
"//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -2521,8 +2525,8 @@ xla_test( "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/core/platform:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -2681,6 +2685,7 @@ xla_test( xla_test( name = "cholesky_test", srcs = ["cholesky_test.cc"], + real_hardware_only = True, tags = [ "no_rocm", "optonly", @@ -2699,5 +2704,6 @@ xla_test( "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", + "//tensorflow/core/platform:tensor_float_32_utils", ], ) diff --git a/tensorflow/compiler/xla/tests/buffer_donation_test.cc b/tensorflow/compiler/xla/tests/buffer_donation_test.cc index f78083fe2af..7915737178d 100644 --- a/tensorflow/compiler/xla/tests/buffer_donation_test.cc +++ b/tensorflow/compiler/xla/tests/buffer_donation_test.cc @@ -119,8 +119,7 @@ class BufferDonationTest : public HloTestBase { } }); - args.emplace_back( - ExecutionInput(std::move(owned_buffers), argument_literal.shape())); + args.emplace_back(ExecutionInput(std::move(owned_buffers))); } StatusOr output_status = diff --git a/tensorflow/compiler/xla/tests/cholesky_test.cc b/tensorflow/compiler/xla/tests/cholesky_test.cc index e7f5ca5ed8e..4fa28736d4d 100644 --- a/tensorflow/compiler/xla/tests/cholesky_test.cc +++ b/tensorflow/compiler/xla/tests/cholesky_test.cc @@ -30,6 +30,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" namespace xla { namespace { @@ -60,6 +61,44 @@ XLA_TEST_F(CholeskyTest, NonPSDInput) { ErrorSpec(1e-4, 1e-4)); } +XLA_TEST_F(CholeskyTest, NonPSDBatched) { + XlaBuilder builder(TestName()); + + Array3D a_vals({ + { + {10, 0, 0}, + {1, 20, 0}, + {1, 1, 30}, + }, + { + {1, 1, 1}, + {1, 1, 1}, + {1, 1, 1}, + }, + }); + + XlaOp a; + auto a_data = CreateR3Parameter(a_vals, 0, "a", &builder, &a); + Cholesky(a, /*lower=*/true); + + float nan = std::numeric_limits::quiet_NaN(); + Array3D expected({ + { + {3.16227766, 0., 0.}, + {0.31622777, 4.4609416, 0.}, + {0.31622777, 0.20175113, 5.46436606}, + }, + { + {nan, nan, nan}, + {nan, nan, nan}, + {nan, nan, nan}, + }, + }); + + ComputeAndCompareR3(&builder, expected, {a_data.get()}, + ErrorSpec(1e-4, 1e-4)); +} + XLA_TEST_F(CholeskyTest, Lower) { XlaBuilder builder(TestName()); @@ -180,7 +219,9 @@ class RandomCholeskyTest : public ClientLibraryTestBase, public ::testing::WithParamInterface {}; -XLA_TEST_P(RandomCholeskyTest, Random) { +XLA_TEST_P(RandomCholeskyTest, Real) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); XlaBuilder builder(TestName()); auto test_params = GetParam(); @@ -217,14 +258,65 @@ XLA_TEST_P(RandomCholeskyTest, Random) { ErrorSpec(1e-4, 1e-4)); } +XLA_TEST_P(RandomCholeskyTest, Complex) { + // Test fails with TensorFloat-32 enabled + tensorflow::enable_tensor_float_32_execution(false); + XlaBuilder builder(TestName()); + + auto test_params = GetParam(); + std::vector dimensions = {std::get<0>(test_params), + std::get<1>(test_params), + std::get<1>(test_params)}; + bool lower = std::get<2>(test_params); + Shape shape 
= ShapeUtil::MakeShape(F32, dimensions); + TF_ASSERT_OK_AND_ASSIGN( + auto literal_real, + LiteralUtil::CreateRandomLiteral(shape, 0.0, 1.0)); + TF_ASSERT_OK_AND_ASSIGN( + auto literal_imag, + LiteralUtil::CreateRandomLiteral(shape, 0.0, 1.0)); + + auto input_real = Parameter(&builder, 0, shape, "input_real"); + auto input_imag = Parameter(&builder, 1, shape, "input_imag"); + auto input = Complex(input_real, input_imag); + // Form a random positive definite matrix. + auto matrix = BatchDot(input, TransposeInMinorDims(Conj(input)), + PrecisionConfig::HIGHEST); + + auto cholesky = Triangle(Cholesky(matrix, lower), lower); + + // Verify that ||matrix - cholesky * cholesky_t||_2 ~= 0 + XlaOp verification; + if (lower) { + verification = BatchDot(cholesky, TransposeInMinorDims(Conj(cholesky)), + PrecisionConfig::HIGHEST); + } else { + verification = BatchDot(TransposeInMinorDims(Conj(cholesky)), cholesky, + PrecisionConfig::HIGHEST); + } + auto delta = matrix - verification; + Reduce(Abs(delta * Conj(delta)), ConstantR0(&builder, 0.0), + CreateScalarAddComputation(F32, &builder), {0, 1, 2}); + + TF_ASSERT_OK_AND_ASSIGN(auto input_data_real, + client_->TransferToServer(literal_real)); + TF_ASSERT_OK_AND_ASSIGN(auto input_data_imag, + client_->TransferToServer(literal_imag)); + ComputeAndCompareR0(&builder, 0.0, + {input_data_real.get(), input_data_imag.get()}, + ErrorSpec(1e-4, 1e-4)); +} + INSTANTIATE_TEST_SUITE_P(RandomCholeskyTestInstance, RandomCholeskyTest, ::testing::Values(CholeskyTestCase{1, 1, true}, CholeskyTestCase{1, 2, true}, CholeskyTestCase{1, 50, true}, CholeskyTestCase{1, 50, false}, + CholeskyTestCase{1, 255, false}, CholeskyTestCase{10, 5, true}, CholeskyTestCase{5, 10, false}, - CholeskyTestCase{2, 20, true})); + CholeskyTestCase{2, 20, true}, + CholeskyTestCase{2, 129, true})); } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 0e99ede5d01..6acbb7a9cf0 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -605,7 +605,7 @@ XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal, : LiteralSlice(literal)); } -std::unique_ptr +StatusOr> ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number, const Literal& literal, const string& name, @@ -637,15 +637,14 @@ Literal ClientLibraryTestBase::MaybeConvertLiteralToBfloat16( return literal.Clone(); } -std::unique_ptr +StatusOr> ClientLibraryTestBase::CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, const DeviceHandle* device_handle, XlaBuilder* builder, XlaOp* data_handle) { Literal param_literal = MaybeConvertLiteralToBfloat16(literal); - std::unique_ptr data = - client_->TransferToServer(param_literal, device_handle) - .ConsumeValueOrDie(); + TF_ASSIGN_OR_RETURN(auto data, + client_->TransferToServer(param_literal, device_handle)); *data_handle = Parameter(builder, parameter_number, param_literal.shape(), name); return data; diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 17bb70bdb42..3c9e37b8fa4 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -270,14 +270,14 @@ class ClientLibraryTestBase : public ManifestCheckingTest { // server, then stores into "data_handle" 
the global handle for that // parameter. When the use_bfloat16 flag is set but the literal has F32 // elements, the literal will be converted to BF16 before being transferred. - std::unique_ptr CreateParameterAndTransferLiteral( + StatusOr> CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, XlaBuilder* builder, XlaOp* data_handle); // As above, but the caller can specify the device that the literal is // transferred to. If device_handle is nullptr, the literal will be // transferred to the default device. - std::unique_ptr CreateParameterAndTransferLiteral( + StatusOr> CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, const DeviceHandle* device_handle, XlaBuilder* builder, XlaOp* data_handle); diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc index 4f5b525a342..9df83e30ad4 100644 --- a/tensorflow/compiler/xla/tests/concat_test.cc +++ b/tensorflow/compiler/xla/tests/concat_test.cc @@ -21,11 +21,13 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/platform/test.h" @@ -34,6 +36,7 @@ namespace xla { namespace { using ConcatTest = ClientLibraryTestBase; +using ConcatTestHlo = HloTestBase; using ::testing::HasSubstr; // Concatenate expects at least one argument. @@ -518,6 +521,250 @@ XLA_TEST_F(ConcatTest, ConcatDeeplyNested) { ComputeAndCompareR1(&builder, expected, {a_data.get()}); } +// TODO(b/169314478): Enable the test when the slow compilation is fixed. 
+XLA_TEST_F(ConcatTestHlo, DISABLED_ConcatWithBitcast) { + auto module = ParseAndReturnVerifiedModule(R"( +HloModule jit_broken.874 + +primitive_computation_add.866 { + parameter.867 = f32[] parameter(0) + parameter.868 = f32[] parameter(1) + ROOT add.869 = f32[] add(parameter.867, parameter.868) +} + +ENTRY jit_broken.874 { + parameter.38 = f32[4,2]{1,0} parameter(0) + reshape.723 = f32[4,2,1]{2,1,0} reshape(parameter.38) + reshape.724 = f32[4,2,1]{2,1,0} reshape(parameter.38) + concatenate.42 = f32[4,2,2]{2,1,0} concatenate(reshape.723, reshape.724), dimensions={2} + slice.351 = f32[4,1,2]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:2]} + reshape.1058 = f32[4,2]{1,0} reshape(slice.351) + slice.352 = f32[4,1]{1,0} slice(reshape.1058), slice={[0:4], [1:2]} + reshape.1059 = f32[4]{0} reshape(slice.352) + slice.353 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1060 = f32[4]{0} reshape(slice.353) + add.124 = f32[4]{0} add(reshape.1059, reshape.1060) + slice.354 = f32[4,1]{1,0} slice(reshape.1058), slice={[0:4], [0:1]} + reshape.1061 = f32[4]{0} reshape(slice.354) + slice.379 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1062 = f32[4]{0} reshape(slice.379) + add.89 = f32[4]{0} add(reshape.1061, reshape.1062) + subtract.126 = f32[4]{0} subtract(add.124, add.89) + is-finite.127 = pred[4]{0} is-finite(subtract.126) + not.128 = pred[4]{0} not(is-finite.127) + abs.129 = f32[4]{0} abs(subtract.126) + constant.130 = f32[] constant(inf) + broadcast.131 = f32[4]{0} broadcast(constant.130), dimensions={} + compare.132 = pred[4]{0} compare(abs.129, broadcast.131), direction=EQ, type=UNSIGNED + not.133 = pred[4]{0} not(compare.132) + and.134 = pred[4]{0} and(not.128, not.133) + add.135 = f32[4]{0} add(add.124, add.89) + maximum.125 = f32[4]{0} maximum(add.124, add.89) + abs.136 = f32[4]{0} abs(subtract.126) + negate.137 = f32[4]{0} negate(abs.136) + exponential.138 = f32[4]{0} exponential(negate.137) + log-plus-one.139 = f32[4]{0} log-plus-one(exponential.138) + add.140 = f32[4]{0} add(maximum.125, log-plus-one.139) + select.141 = f32[4]{0} select(and.134, add.135, add.140) + slice.356 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1064 = f32[4]{0} reshape(slice.356) + add.214 = f32[4]{0} add(select.141, reshape.1064) + slice.380 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1066 = f32[4]{0} reshape(slice.380) + add.179 = f32[4]{0} add(select.141, reshape.1066) + subtract.216 = f32[4]{0} subtract(add.214, add.179) + is-finite.217 = pred[4]{0} is-finite(subtract.216) + not.218 = pred[4]{0} not(is-finite.217) + abs.219 = f32[4]{0} abs(subtract.216) + constant.220 = f32[] constant(inf) + broadcast.221 = f32[4]{0} broadcast(constant.220), dimensions={} + compare.222 = pred[4]{0} compare(abs.219, broadcast.221), direction=EQ, type=UNSIGNED + not.223 = pred[4]{0} not(compare.222) + and.224 = pred[4]{0} and(not.218, not.223) + add.225 = f32[4]{0} add(add.214, add.179) + maximum.215 = f32[4]{0} maximum(add.214, add.179) + abs.226 = f32[4]{0} abs(subtract.216) + negate.227 = f32[4]{0} negate(abs.226) + exponential.228 = f32[4]{0} exponential(negate.227) + log-plus-one.229 = f32[4]{0} log-plus-one(exponential.228) + add.230 = f32[4]{0} add(maximum.215, log-plus-one.229) + select.231 = f32[4]{0} select(and.224, add.225, add.230) + slice.359 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1068 = f32[4]{0} reshape(slice.359) + add.304 = 
f32[4]{0} add(select.231, reshape.1068) + slice.381 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1070 = f32[4]{0} reshape(slice.381) + add.269 = f32[4]{0} add(select.231, reshape.1070) + subtract.306 = f32[4]{0} subtract(add.304, add.269) + is-finite.307 = pred[4]{0} is-finite(subtract.306) + not.308 = pred[4]{0} not(is-finite.307) + abs.309 = f32[4]{0} abs(subtract.306) + constant.310 = f32[] constant(inf) + broadcast.311 = f32[4]{0} broadcast(constant.310), dimensions={} + compare.312 = pred[4]{0} compare(abs.309, broadcast.311), direction=EQ, type=UNSIGNED + not.313 = pred[4]{0} not(compare.312) + and.314 = pred[4]{0} and(not.308, not.313) + add.315 = f32[4]{0} add(add.304, add.269) + maximum.305 = f32[4]{0} maximum(add.304, add.269) + abs.316 = f32[4]{0} abs(subtract.306) + negate.317 = f32[4]{0} negate(abs.316) + exponential.318 = f32[4]{0} exponential(negate.317) + log-plus-one.319 = f32[4]{0} log-plus-one(exponential.318) + add.320 = f32[4]{0} add(maximum.305, log-plus-one.319) + select.321 = f32[4]{0} select(and.314, add.315, add.320) + slice.362 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1072 = f32[4]{0} reshape(slice.362) + add.394 = f32[4]{0} add(select.321, reshape.1072) + slice.382 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1074 = f32[4]{0} reshape(slice.382) + add.359 = f32[4]{0} add(select.321, reshape.1074) + subtract.396 = f32[4]{0} subtract(add.394, add.359) + is-finite.397 = pred[4]{0} is-finite(subtract.396) + not.398 = pred[4]{0} not(is-finite.397) + abs.399 = f32[4]{0} abs(subtract.396) + constant.400 = f32[] constant(inf) + broadcast.401 = f32[4]{0} broadcast(constant.400), dimensions={} + compare.402 = pred[4]{0} compare(abs.399, broadcast.401), direction=EQ, type=UNSIGNED + not.403 = pred[4]{0} not(compare.402) + and.404 = pred[4]{0} and(not.398, not.403) + add.405 = f32[4]{0} add(add.394, add.359) + maximum.395 = f32[4]{0} maximum(add.394, add.359) + abs.406 = f32[4]{0} abs(subtract.396) + negate.407 = f32[4]{0} negate(abs.406) + exponential.408 = f32[4]{0} exponential(negate.407) + log-plus-one.409 = f32[4]{0} log-plus-one(exponential.408) + add.410 = f32[4]{0} add(maximum.395, log-plus-one.409) + select.411 = f32[4]{0} select(and.404, add.405, add.410) + slice.365 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1076 = f32[4]{0} reshape(slice.365) + add.484 = f32[4]{0} add(select.411, reshape.1076) + slice.383 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1078 = f32[4]{0} reshape(slice.383) + add.449 = f32[4]{0} add(select.411, reshape.1078) + subtract.486 = f32[4]{0} subtract(add.484, add.449) + is-finite.487 = pred[4]{0} is-finite(subtract.486) + not.488 = pred[4]{0} not(is-finite.487) + abs.489 = f32[4]{0} abs(subtract.486) + constant.490 = f32[] constant(inf) + broadcast.491 = f32[4]{0} broadcast(constant.490), dimensions={} + compare.492 = pred[4]{0} compare(abs.489, broadcast.491), direction=EQ, type=UNSIGNED + not.493 = pred[4]{0} not(compare.492) + and.494 = pred[4]{0} and(not.488, not.493) + add.495 = f32[4]{0} add(add.484, add.449) + maximum.485 = f32[4]{0} maximum(add.484, add.449) + abs.496 = f32[4]{0} abs(subtract.486) + negate.497 = f32[4]{0} negate(abs.496) + exponential.498 = f32[4]{0} exponential(negate.497) + log-plus-one.499 = f32[4]{0} log-plus-one(exponential.498) + add.500 = f32[4]{0} add(maximum.485, log-plus-one.499) + select.501 = f32[4]{0} select(and.494, 
add.495, add.500) + slice.368 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1080 = f32[4]{0} reshape(slice.368) + add.574 = f32[4]{0} add(select.501, reshape.1080) + slice.384 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1082 = f32[4]{0} reshape(slice.384) + add.539 = f32[4]{0} add(select.501, reshape.1082) + subtract.576 = f32[4]{0} subtract(add.574, add.539) + is-finite.577 = pred[4]{0} is-finite(subtract.576) + not.578 = pred[4]{0} not(is-finite.577) + abs.579 = f32[4]{0} abs(subtract.576) + constant.580 = f32[] constant(inf) + broadcast.581 = f32[4]{0} broadcast(constant.580), dimensions={} + compare.582 = pred[4]{0} compare(abs.579, broadcast.581), direction=EQ, type=UNSIGNED + not.583 = pred[4]{0} not(compare.582) + and.584 = pred[4]{0} and(not.578, not.583) + add.585 = f32[4]{0} add(add.574, add.539) + maximum.575 = f32[4]{0} maximum(add.574, add.539) + abs.586 = f32[4]{0} abs(subtract.576) + negate.587 = f32[4]{0} negate(abs.586) + exponential.588 = f32[4]{0} exponential(negate.587) + log-plus-one.589 = f32[4]{0} log-plus-one(exponential.588) + add.590 = f32[4]{0} add(maximum.575, log-plus-one.589) + select.591 = f32[4]{0} select(and.584, add.585, add.590) + slice.371 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1084 = f32[4]{0} reshape(slice.371) + add.664 = f32[4]{0} add(select.591, reshape.1084) + slice.385 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1086 = f32[4]{0} reshape(slice.385) + add.629 = f32[4]{0} add(select.591, reshape.1086) + subtract.666 = f32[4]{0} subtract(add.664, add.629) + is-finite.667 = pred[4]{0} is-finite(subtract.666) + not.668 = pred[4]{0} not(is-finite.667) + abs.669 = f32[4]{0} abs(subtract.666) + constant.670 = f32[] constant(inf) + broadcast.671 = f32[4]{0} broadcast(constant.670), dimensions={} + compare.672 = pred[4]{0} compare(abs.669, broadcast.671), direction=EQ, type=UNSIGNED + not.673 = pred[4]{0} not(compare.672) + and.674 = pred[4]{0} and(not.668, not.673) + add.675 = f32[4]{0} add(add.664, add.629) + maximum.665 = f32[4]{0} maximum(add.664, add.629) + abs.676 = f32[4]{0} abs(subtract.666) + negate.677 = f32[4]{0} negate(abs.676) + exponential.678 = f32[4]{0} exponential(negate.677) + log-plus-one.679 = f32[4]{0} log-plus-one(exponential.678) + add.680 = f32[4]{0} add(maximum.665, log-plus-one.679) + select.681 = f32[4]{0} select(and.674, add.675, add.680) + slice.374 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1088 = f32[4]{0} reshape(slice.374) + add.754 = f32[4]{0} add(select.681, reshape.1088) + slice.386 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1090 = f32[4]{0} reshape(slice.386) + add.719 = f32[4]{0} add(select.681, reshape.1090) + subtract.756 = f32[4]{0} subtract(add.754, add.719) + is-finite.757 = pred[4]{0} is-finite(subtract.756) + not.758 = pred[4]{0} not(is-finite.757) + abs.759 = f32[4]{0} abs(subtract.756) + constant.760 = f32[] constant(inf) + broadcast.761 = f32[4]{0} broadcast(constant.760), dimensions={} + compare.762 = pred[4]{0} compare(abs.759, broadcast.761), direction=EQ, type=UNSIGNED + not.763 = pred[4]{0} not(compare.762) + and.764 = pred[4]{0} and(not.758, not.763) + add.765 = f32[4]{0} add(add.754, add.719) + maximum.755 = f32[4]{0} maximum(add.754, add.719) + abs.766 = f32[4]{0} abs(subtract.756) + negate.767 = f32[4]{0} negate(abs.766) + exponential.768 = f32[4]{0} exponential(negate.767) + 
log-plus-one.769 = f32[4]{0} log-plus-one(exponential.768) + add.770 = f32[4]{0} add(maximum.755, log-plus-one.769) + select.771 = f32[4]{0} select(and.764, add.765, add.770) + slice.377 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [1:2]} + reshape.1092 = f32[4]{0} reshape(slice.377) + add.844 = f32[4]{0} add(select.771, reshape.1092) + slice.387 = f32[4,1,1]{2,1,0} slice(concatenate.42), slice={[0:4], [0:1], [0:1]} + reshape.1094 = f32[4]{0} reshape(slice.387) + add.809 = f32[4]{0} add(select.771, reshape.1094) + subtract.846 = f32[4]{0} subtract(add.844, add.809) + is-finite.847 = pred[4]{0} is-finite(subtract.846) + not.848 = pred[4]{0} not(is-finite.847) + abs.849 = f32[4]{0} abs(subtract.846) + constant.850 = f32[] constant(inf) + broadcast.851 = f32[4]{0} broadcast(constant.850), dimensions={} + compare.852 = pred[4]{0} compare(abs.849, broadcast.851), direction=EQ, type=UNSIGNED + not.853 = pred[4]{0} not(compare.852) + and.854 = pred[4]{0} and(not.848, not.853) + add.855 = f32[4]{0} add(add.844, add.809) + maximum.845 = f32[4]{0} maximum(add.844, add.809) + abs.856 = f32[4]{0} abs(subtract.846) + negate.857 = f32[4]{0} negate(abs.856) + exponential.858 = f32[4]{0} exponential(negate.857) + log-plus-one.859 = f32[4]{0} log-plus-one(exponential.858) + add.860 = f32[4]{0} add(maximum.845, log-plus-one.859) + select.861 = f32[4]{0} select(and.854, add.855, add.860) + constant.865 = f32[] constant(0) + reduce.2 = f32[] reduce(select.861, constant.865), dimensions={0}, to_apply=primitive_computation_add.866 + reduce.3 = f32[] reduce(select.861, constant.865), dimensions={0}, to_apply=primitive_computation_add.866 + add.77 = f32[] add(reduce.2, reduce.3) + constant.719 = f32[] constant(0.125) + multiply = f32[] multiply(add.77, constant.719) + ROOT tuple.873 = (f32[]) tuple(multiply) +})") + .ConsumeValueOrDie(); + auto input_array = absl::make_unique>(4, 2); + input_array->FillUnique(1.0f); + auto input = LiteralUtil::CreateR2FromArray2D(*input_array); + EXPECT_TRUE(RunAndCompare(std::move(module), {&input}, absl::nullopt)); +} + // Describes a binary rank-2 concatenation test. struct R2BinarySpec { int64 lhs_dim0; @@ -578,7 +825,7 @@ XLA_TEST_F(ConcatTest, ConcatOperandsOfSameOperand) { {x_data.get(), y_data.get()}, ErrorSpec(1e-4)); } -// Test that the HLO optimization to replace a concat of a bradcasted scalar +// Test that the HLO optimization to replace a concat of a broadcasted scalar // produces the correct result in rank 1. XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) { auto f32_scalar = ShapeUtil::MakeShape(xla::F32, {}); @@ -604,7 +851,7 @@ XLA_TEST_F(ConcatTest, ConcatBroadcastArgument) { {x_data.get(), y_data.get(), z_data.get()}, ErrorSpec(1e-4)); } -// Test that the HLO optimization to replace a concat of a bradcasted scalar +// Test that the HLO optimization to replace a concat of a broadcasted scalar // produces the correct result in rank 3 with both high and low padding in // different dimensions. 
XLA_TEST_F(ConcatTest, ConcatBroadcastArgumentR3) { diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc index ff7e7955876..4a7070a32f3 100644 --- a/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc +++ b/tensorflow/compiler/xla/tests/conv_depthwise_backprop_filter_test.cc @@ -45,13 +45,20 @@ class BatchGroupedConvolution2DTest public ::testing::WithParamInterface< ::testing::tuple> {}; -static std::vector GetConv2DTestCases() { +class BatchGroupedConvolution2DDepthTest + : public HloTestBase, + public ::testing::WithParamInterface< + ::testing::tuple> {}; + +static std::vector GetConv2DTestCases( + bool use_depth_multiplier) { std::vector config_set; std::vector> config_options = { - {8, 5, 3, 2}, {4, 5, 5, 2}, {8, 7, 4, 128}, - {16, 20, 20, 256}, {256, 7, 5, 4}, {256, 6, 6, 4}, - {256, 8, 8, 512}, {64, 7, 7, 960}, {64, 14, 14, 576}}; + {129, 10, 3, 2}, {4, 3, 3, 258}, {8, 4, 2, 128}, + {8, 3, 2, 256}, {256, 7, 5, 4}, {128, 6, 6, 4}, + {32, 5, 2, 129}, {16, 4, 3, 2}, {16, 3, 2, 64}}; + int64 counter = 2; for (auto option : config_options) { int64 feature = option[3]; int64 activation_size = option[1]; @@ -65,10 +72,16 @@ static std::vector GetConv2DTestCases() { config.activation_dims = {batch, activation_size, activation_size, feature}; - config.kernel_dims = {batch, kernel_size, kernel_size, feature}; - + const int64 depthwise_multiplier = use_depth_multiplier ? counter++ : 1; + config.kernel_dims = {batch, kernel_size, kernel_size, + feature * depthwise_multiplier}; + // Don't let the counter grow too much, else the compute demand will grow. + if (counter == 4) { + counter = 2; + } int64 output_space_size = 3 + activation_size - kernel_size; - config.output_dims = {output_space_size, output_space_size, feature, 1}; + config.output_dims = {output_space_size, output_space_size, + feature * depthwise_multiplier, 1}; config.activation_and_kernel_layout = {0, 3, 1, 2}; config.output_layout = {2, 3, 0, 1}; @@ -123,11 +136,13 @@ string BatchGroupedConvolution2DTestDataToString( } string BuildHloTextBatchGroupedConvolution2D( - const BatchGroupedConvolution2DSpec& spec, bool use_bfloat16) { + const BatchGroupedConvolution2DSpec& spec, bool use_bfloat16, + bool scheduled = false) { const string data_type = GetFloatDataType(use_bfloat16); + const string scheduled_tag = scheduled ? 
",is_scheduled=true" : ""; return absl::StrFormat( R"( - HloModule TensorFlowDepthwiseConv, is_scheduled=true + HloModule TensorFlowDepthwiseConv %s ENTRY main { activation = %s[%s]{%s} parameter(0) @@ -137,7 +152,7 @@ string BuildHloTextBatchGroupedConvolution2D( batch_group_count=%d } )", - data_type, absl::StrJoin(spec.activation_dims, ","), + scheduled_tag, data_type, absl::StrJoin(spec.activation_dims, ","), absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type, absl::StrJoin(spec.kernel_dims, ","), absl::StrJoin(spec.activation_and_kernel_layout, ","), data_type, @@ -161,23 +176,26 @@ XLA_TEST_P(BatchGroupedConvolution2DTest, DoIt) { } #endif - const string hlo_text = - BuildHloTextBatchGroupedConvolution2D(spec, use_bfloat16); + const string hlo_text = BuildHloTextBatchGroupedConvolution2D( + spec, use_bfloat16, /*scheduled=*/false); - EXPECT_TRUE(RunAndCompareNoHloPasses( - hlo_text, ErrorSpec{0.01, 0.01}, [](HloModule* module) -> Status { - BFloat16MixedPrecisionRemoval remover; - TF_RETURN_IF_ERROR(remover.Run(module).status()); - Despecializer despecializer; - return despecializer.Run(module).status(); - })); + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01})); } INSTANTIATE_TEST_CASE_P( BatchGroupedConvolution2DTestWithRandomIndices, BatchGroupedConvolution2DTest, - ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()), - ::testing::Bool()), + ::testing::Combine( + ::testing::ValuesIn(GetConv2DTestCases(/*use_depth_multiplier=*/false)), + ::testing::Bool()), + BatchGroupedConvolution2DTestDataToString); + +INSTANTIATE_TEST_CASE_P( + BatchGroupedConvolution2DDepthMultiplierTestWithRandomIndices, + BatchGroupedConvolution2DTest, + ::testing::Combine( + ::testing::ValuesIn(GetConv2DTestCases(/*use_depth_multiplier=*/true)), + ::testing::Bool()), BatchGroupedConvolution2DTestDataToString); } // namespace diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 60ba27b2050..e06e2972f1c 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -69,12 +69,14 @@ XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) { XlaBuilder builder(TestName()); XlaOp param; - auto param_data = CreateParameterAndTransferLiteral( - 0, - LiteralUtil::MakeTupleFromSlices( - {LiteralUtil::CreateR2({{1, 2}, {3, 4}}), - LiteralUtil::CreateR2({{5, 6}, {7, 8}})}), - "arg0", &builder, ¶m); + TF_ASSERT_OK_AND_ASSIGN( + auto param_data, + CreateParameterAndTransferLiteral( + 0, + LiteralUtil::MakeTupleFromSlices( + {LiteralUtil::CreateR2({{1, 2}, {3, 4}}), + LiteralUtil::CreateR2({{5, 6}, {7, 8}})}), + "arg0", &builder, ¶m)); auto lhs = GetTupleElement(param, 0); auto rhs = GetTupleElement(param, 1); Dot(lhs, rhs); diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc index dca8e31e792..f884bb9c0e0 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_16_bit_test.cc @@ -123,7 +123,7 @@ BINARY_TEST_16BIT(Min, { }) // TODO(bixia): Pow fails with bfloat16 on CPU. -BINARY_TEST_16BIT(DISABLED_ON_CPU(Pow), { +BINARY_TEST_16BIT(DISABLED_ON_GPU(DISABLED_ON_CPU(Pow)), { // See b/162664705. 
known_incorrect_fn_ = [](int64 val) { Eigen::bfloat16 f; diff --git a/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc b/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc index 14d3b343b6c..c6feedf9e7f 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_binary_test_f32_f64.cc @@ -114,6 +114,10 @@ BINARY_TEST_FLOAT_32(Min, { // // TODO(bixia): Need to investigate the failure on CPU and file bugs. BINARY_TEST_FLOAT_32(DISABLED_ON_CPU(AbsComplex), { + // TODO(timshen): see b/162664705. + known_incorrect_fn_ = [this](int64 val) { + return std::isnan(this->ConvertValue(val)); + }; auto host_abs_complex = [](float x, float y) { return std::abs(std::complex(x, y)); }; @@ -198,6 +202,10 @@ BINARY_TEST_FLOAT_64(Min, { // TODO(bixia): Need to investigate the failure on CPU and file bugs. BINARY_TEST_FLOAT_64(DISABLED_ON_CPU(AbsComplex), { + // TODO(timshen): see b/162664705. + known_incorrect_fn_ = [this](int64 val) { + return std::isnan(this->ConvertValue(val)); + }; auto host_abs_complex = [](double x, double y) { return std::abs(std::complex(x, y)); }; diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc index b361bf94a6d..6a638d2106f 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test_complex.cc @@ -97,6 +97,10 @@ using ExhaustiveC128UnaryTest = ExhaustiveComplexUnaryTestBase; // TODO(b/138578594): Enable the test for the CPU backend after fixing the bug. UNARY_TEST_COMPLEX_64(DISABLED_ON_CPU(Log), { + // TODO(timshen): see b/162664705. + known_incorrect_fn_ = [this](int64 val) { + return std::isnan(this->ConvertValue(val)); + }; Run(Log, [](complex64 x) { return std::log(x); }); }) diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 663e7d81006..6c062deb363 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -414,6 +414,47 @@ StatusOr<::testing::AssertionResult> HloTestBase::RunAndCompareInternal( : ::testing::AssertionFailure() << output.status().error_message(); } +::testing::AssertionResult HloTestBase::RunReplicated(string_view hlo_string, + bool run_hlo_passes, + int64 num_replicas, + string backend_config) { + auto module_or_status = + ParseAndReturnVerifiedModule(hlo_string, num_replicas); + if (!module_or_status.ok()) { + return ::testing::AssertionFailure() + << "Error while parsing HLO text format: " + << module_or_status.status().ToString(); + } + + std::unique_ptr module = std::move(module_or_status.ValueOrDie()); + const auto& fake_arguments = + MakeFakeArguments(module.get()).ConsumeValueOrDie(); + std::vector fake_argument_ptrs; + absl::c_transform( + fake_arguments, std::back_inserter(fake_argument_ptrs), + [](const Literal& literal) { return const_cast(&literal); }); + + if (!backend_config.empty()) { + // Set backend configuration if it is given. 
+ HloInstruction* instruction = + module->entry_computation()->root_instruction(); + instruction->set_raw_backend_config_string(backend_config); + } + + HloRunner::ReplicatedExecuteOptions options; + options.num_replicas = num_replicas; + options.run_hlo_passes = run_hlo_passes; + options.use_threads = true; + for (auto argument : fake_argument_ptrs) { + options.arguments.push_back(argument); + } + auto output = test_runner_.ExecuteReplicated(std::move(module), options); + + return output.ok() + ? ::testing::AssertionSuccess() + : ::testing::AssertionFailure() << output.status().error_message(); +} + ::testing::AssertionResult HloTestBase::RunMultipleTimes( string_view hlo_string, bool run_hlo_passes, std::vector* profiles, string backend_config, diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index fc680e39682..e15c1dd5f55 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -234,6 +234,11 @@ class HloTestBase : public ManifestCheckingTest { ExecutionProfile* profile = nullptr, string backend_config = "") TF_MUST_USE_RESULT; + // Executes an hlo module with fake inputs on multiple replicas. + ::testing::AssertionResult RunReplicated( + const absl::string_view hlo_string, bool run_hlo_passes = true, + int64 num_replicas = 1, string backend_config = "") TF_MUST_USE_RESULT; + // If assert_determinism is true, the assertion will fail unless all runs // produce exactly the same output. ::testing::AssertionResult RunMultipleTimes( diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 201c0da87f1..1a95f2fb549 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -122,7 +122,7 @@ XLA_TEST_F(PrngTest, DISABLED_ON_INTERPRETER(DISABLED_ON_GPU( bfloat16 interval = static_cast(0.25); std::vector counts(static_cast((high - low) / interval), 0); - constexpr int64 count = 100; + constexpr int64 count = 1000; for (int64 seed = 0; seed < count; ++seed) { auto result = UniformTest(low, high, {}, /*seed=*/seed); result.EachCell([&](absl::Span, bfloat16 value) { diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index b209669715e..7e5b699d5e2 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -365,8 +365,9 @@ XLA_TEST_P(ReduceWindowTest, R4UnitWindow) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({0, 3, 2, 1})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); Padding padding = Padding::kSame; ReduceWindowAdd(input, {1, 1, 7, 1}, {1, 4, 1, 1}, padding); @@ -423,8 +424,9 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorStride) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); int win_len = 1; int stride = 8; @@ -444,8 +446,9 @@ XLA_TEST_P(ReduceWindowTest, 
R4SecondMinorUnitStride) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); int win_len = 3; int stride = 1; @@ -465,8 +468,9 @@ XLA_TEST_P(ReduceWindowTest, R4SecondMinorWin) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp input; - auto input_data = CreateParameterAndTransferLiteral( - 0, input_literal, "parameter", &builder_, &input); + TF_ASSERT_OK_AND_ASSIGN( + auto input_data, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder_, &input)); int win_len = 8; int stride = 5; @@ -631,8 +635,9 @@ class R4ReduceWindowTest : public ReduceWindowTestBase, Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; - auto input_arg = CreateParameterAndTransferLiteral(0, input_literal, "p0", - &b, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_arg, + CreateParameterAndTransferLiteral( + 0, input_literal, "p0", &b, ¶meter)); std::vector> padding(4); for (int i = 0; i < 4; ++i) { @@ -1243,7 +1248,9 @@ class R2ReduceWindowTest : public ReduceWindowTestBase, input, LayoutUtil::MakeLayout(param.layout)); XlaOp parameter; - CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, ¶meter); + TF_ASSERT_OK(CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, + ¶meter) + .status()); std::vector> padding(2); for (int i = 0; i < 2; ++i) { @@ -1443,8 +1450,9 @@ XLA_TEST_P(R1ReduceWindowTest, DoIt) { Literal input_literal = LiteralUtil::CreateR1(absl::Span(input_vector)); XlaOp parameter; - auto input_arg = - CreateParameterAndTransferLiteral(0, input_literal, "p0", &b, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input_arg, CreateParameterAndTransferLiteral(0, input_literal, "p0", + &b, ¶meter)); std::vector> padding(1); padding[0] = {param.pad_low[0], param.pad_high[0]}; diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc index 298136002e9..890156cc650 100644 --- a/tensorflow/compiler/xla/tests/reshape_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_test.cc @@ -57,8 +57,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivial1x1) { input_array.Fill(1.0f); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({1.0f}); @@ -70,8 +71,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1EmptyDims) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{}); auto expected_literal = LiteralUtil::CreateR1({1.0f}); @@ -83,8 +85,9 @@ XLA_TEST_P(ReshapeTest, CollapseTrivialR1OnlyDim) { XlaBuilder 
builder(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0}); auto expected_literal = LiteralUtil::CreateR1({1.0f}); @@ -99,8 +102,9 @@ XLA_TEST_P(ReshapeTest, SingleElementArrayToScalar) { input_array.Fill(1.0f); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "parameter", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral( + 0, input_literal, "parameter", &builder, ¶meter)); auto reshape = Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{}); auto new_shape = builder.GetShape(reshape).ConsumeValueOrDie(); @@ -115,8 +119,9 @@ XLA_TEST_P(ReshapeTest, ScalarToSingleElementArray) { Literal param0_literal = LiteralUtil::CreateR0(1.0f); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, param0_literal, "param0", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, param0_literal, "param0", + &builder, ¶meter)); auto a = Neg(parameter); Reshape(/*operand=*/a, /*dimensions=*/{}, /*new_sizes=*/{1}); @@ -130,8 +135,9 @@ XLA_TEST_P(ReshapeTest, Trivial0x3) { Array2D input_array(0, 3); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -144,8 +150,9 @@ XLA_TEST_P(ReshapeTest, Trivial0x3WithParameter) { Literal param0_literal = LiteralUtil::CreateR2FromArray2D(Array2D(0, 3)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, param0_literal, "param0", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, param0_literal, "param0", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -157,8 +164,9 @@ XLA_TEST_P(ReshapeTest, Trivial3x0) { Array2D input_array(3, 0); auto input_literal = LiteralUtil::CreateR2FromArray2D(input_array); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -170,8 +178,9 @@ XLA_TEST_P(ReshapeTest, Trivial1x3) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR2({{1.0f, 2.0f, 3.0f}}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); 
Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -183,8 +192,9 @@ XLA_TEST_P(ReshapeTest, Trivial3x1) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR2({{1.0f}, {2.0f}, {3.0f}}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{0, 1}); auto expected_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -196,8 +206,9 @@ XLA_TEST_P(ReshapeTest, R1ToR2_0_To_2x0) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateR1({}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0}, /*new_sizes=*/{2, 0}); auto expected_literal = LiteralUtil::CreateR2({{}, {}}); @@ -211,8 +222,9 @@ XLA_TEST_P(ReshapeTest, R1ToR2_6_To_2x3) { auto input_literal = LiteralUtil::CreateR1({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0}, /*new_sizes=*/{2, 3}); auto expected_literal = @@ -226,8 +238,9 @@ XLA_TEST_P(ReshapeTest, Reshape0x2To2x0) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 2)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{2, 0}); auto expected_literal = LiteralUtil::CreateR2({{}, {}}); @@ -241,8 +254,9 @@ XLA_TEST_P(ReshapeTest, ReshapeRowToCol) { auto simple = MakeLinspaceArray2D(1.0f, 3.0f, 1, 3); auto input_literal = LiteralUtil::CreateFromArray(*simple); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 1}); @@ -258,8 +272,9 @@ XLA_TEST_P(ReshapeTest, TransposeAsReshape) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 4}); @@ -274,8 +289,9 @@ XLA_TEST_P(ReshapeTest, Transpose0x4) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 4)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, 
¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Transpose(parameter, {1, 0}); auto expected_literal = LiteralUtil::CreateR2({{}, {}, {}, {}}); ComputeAndCompareLiteral(&builder, expected_literal, {input.get()}, @@ -288,8 +304,9 @@ XLA_TEST_P(ReshapeTest, Transpose4x3) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Transpose(parameter, {1, 0}); auto expected = ReferenceUtil::TransposeArray2D(*a4x3); @@ -304,8 +321,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffleZeroElements) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(6, 0)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{2, 3, 0, 0}); auto expected_literal = @@ -318,8 +336,9 @@ XLA_TEST_P(ReshapeTest, ReshapeR4ToR2ZeroElements) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array4D(2, 3, 4, 0)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{24, 0}); auto expected_literal = LiteralUtil::CreateFromArray(Array2D(24, 0)); @@ -334,8 +353,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitNoShuffle) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{2, 6}); @@ -349,8 +369,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffleZeroElements) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(Array2D(0, 6)); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 0}); auto expected_literal = LiteralUtil::CreateFromArray(Array2D(3, 0)); @@ -365,8 +386,9 @@ XLA_TEST_P(ReshapeTest, ReshapeSplitAndShuffle) { auto a4x3 = MakeLinspaceArray2D(1.0f, 12.0f, 4, 3); auto input_literal = LiteralUtil::CreateFromArray(*a4x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{2, 6}); Array2D expected({{1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f}, @@ -391,8 +413,9 @@ XLA_TEST_P(ReshapeTest, 
DocR3_R1_Collapse_012) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, /*new_sizes=*/{24}); auto expected_literal = LiteralUtil::CreateR1( @@ -406,8 +429,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_012_Refine_83) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2}, /*new_sizes=*/{8, 3}); auto expected_literal = LiteralUtil::CreateR2({{10, 11, 12}, @@ -426,8 +450,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R1_Collapse_120) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, /*new_sizes=*/{24}); auto expected_literal = LiteralUtil::CreateR1( @@ -441,8 +466,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R2_Collapse_120_Refine_83) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, /*new_sizes=*/{8, 3}); auto expected_literal = LiteralUtil::CreateR2({{10, 20, 30}, @@ -461,8 +487,9 @@ XLA_TEST_P(ReshapeTest, DocR3_R3_Collapse_120_Refine_262) { XlaBuilder builder(TestName()); auto input_literal = LiteralUtil::CreateFromArray(ArrayForDocR3Tests()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{1, 2, 0}, /*new_sizes=*/{2, 6, 2}); auto expected_literal = LiteralUtil::CreateR3( @@ -494,8 +521,9 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapse) { t2x2x2x3.FillWithYX(*filler2x3); auto input_literal = LiteralUtil::CreateFromArray(t2x2x2x3); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Collapse(/*operand=*/parameter, /*dimensions=*/{1, 2, 3}); auto expected_literal = LiteralUtil::CreateR2( {{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, @@ -519,8 +547,9 @@ XLA_TEST_P(ReshapeTest, FullyConnectedCollapseDesugared) { t(1, 0, 1, 1) = 7; auto input_literal = LiteralUtil::CreateFromArray(t); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, 
¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(/*operand=*/parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 4}); @@ -542,8 +571,9 @@ XLA_TEST_P(ReshapeTest, ToScalar) { input_literal.Set(zeros, 83.0f); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &b, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &b, ¶meter)); Reshape(parameter, dimensions, {}); auto expected_literal = LiteralUtil::CreateR0(83.0f); @@ -556,8 +586,9 @@ XLA_TEST_P(ReshapeTest, BadDimensions) { XlaBuilder b(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", &b, - ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &b, ¶meter)); Reshape(parameter, {}, {}); EXPECT_THAT( ExecuteToString(&b, {}), @@ -568,8 +599,9 @@ XLA_TEST_P(ReshapeTest, BadNewSizes) { XlaBuilder b(TestName()); auto input_literal = LiteralUtil::CreateR1({1.0f, 2.0f}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", &b, - ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &b, ¶meter)); Reshape(parameter, {1}, {}); EXPECT_THAT(ExecuteToString(&b, {}), ::testing::HasSubstr("mismatched element counts")); @@ -604,8 +636,9 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { LayoutUtil::MakeLayout({0, 1, 2, 3})); // clang-format on XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 8}); @@ -639,8 +672,9 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) { {200, 201, 202, 203, 204, 205, 206, 207}, }); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1}, /*new_sizes=*/{3, 2, 1, 4}); // clang-format off @@ -666,8 +700,9 @@ XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4_Dimensions_10) { {200, 201, 202, 203, 204, 205, 206, 207}, }); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, input_literal, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{1, 0}, /*new_sizes=*/{3, 2, 1, 4}); // clang-format off @@ -694,8 +729,9 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1}); Literal expected = LiteralUtil::ReshapeSlice({2, 1}, {1, 0}, input_literal); @@ -713,8 +749,9 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) { Literal 
input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2}); Literal expected = LiteralUtil::ReshapeSlice({4, 2}, {1, 0}, input_literal); @@ -733,8 +770,9 @@ XLA_TEST_P(ReshapeTest, R4ToR2_5x10x2x3_To_5x60_Dimensions_0213) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 2, 1, 3}, /*new_sizes=*/{5, 60}); @@ -759,8 +797,9 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { Literal input_literal = LiteralUtil::CreateR4FromArray4DWithLayout( input_array, LayoutUtil::MakeLayout({1, 2, 3, 0})); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{3, 0, 1, 2}, /*new_sizes=*/{7, 2, 3, 5}); XlaComputation computation = builder.Build().ConsumeValueOrDie(); @@ -793,8 +832,9 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape_Trivial) { {{13, 14, 15, 16}, {17, 18, 19, 20}, {21, 22, 23, 24}}}}); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{1, 2, 3, 4}); @@ -808,8 +848,9 @@ XLA_TEST_P(ReshapeTest, R4ToR4Reshape) { XlaBuilder builder(TestName()); XlaOp parameter; - auto input = CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN( + auto input, CreateParameterAndTransferLiteral(0, literal_1x2x3x4, "input", + &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{1, 3, 2, 0}, /*new_sizes=*/{2, 4, 3, 1}); @@ -840,8 +881,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -867,8 +909,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -894,8 +937,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); 
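// Note on the layout arguments used throughout these R4 tests:
// LayoutUtil::MakeLayout takes the dimension order from minor to major, so
// {3, 2, 1, 0} is the default row-major layout for a rank-4 array, while
// {0, 1, 2, 3} (used in R4TwoMinorTransposeTrivialR2 below) is fully
// column-major and {1, 2, 3, 0} (used in NoopReshape above) is a mixed layout.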
XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -922,8 +966,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) { input, LayoutUtil::MakeLayout({3, 2, 1, 0})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{0, 1, 3, 2}, /*new_sizes=*/new_bounds); @@ -949,8 +994,9 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) { input, LayoutUtil::MakeLayout({0, 1, 2, 3})); XlaBuilder builder(TestName()); XlaOp parameter; - auto input_data = CreateParameterAndTransferLiteral(0, input_literal, "input", - &builder, ¶meter); + TF_ASSERT_OK_AND_ASSIGN(auto input_data, + CreateParameterAndTransferLiteral( + 0, input_literal, "input", &builder, ¶meter)); Reshape(parameter, /*dimensions=*/{1, 0, 2, 3}, /*new_sizes=*/new_bounds); diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index fc1ca7d3105..aa02deb7bca 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -1,5 +1,7 @@ # Tools and utilities that aid in XLA development and usage. +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_cuda_or_rocm", @@ -264,7 +266,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", "@com_google_absl//absl/strings", "@com_google_protobuf//:protobuf_headers", ], diff --git a/tensorflow/compiler/jit/union_find.h b/tensorflow/compiler/xla/union_find.h similarity index 100% rename from tensorflow/compiler/jit/union_find.h rename to tensorflow/compiler/xla/union_find.h diff --git a/tensorflow/compiler/xla/util.cc b/tensorflow/compiler/xla/util.cc index 4034e5fdd27..6e7deda13f0 100644 --- a/tensorflow/compiler/xla/util.cc +++ b/tensorflow/compiler/xla/util.cc @@ -374,7 +374,9 @@ std::pair SplitF64ToF32(double x) { // Only values within the range of F32 are supported, unless it is infinity. // Small values with large negative exponents would be rounded to zero. - CHECK(std::isfinite(x_f32)) << x; + if (!std::isfinite(x_f32)) { + LOG(WARNING) << "Out of range F64 constant detected: " << x; + } // The high float is simply the double rounded to the nearest float. Because // we are rounding to nearest with ties to even, the error introduced in diff --git a/tensorflow/compiler/xla/util_test.cc b/tensorflow/compiler/xla/util_test.cc index 69acc59d8a2..5477dfba18d 100644 --- a/tensorflow/compiler/xla/util_test.cc +++ b/tensorflow/compiler/xla/util_test.cc @@ -15,10 +15,12 @@ limitations under the License. 
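The util.cc hunk above softens SplitF64ToF32's finiteness check from a fatal CHECK to a warning, so an F64 constant that falls outside the representable F32 range now logs instead of crashing. For orientation, here is a rough standalone sketch of the kind of high/low split the function performs (an illustration only, not the actual implementation, which lives in util.cc and now takes the warning path shown above):

    #include <cmath>
    #include <utility>

    // Splits a double into a (high, low) float pair with hi + lo ~= x.
    std::pair<float, float> SplitF64ToF32Sketch(double x) {
      // hi is x rounded to the nearest float (ties to even).
      const float hi = static_cast<float>(x);
      // If x overflows the float range, hi becomes +/-inf; keep lo at zero in
      // that case so the pair stays well defined.
      const float lo = std::isfinite(hi) ? static_cast<float>(x - hi) : 0.0f;
      return {hi, lo};
    }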
#include "tensorflow/compiler/xla/util.h" +#include #include #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/bfloat16.h" namespace xla { namespace { @@ -103,5 +105,26 @@ TEST(UtilTest, SanitizeFileName) { EXPECT_EQ(SanitizeFileName("/A\\B[C]"), "_A_B_C_"); } +TEST(UtilTest, RoundTripFpToString) { + EXPECT_EQ(RoundTripFpToString(std::numeric_limits::quiet_NaN()), + "nan"); + EXPECT_EQ(RoundTripFpToString(-std::numeric_limits::quiet_NaN()), + "-nan"); + EXPECT_EQ(RoundTripFpToString( + std::numeric_limits::quiet_NaN()), + "nan"); + EXPECT_EQ(RoundTripFpToString( + -std::numeric_limits::quiet_NaN()), + "-nan"); + EXPECT_EQ(RoundTripFpToString(std::numeric_limits::quiet_NaN()), + "nan"); + EXPECT_EQ(RoundTripFpToString(-std::numeric_limits::quiet_NaN()), + "-nan"); + EXPECT_EQ(RoundTripFpToString(std::numeric_limits::quiet_NaN()), + "nan"); + EXPECT_EQ(RoundTripFpToString(-std::numeric_limits::quiet_NaN()), + "-nan"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index d334f879c3e..7da8d2cb84d 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -245,6 +245,13 @@ message ComputationStats { double transcendental_count = 2; } +// The type optimization profiles in use. +enum ProfileType { + INVALID = 0; + WINDOW = 1; + FLAG = 2; +} + // Symbolization metadata for HLO Instructions. // // This metadata is used for debugging XLA code generation, as well as @@ -268,6 +275,8 @@ message OpMetadata { // e.g. it could be the file and line of user code that generated the op. string source_file = 3; int32 source_line = 4; + + repeated ProfileType profile_type = 5; } // Profile data from the execution of a computation. @@ -691,3 +700,11 @@ message WhileLoopBackendConfig { // unknown-trip-count. KnownTripCount known_trip_count = 1; } + +// Specifies a pair of output/operand buffers for kCustomCall that alias each +// other. 
+message CustomCallOutputOperandAliasing { + repeated int64 output_shape_index = 1; + int64 operand_index = 2; + repeated int64 operand_shape_index = 3; +} diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD index 172a970d207..1b699e7d8df 100644 --- a/tensorflow/compiler/xrt/BUILD +++ b/tensorflow/compiler/xrt/BUILD @@ -1,15 +1,15 @@ # Description: Operations defined for XRT +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", - "tf_gen_op_libs", "tf_gen_op_wrapper_py", ) +load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") load( "//tensorflow/core/platform:build_config.bzl", - "tf_proto_library_cc", - "tf_proto_library_py", + "tf_proto_library", ) package( @@ -20,7 +20,7 @@ package( licenses = ["notice"], # Apache 2.0 ) -tf_proto_library_cc( +tf_proto_library( name = "xrt_proto", srcs = ["xrt.proto"], cc_api_version = 2, @@ -33,12 +33,6 @@ tf_proto_library_cc( visibility = ["//visibility:public"], ) -tf_proto_library_py( - name = "xrt_proto", # bzl adds a _py suffix - srcs = ["xrt.proto"], - visibility = ["//visibility:public"], -) - cc_library( name = "xrt_utils", srcs = [ @@ -80,7 +74,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:regexp_internal", + "//tensorflow/core/platform:regexp", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor", "//tensorflow/stream_executor:device_memory_allocator", diff --git a/tensorflow/compiler/xrt/cc/BUILD b/tensorflow/compiler/xrt/cc/BUILD index 99ab50c8a8d..c8932150cb5 100644 --- a/tensorflow/compiler/xrt/cc/BUILD +++ b/tensorflow/compiler/xrt/cc/BUILD @@ -1,7 +1,4 @@ -load( - "//tensorflow:tensorflow.bzl", - "tf_gen_op_wrappers_cc", -) +load("//tensorflow:tensorflow.bzl", "tf_gen_op_wrappers_cc") package( default_visibility = ["//visibility:public"], diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD index 494ba29e981..68c24f88703 100644 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ b/tensorflow/compiler/xrt/kernels/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") + package( default_visibility = [ "//learning/brain:__subpackages__", diff --git a/tensorflow/compiler/xrt/tests/BUILD b/tensorflow/compiler/xrt/tests/BUILD index 2f1faf1cdf1..724cfe38d54 100644 --- a/tensorflow/compiler/xrt/tests/BUILD +++ b/tensorflow/compiler/xrt/tests/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_cuda_cc_test") load( "//tensorflow/core/platform:build_config_root.bzl", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 935d8840831..adc59c67dce 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -46,12 +46,9 @@ # # Public mobile targets, e.g. for Android: # -# filegroup ":android_proto_srcs" - Protos -# filegroup ":android_srcs" - Core sources # cc_library ":portable_tensorflow_lib" - Native library # cc_library ":portable_tensorflow_lib_lite" - Native library, without ops, # supporting SELECTIVE_REGISTRATION feature. 
-# portable_proto_library ":portable_proto_lib" (Google-internal) # # Note that :framework and :lib have incomplete transitive dependencies (they # declare but do not define some symbols) if framework_shared_object=True @@ -65,15 +62,13 @@ load( "//tensorflow:tensorflow.bzl", - "cc_header_only_library", "if_android", "if_chromiumos", "if_cuda_or_rocm", "if_ios", + "if_libtpu", "if_mobile", "if_not_windows", - "if_tpu", - "tf_android_core_proto_headers", "tf_cc_test", "tf_cc_test_mkl", "tf_cc_tests", @@ -81,28 +76,28 @@ load( "tf_cuda_library", "tf_defines_nortti_if_lite_protos", "tf_features_nomodules_if_mobile", - "tf_gen_op_libs", - "tf_genrule_cmd_append_to_srcs", "tf_opts_nortti_if_lite_protos", - "tf_portable_full_lite_protos", "transitive_hdrs", ) +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "cc_header_only_library") + # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "if_nccl") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tensorflow_opensource_extra_deps") -# buildifier: disable=same-origin-load -load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") - # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_monitoring_framework_deps") +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "filegroup") + # For platform specific build config load( "//tensorflow/core/platform:build_config.bzl", @@ -117,6 +112,7 @@ load( "tf_protos_all_impl", "tf_protos_grappler_impl", "tf_protos_profiler_impl", + "tf_tpu_dependencies", ) load( "//tensorflow/core/platform:rules_cc.bzl", @@ -135,7 +131,6 @@ load( "if_mkl", "mkl_deps", ) -# Placeholder for Google-internal load statements. package( default_visibility = [ @@ -159,14 +154,8 @@ package_group( # Export the BUILD file so automated tooling can check licenses exports_files([ "BUILD", - "ops/ops.pbtxt", ]) -package_group( - name = "experimental_access", - packages = ["//tensorflow/core/common_runtime/..."], -) - # Authorized users go here. package_group(name = "friends") @@ -177,7 +166,6 @@ package_group(name = "friends") # # Note that some protos are in neither additional_core_proto_srcs nor this # filegroup; e.g. ones with individual proto_library targets. 
-# LINT.IfChange COMMON_PROTO_SRCS = [ "//tensorflow/core/protobuf:bfc_memory_map.proto", "//tensorflow/core/protobuf:config.proto", @@ -240,7 +228,6 @@ ERROR_CODES_PROTO_SRCS = [ "//tensorflow/core/protobuf:error_codes.proto", "//tensorflow/core/lib/core:error_codes.proto", ] -# LINT.ThenChange(//tensorflow/core/portable_proto_config.asciipb) CORE_PROTO_SRCS = COMMON_PROTO_SRCS + EXAMPLE_PROTO_SRCS + FRAMEWORK_PROTO_SRCS + UTIL_PROTO_SRCS + PROFILER_PROTO_SRCS + ERROR_CODES_PROTO_SRCS @@ -248,6 +235,7 @@ tf_proto_library( name = "protos_all", srcs = [], cc_api_version = 2, + create_go_proto = False, make_default_target_header_only = True, protodeps = [ "//tensorflow/core/example:protos_all", @@ -286,7 +274,6 @@ cc_library( hdrs = ["//tensorflow/core/platform:base_hdrs"], copts = tf_copts(), tags = ["avoid_dep"], - visibility = [":__subpackages__"], deps = [ "//tensorflow/core/platform", "//tensorflow/core/platform:byte_order", @@ -302,17 +289,6 @@ cc_library( ], ) -alias( - name = "framework_bounds_check", - actual = "//tensorflow/core/framework:bounds_check", - visibility = ["//tensorflow/core/kernels:friends"], -) - -alias( - name = "human_readable_json", - actual = "//tensorflow/core/platform:human_readable_json", -) - # Minimal lib so that tools used for mobile compilation # don't have to depend on lib/platformlib. cc_library( @@ -374,22 +350,6 @@ cc_library( ], ) -# APIs defined in lib_experimental are for experimental usage and may be -# subject to change. Its visibility is limited to selected packages. -cc_library( - name = "lib_experimental", - hdrs = [ - "//tensorflow/core/lib/core:legacy_lib_core_threadpool_options_header", - ], - visibility = [ - ":experimental_access", - "//tensorflow/cc:__pkg__", - ], - deps = [ - ":lib", - ], -) - alias( name = "feature_util", actual = "//tensorflow/core/example:feature_util", @@ -458,7 +418,9 @@ tf_cuda_library( "//tensorflow/core/framework:control_flow.h", # TODO(josh11b): Make internal? "//tensorflow/core/framework:dataset.h", "//tensorflow/core/framework:dataset_stateful_op_allowlist.h", + "//tensorflow/core/framework:device.h", "//tensorflow/core/framework:device_base.h", + "//tensorflow/core/framework:device_factory.h", "//tensorflow/core/framework:function.h", "//tensorflow/core/framework:function_handle_cache.h", "//tensorflow/core/framework:graph_def_util.h", @@ -488,6 +450,7 @@ tf_cuda_library( "//tensorflow/core/framework:register_types_traits.h", "//tensorflow/core/framework:resource_mgr.h", "//tensorflow/core/framework:resource_op_kernel.h", + "//tensorflow/core/framework:rng_alg.h", "//tensorflow/core/framework:selective_registration.h", "//tensorflow/core/framework:session_state.h", "//tensorflow/core/framework:shape_inference.h", @@ -522,30 +485,6 @@ tf_cuda_library( ], ) -# TODO(gonnet): Remove this alias once all users have been moved to the actual target. -alias( - name = "allocator", - actual = "//tensorflow/core/framework:allocator", - visibility = ["//visibility:public"], -) - -# TODO(gonnet): Remove this alias once all users have been moved to the actual target. 
-alias( - name = "allocator_registry_impl", - actual = "//tensorflow/core/framework:allocator_registry_impl", - visibility = ["//visibility:public"], -) - -alias( - name = "overflow", - actual = "//tensorflow/core/util:overflow", -) - -alias( - name = "exec_on_stall", - actual = "//tensorflow/core/util:exec_on_stall", -) - alias( name = "ptr_util", actual = "//tensorflow/core/util:ptr_util", @@ -600,156 +539,7 @@ cc_library( ], ) -# Generates library per group of ops. -tf_gen_op_libs( - is_external = False, - op_lib_names = [ - "batch_ops", - "bitwise_ops", - "boosted_trees_ops", - "tensor_forest_ops", - "candidate_sampling_ops", - "checkpoint_ops", - "clustering_ops", - "collective_ops", - "control_flow_ops", - "count_ops", - "ctc_ops", - "data_flow_ops", - "dataset_ops", - "decode_proto_ops", - "encode_proto_ops", - "experimental_dataset_ops", - "function_ops", - "functional_ops", - "image_ops", - "io_ops", - "linalg_ops", - "list_ops", - "map_ops", - "lookup_ops", - "manip_ops", - "math_ops", - "mkl_nn_ops", - "nccl_ops", - "nn_ops", - "no_op", - "parsing_ops", - "random_grad", - "random_ops", - "special_math_ops", - "stateful_random_ops", - "remote_fused_graph_ops", - "rnn_ops", - "rpc_ops", - "scoped_allocator_ops", - "sdca_ops", - "set_ops", - "script_ops", - "sendrecv_ops", - "sparse_csr_matrix_ops", - "sparse_ops", - "spectral_ops", - "state_ops", - "stateless_random_ops", - "summary_ops", - "training_ops", - ], - deps = [ - ":lib", - ":protos_all_cc", - ], -) - -tf_gen_op_libs( - is_external = False, - op_lib_names = [ - "logging_ops", - ], - deps = [ - ":lib", - ":protos_all_cc", - # TODO(b/162630222): remove this dependency. - "//tensorflow/c/kernels:histogram_summary_op_lib", - "//tensorflow/c/kernels:merge_summary_op_lib", - "//tensorflow/c/kernels:summary_op_lib", - ], -) - -tf_gen_op_libs( - op_lib_names = [ - "string_ops", - ], - deps = [ - ":lib_internal", - ":lib_proto_parsing", - "@com_google_absl//absl/strings", - ], -) - -tf_gen_op_libs( - op_lib_names = [ - "array_ops", - ], - deps = [ - ":lib", - ":protos_all_cc", - ], -) - -tf_gen_op_libs( - op_lib_names = [ - "mkl_array_ops", - ], - deps = [":protos_all_cc"], -) - -tf_gen_op_libs( - op_lib_names = [ - "audio_ops", - ], - deps = [":lib"], -) - -tf_gen_op_libs( - op_lib_names = ["debug_ops"], - deps = [":lib"], -) - -tf_gen_op_libs( - is_external = False, - op_lib_names = [ - "resource_variable_ops", - ], - deps = [":lib"], -) - -tf_gen_op_libs( - op_lib_names = [ - "tpu_configuration_ops", - "tpu_cross_replica_ops", - "tpu_embedding_ops", - "tpu_embedding_load_retrieve_ops", - "tpu_functional_ops", - "tpu_heartbeat_ops", - "tpu_host_compute_ops", - "tpu_infeed_ops", - "tpu_outfeed_ops", - "tpu_ordinal_selector_ops", - "tpu_replication_ops", - ], - deps = [ - ":lib", - ":lib_proto_parsing", - ":protos_all_cc", - "//tensorflow/core/protobuf/tpu:optimization_parameters_proto_cc", - "//tensorflow/core/protobuf/tpu:tpu_embedding_configuration_proto_cc", - "//tensorflow/core/tpu:tpu_embedding_optimization_parameters_utils", - "//tensorflow/core/tpu:tpu_embedding_output_layout_utils", - ], -) - -# And one for all user ops +# One target for all user ops cc_library( name = "user_ops_op_lib", srcs = glob(["user_ops/**/*.cc"]), @@ -760,212 +550,29 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "word2vec_ops", - srcs = ["ops/word2vec_ops.cc"], - linkstatic = 1, - visibility = ["//tensorflow:internal"], - deps = [":framework"], - alwayslink = 1, -) - -cc_library( - name = "cudnn_rnn_ops", - srcs = [ - 
"ops/cudnn_rnn_ops.cc", - ], - linkstatic = 1, - visibility = ["//tensorflow:internal"], - deps = [ - ":framework", - ":lib", - ":lib_internal", - ":stream_executor", - "//tensorflow/core/kernels:bounds_check_lib", - ], - alwayslink = 1, -) - -tf_gen_op_libs( - op_lib_names = [ - "cudnn_rnn_ops", - ], - deps = [ - ":lib", - ], -) - -cc_library( - name = "ragged_ops", - deps = [ - ":ragged_array_ops_op_lib", - ":ragged_conversion_ops_op_lib", - ":ragged_math_ops_op_lib", - ], -) - -tf_gen_op_libs( - op_lib_names = [ - "ragged_array_ops", - "ragged_conversion_ops", - "ragged_math_ops", - ], - deps = ["//tensorflow/core/util:ragged_to_dense_util"], -) - cc_library( name = "ops", visibility = ["//visibility:public"], deps = [ - ":array_ops_op_lib", - ":audio_ops_op_lib", - ":batch_ops_op_lib", - ":bitwise_ops_op_lib", - ":boosted_trees_ops_op_lib", - ":tensor_forest_ops_op_lib", - ":candidate_sampling_ops_op_lib", - ":checkpoint_ops_op_lib", - ":clustering_ops_op_lib", - ":collective_ops_op_lib", - ":control_flow_ops_op_lib", - ":count_ops_op_lib", - ":ctc_ops_op_lib", - ":cudnn_rnn_ops_op_lib", - ":data_flow_ops_op_lib", - ":dataset_ops_op_lib", - ":debug_ops_op_lib", - ":decode_proto_ops_op_lib", - ":encode_proto_ops_op_lib", - ":experimental_dataset_ops_op_lib", - ":function_ops_op_lib", - ":functional_ops_op_lib", - ":image_ops_op_lib", - ":io_ops_op_lib", - ":linalg_ops_op_lib", - ":list_ops_op_lib", - ":map_ops_op_lib", - ":logging_ops_op_lib", - ":lookup_ops_op_lib", - ":manip_ops_op_lib", - ":math_ops_op_lib", - ":nccl_ops_op_lib", - ":nn_ops_op_lib", - ":no_op_op_lib", - ":parsing_ops_op_lib", - ":ragged_ops", - ":random_ops_op_lib", - ":rnn_ops_op_lib", - ":special_math_ops_op_lib", - ":stateful_random_ops_op_lib", - ":remote_fused_graph_ops_op_lib", - ":resource_variable_ops_op_lib", - ":rpc_ops_op_lib", - ":scoped_allocator_ops_op_lib", - ":script_ops_op_lib", - ":sdca_ops_op_lib", - ":sendrecv_ops_op_lib", - ":set_ops_op_lib", - ":sparse_csr_matrix_ops_op_lib", - ":sparse_ops_op_lib", - ":summary_ops_op_lib", - ":spectral_ops_op_lib", - ":state_ops_op_lib", - ":stateless_random_ops_op_lib", - ":string_ops_op_lib", - ":training_ops_op_lib", ":user_ops_op_lib", - ":word2vec_ops", "//tensorflow/c/kernels:bitcast_op_lib", "//tensorflow/c/kernels:histogram_summary_op_lib", "//tensorflow/c/kernels:merge_summary_op_lib", "//tensorflow/c/kernels:summary_op_lib", - "//tensorflow/compiler/mlir/tensorflow:mlir_passthrough_op", + "//tensorflow/core/ops:ops", ] + if_chromiumos( [], - # Non-tpu platforms don't need tpu dependency. It would be best to guard - # them by if_tpu. But there is no such flag yet. + # Non-tpu platforms don't need tpu dependency. 
[ - ":tpu_configuration_ops_op_lib", - ":tpu_cross_replica_ops_op_lib", - ":tpu_embedding_ops_op_lib", - ":tpu_embedding_load_retrieve_ops_op_lib", - ":tpu_functional_ops_op_lib", - ":tpu_heartbeat_ops_op_lib", - ":tpu_host_compute_ops_op_lib", - ":tpu_infeed_ops_op_lib", - ":tpu_outfeed_ops_op_lib", - ":tpu_ordinal_selector_ops_op_lib", - ":tpu_replication_ops_op_lib", "//tensorflow/core/tpu/ops", ], - ) + if_mkl([ - ":mkl_array_ops_op_lib", - ":mkl_nn_ops_op_lib", - ]) + if_tensorrt([ + ) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_ops_op_lib", "//tensorflow/compiler/tf2tensorrt:trt_op_libs", - ]), - alwayslink = 1, -) - -cc_library( - name = "array_grad", - srcs = ["ops/array_grad.cc"], - linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 - visibility = ["//visibility:public"], - deps = [ - ":array_ops_op_lib", - ":framework", - ":lib", - "//tensorflow/c/kernels:bitcast_op_lib", - ], - alwayslink = 1, -) - -cc_library( - name = "functional_grad", - srcs = ["ops/functional_grad.cc"], - linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 - visibility = ["//visibility:public"], - deps = [ - ":framework", - ":functional_ops_op_lib", - ":lib", - ], - alwayslink = 1, -) - -cc_library( - name = "math_grad", - srcs = [ - "ops/math_grad.cc", - "ops/random_grad.cc", - "ops/stateless_random_grad.cc", - ], - linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 - visibility = ["//visibility:public"], - deps = [ - ":framework", - ":lib", - ":math_ops_op_lib", - ":protos_all_cc", - ], - alwayslink = 1, -) - -cc_library( - name = "nn_grad", - srcs = ["ops/nn_grad.cc"], - linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 - visibility = ["//visibility:public"], - deps = [ - ":framework", - ":lib", - ":nn_ops_op_lib", - ] + if_mkl([ - ":mkl_nn_ops_op_lib", - ]), - alwayslink = 1, + ]) + if_libtpu( + if_false = ["//tensorflow/compiler/mlir/tensorflow:mlir_passthrough_op"], + if_true = [], + ), ) alias( @@ -1086,9 +693,7 @@ cc_library( ]) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_op_kernels", "//tensorflow/compiler/tf2tensorrt:trt_op_kernels", - ]) + if_tpu([ - "//tensorflow/core/tpu/kernels", - ]), + ]) + tf_tpu_dependencies(), ) cc_library( @@ -1166,7 +771,7 @@ cc_library( ) # Test support library needed for higher-level (TensorFlow-specific) tests -cc_library( +tf_cuda_library( name = "testlib", testonly = 1, srcs = [ @@ -1199,10 +804,10 @@ cc_library( ":ops", ":protos_all_cc", ":test", - ":testlib_ops", # TODO(gunan): resolve dependency issues and load these kernels dynamically. ":testlib_kernels_impl", "//tensorflow/cc:scope", + "//tensorflow/core/common_runtime:testlib_ops", "//tensorflow/core/framework:fake_input", "//tensorflow/core/framework:function_testlib", "//tensorflow/core/framework:shape_inference_testutil", @@ -1212,13 +817,6 @@ cc_library( ], ) -alias( - name = "testlib_ops", - testonly = 1, - actual = - "//tensorflow/core/common_runtime:testlib_ops", -) - # This is a link-only library to provide a DirectSession # implementation of the Session interface. 
tf_cuda_library( @@ -1232,23 +830,9 @@ tf_cuda_library( alwayslink = 1, ) -# ----------------------------------------------------------------------------- -# MKL targets -alias( - name = "mkl_graph_util", - actual = "//tensorflow/core/graph:mkl_graph_util", -) - # ----------------------------------------------------------------------------- # Public Android targets -# List of protos we want on android -filegroup( - name = "android_proto_srcs", - srcs = CORE_PROTO_SRCS, - visibility = ["//visibility:public"], -) - # Sources required to build the TensorFlow framework without the runtime on # mobile platforms. This is essentially the sources required to build # tensorflow/core/framework:tensor without using granular targets. @@ -1278,7 +862,7 @@ filegroup( "**/*main.cc", ], ), - visibility = ["//visibility:private"], + visibility = ["//visibility:public"], ) # Sources required to build the TensorFlow framework with runtime on @@ -1297,6 +881,7 @@ filegroup( "//tensorflow/core/graph:mobile_srcs_only_runtime", "//tensorflow/core/kernels:mobile_srcs", "//tensorflow/core/lib/io:mobile_srcs_only_runtime", + "//tensorflow/core/nccl:mobile_srcs", "//tensorflow/core/profiler:mobile_srcs", "//tensorflow/core/public:mobile_srcs_only_runtime", "//tensorflow/core/util/sparse:mobile_srcs_only_runtime", @@ -1337,11 +922,98 @@ filegroup( visibility = ["//visibility:public"], ) -alias( - name = "android_srcs", - actual = ":mobile_srcs", - visibility = ["//visibility:public"], -) +# All the aliases for stuff under ops/ +# Once the dependencies move to the real targets, remove the aliases here! + +[ + alias( + name = "%s" % (name,), + actual = "//tensorflow/core/ops:%s" % (name,), + visibility = ["//visibility:public"], + ) + for name in [ + "array_grad", + "array_ops_op_lib", + "audio_ops_op_lib", + "batch_ops_op_lib", + "bitwise_ops_op_lib", + "boosted_trees_ops_op_lib", + "candidate_sampling_ops_op_lib", + "checkpoint_ops_op_lib", + "clustering_ops_op_lib", + "collective_ops_op_lib", + "control_flow_ops_op_lib", + "count_ops_op_lib", + "ctc_ops_op_lib", + "cudnn_rnn_ops_op_lib", + "data_flow_ops_op_lib", + "dataset_ops_op_lib", + "debug_ops_op_lib", + "decode_proto_ops_op_lib", + "encode_proto_ops_op_lib", + "experimental_dataset_ops_op_lib", + "function_ops_op_lib", + "functional_grad", + "functional_ops_op_lib", + "image_ops_op_lib", + "io_ops_op_lib", + "linalg_ops_op_lib", + "list_ops_op_lib", + "logging_ops_op_lib", + "lookup_ops_op_lib", + "manip_ops_op_lib", + "map_ops_op_lib", + "math_grad", + "math_ops_op_lib", + "mkl_array_ops_op_lib", + "mkl_nn_ops_op_lib", + "nccl_ops_op_lib", + "nn_grad", + "nn_ops_op_lib", + "no_op_op_lib", + "parsing_ops_op_lib", + "portable_op_registrations_and_gradients", + "ragged_array_ops_op_lib", + "ragged_conversion_ops_op_lib", + "ragged_math_ops_op_lib", + "ragged_ops", + "random_grad_op_lib", + "random_ops_op_lib", + "remote_fused_graph_ops_op_lib", + "resource_variable_ops_op_lib", + "rnn_ops_op_lib", + "rpc_ops_op_lib", + "scoped_allocator_ops_op_lib", + "script_ops_op_lib", + "sdca_ops_op_lib", + "sendrecv_ops_op_lib", + "set_ops_op_lib", + "sparse_csr_matrix_ops_op_lib", + "sparse_ops_op_lib", + "special_math_ops_op_lib", + "spectral_ops_op_lib", + "state_ops_op_lib", + "stateful_random_ops_op_lib", + "stateless_random_ops_op_lib", + "stateless_random_ops_v2_op_lib", + "string_ops_op_lib", + "summary_ops_op_lib", + "tensor_forest_ops_op_lib", + "tpu_configuration_ops_op_lib", + "tpu_cross_replica_ops_op_lib", + "tpu_embedding_ops_op_lib", + 
"tpu_embedding_load_retrieve_ops_op_lib", + "tpu_functional_ops_op_lib", + "tpu_heartbeat_ops_op_lib", + "tpu_host_compute_ops_op_lib", + "tpu_infeed_ops_op_lib", + "tpu_outfeed_ops_op_lib", + "tpu_ordinal_selector_ops_op_lib", + "tpu_replication_ops_op_lib", + "training_ops_op_lib", + "word2vec_ops", + ] +] # Native library support for mobile applications. Does not contain # operators, use :portable_tensorflow_lib if you want full operator @@ -1356,7 +1028,7 @@ alias( # Compiles to a trivial library on non-mobile to prevent irrelevant # build errors. If not building this e.g. as part of an android_binary, # a command such as the following must be used: -# bazel build -c opt tensorflow/core:android_tensorflow_lib \ +# bazel build -c opt tensorflow/core:portable_tensorflow_lib \ # --define=TENSORFLOW_PROTOS=lite \ # --crosstool_top=//external:android/crosstool \ # --cpu=armeabi-v7a \ @@ -1379,24 +1051,6 @@ cc_library( alwayslink = 1, ) -alias( - name = "android_tensorflow_lib_lite", - actual = ":portable_tensorflow_lib_lite", - visibility = ["//visibility:public"], -) - -alias( - name = "android_tensorflow_lib_lite_nortti", - actual = ":portable_tensorflow_lib_lite", - visibility = ["//visibility:public"], -) - -alias( - name = "android_tensorflow_lib_lite_nortti_lite_protos", - actual = ":portable_tensorflow_lib_lite", - visibility = ["//visibility:public"], -) - cc_library( name = "mobile_additional_lib_deps", deps = tf_additional_lib_deps() + [ @@ -1407,26 +1061,6 @@ cc_library( ], ) -alias( - name = "ios_tensorflow_lib_lite", - actual = ":portable_tensorflow_lib_lite", - visibility = ["//visibility:public"], -) - -# Full TensorFlow library with operator support. Use this unless reducing -# binary size (by packaging a reduced operator set) is a concern. -alias( - name = "android_tensorflow_lib", - actual = ":portable_tensorflow_lib", - visibility = ["//visibility:public"], -) - -alias( - name = "ios_tensorflow_lib", - actual = ":portable_tensorflow_lib", - visibility = ["//visibility:public"], -) - cc_library( name = "portable_tensorflow_lib", srcs = if_mobile([":portable_op_registrations_and_gradients"]), @@ -1447,52 +1081,6 @@ cc_library( alwayslink = 1, ) -alias( - name = "android_op_registrations_and_gradients", - actual = ":portable_op_registrations_and_gradients", - visibility = ["//visibility:public"], -) - -filegroup( - name = "portable_op_registrations_and_gradients", - srcs = ["//tensorflow/c/kernels:android_all_ops"] + glob( - [ - "ops/**/*.cc", - "ops/**/*.h", - ], - exclude = [ - "**/*test.cc", - "**/*testutil*", - "**/*testlib*", - "**/*main.cc", - "**/tpu_*", - ], - ), - visibility = ["//visibility:public"], -) - -filegroup( - name = "android_test_srcs", - testonly = 1, - # TODO(andrewharp/nhua): - # make more test-related sources portable e.g. "//tensorflow/core/platform:test.cc", - srcs = tf_portable_full_lite_protos( - full = [ - "//tensorflow/core/framework:android_test_hdrs", - "//tensorflow/core/framework:android_test_srcs", - "//tensorflow/core/platform:android_test_srcs", - "//tensorflow/core/util:android_test_srcs", - ], - lite = [ - "//tensorflow/core/framework:android_test_hdrs", - "//tensorflow/core/framework:android_test_srcs_no_core", - "//tensorflow/core/platform:android_test_srcs", - "//tensorflow/core/util:android_test_srcs", - ], - ), - visibility = ["//visibility:public"], -) - # This is like android_test_srcs, minus the things that are already in mobile_srcs. 
filegroup( name = "android_test_srcs_no_core", @@ -1507,18 +1095,6 @@ filegroup( ) # Portable library providing testing functionality for TensorFlow. -alias( - name = "android_tensorflow_test_lib", - actual = ":portable_tensorflow_test_lib", - visibility = ["//visibility:public"], -) - -alias( - name = "ios_tensorflow_test_lib", - actual = ":portable_tensorflow_test_lib", - visibility = ["//visibility:public"], -) - cc_library( name = "portable_tensorflow_test_lib", testonly = 1, @@ -1538,7 +1114,6 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":portable_tensorflow_lib", - ":protos_all_cc", "//tensorflow/core/kernels:portable_tensorflow_kernels", "//tensorflow/core/platform/default/build_config:gtest", "//third_party/eigen3", @@ -1591,105 +1166,8 @@ alias( ] ] -# The following targets will be moved to core/protobuf. The aliases are only temporary -# since moving existing users will require several CLs over several projects. -[ - [ - alias( - name = "protobuf_%s_pyclif%s" % (proto_name, target_suffix), - actual = "//tensorflow/core/protobuf:%s_pyclif%s" % (proto_name, target_suffix), - visibility = ["//visibility:public"], - ) - for target_suffix in [ - "", - "_pb2", - ] - ] - for proto_name in [ - "config", - "device_properties", - "graph_debug_info", - "meta_graph", - "saved_model", - ] -] - # ----------------------------------------------------------------------------- # Internal targets - -alias( - name = "autotuning_proto", - actual = "//tensorflow/core/protobuf:autotuning_proto", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "autotuning_proto_cc", - actual = "//tensorflow/core/protobuf:autotuning_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "conv_autotuning_proto", - actual = "//tensorflow/core/protobuf:conv_autotuning_proto", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "conv_autotuning_proto_cc", - actual = "//tensorflow/core/protobuf:conv_autotuning_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "worker_proto_cc", - actual = "//tensorflow/core/protobuf:worker_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "worker_service_proto_cc", - actual = "//tensorflow/core/protobuf:worker_service_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "master_proto_cc", - actual = "//tensorflow/core/protobuf:master_proto_cc", - visibility = [ - "//learning/brain/frameworks/uptc:__subpackages__", - "//tensorflow:internal", - ], -) - -alias( - name = "master_service_proto_cc", - actual = "//tensorflow/core/protobuf:master_service_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - -alias( - name = "eager_service_proto_cc", - actual = "//tensorflow/core/protobuf:eager_service_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - filegroup( name = "lib_internal_private_headers", srcs = [ @@ -1903,6 +1381,7 @@ cc_library( "//tensorflow/core/platform:denormal", "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:env", + "//tensorflow/core/platform:env_impl", "//tensorflow/core/platform:error", "//tensorflow/core/platform:errors", "//tensorflow/core/platform:file_statistics", @@ -2007,16 +1486,6 @@ cc_library( ], ) -alias( - name = "png_internal", - actual = "//tensorflow/core/lib/png:png_io", -) - -alias( - name = "android_png_internal", - actual = "//tensorflow/core/lib/png:png_io", -) - cc_library( name = "tflite_portable_logging", hdrs = [ @@ -2040,8 
+1509,8 @@ cc_library( ) cc_library( - name = "android_jpeg_internal", - srcs = if_android([ + name = "portable_jpeg_internal", + srcs = if_mobile([ "lib/jpeg/jpeg_handle.cc", "lib/jpeg/jpeg_mem.cc", "//tensorflow/core/platform:jpeg_hdrs", @@ -2055,7 +1524,7 @@ cc_library( "//tensorflow/core/platform/default:logging.h", ], copts = tf_copts(), - linkopts = ["-ldl"], + linkopts = if_android(["-ldl"]), deps = [ ":core_stringpiece", "//tensorflow/core/platform:dynamic_annotations", @@ -2068,8 +1537,8 @@ cc_library( ) cc_library( - name = "android_gif_internal", - srcs = if_android([ + name = "portable_gif_internal", + srcs = if_mobile([ "lib/gif/gif_io.cc", "//tensorflow/core/platform:gif_hdrs", ]), @@ -2082,7 +1551,7 @@ cc_library( "//tensorflow/core/platform/default:logging.h", ], copts = tf_copts(), - linkopts = ["-ldl"], + linkopts = if_android(["-ldl"]), deps = [ "//tensorflow/core/platform:dynamic_annotations", "//tensorflow/core/platform:gif", @@ -2103,11 +1572,6 @@ alias( actual = "//tensorflow/core/protobuf:error_codes_proto_impl_cc", ) -alias( - name = "error_codes_proto_cc", - actual = "//tensorflow/core/lib/core:error_codes_proto_cc", -) - alias( name = "version_lib", actual = "//tensorflow/core/util:version_info", @@ -2119,6 +1583,7 @@ filegroup( "//tensorflow/core/example:feature_util.h", "//tensorflow/core/framework:framework_internal_private_hdrs", "//tensorflow/core/graph:framework_internal_private_headers", + "//tensorflow/core/public:session_options.h", "//tensorflow/core/util:framework_internal_private_hdrs", "//tensorflow/core/util:memmapped_file_system_hdrs", "//tensorflow/core/util/sparse:framework_internal_private_headers_group", @@ -2177,7 +1642,7 @@ cc_header_only_library( ":lib", ":lib_internal", ":version_lib", - "//tensorflow/core/kernels:bounds_check", + "//tensorflow/core/framework:bounds_check", "//tensorflow/core/platform/default/build_config:platformlib", ], ) @@ -2229,6 +1694,7 @@ tf_cuda_library( "//tensorflow/core/framework:attr_value_proto_text", "//tensorflow/core/framework:attr_value_util", "//tensorflow/core/framework:bfloat16", + "//tensorflow/core/framework:bounds_check", "//tensorflow/core/framework:common_shape_fns", "//tensorflow/core/framework:kernel_shape_util", "//tensorflow/core/framework:node_def_util", @@ -2242,7 +1708,7 @@ tf_cuda_library( "//tensorflow/core/framework:shape_inference", "//tensorflow/core/framework:tensor", "//tensorflow/core/framework:tensor_shape", - "//tensorflow/core/kernels:bounds_check", + "//tensorflow/core/platform:env_impl", "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:traceme", @@ -2284,15 +1750,10 @@ cc_header_only_library( ], visibility = ["//visibility:public"], deps = [ - ":stream_executor", + "//tensorflow/core/platform:stream_executor", ], ) -alias( - name = "stream_executor", - actual = "//tensorflow/core/platform:stream_executor", -) - # Like stream_executor library, but compiles without --config=cuda # and does not include any cuda dependencies. 
alias( @@ -2344,18 +1805,12 @@ tf_cuda_library( ":function_ops_op_lib", ":functional_grad", ":functional_ops_op_lib", - "//tensorflow/core/kernels:bounds_check", + "//tensorflow/core/framework:bounds_check", "//tensorflow/core/kernels:required", ]), alwayslink = 1, ) -alias( - name = "core_cpu_impl", - actual = - "//tensorflow/core/common_runtime:core_cpu_impl", -) - alias( name = "core_cpu_lib", actual = @@ -2368,18 +1823,6 @@ alias( "//tensorflow/core/common_runtime:core_cpu_internal", ) -alias( - name = "regexp_internal", - actual = - "//tensorflow/core/platform:regexp", - visibility = [ - "//tensorflow/compiler:__subpackages__", - "//tensorflow/core/kernels:__subpackages__", - "//tensorflow/core/profiler:__subpackages__", - "//tensorflow/stream_executor:__subpackages__", - ], -) - alias( name = "direct_session_internal", actual = @@ -2392,14 +1835,6 @@ alias( visibility = ["//visibility:public"], ) -alias( - name = "replay_log_proto_cc", - actual = "//tensorflow/core/protobuf:replay_log_proto_cc", - visibility = [ - "//tensorflow:internal", - ], -) - alias( name = "gpu_runtime", actual = @@ -2423,18 +1858,6 @@ cc_library( ], ) -# TODO(gonnet): Remove this alias once all users have been moved to the actual target. -alias( - name = "tensor_testutil", - actual = "//tensorflow/core/framework:tensor_testutil", -) - -# TODO(gonnet): Remove this alias once all users have been moved to the actual target. -alias( - name = "shape_inference_testutil", - actual = "//tensorflow/core/framework:shape_inference_testutil", -) - # Main program for tests alias( name = "test_main", @@ -2442,14 +1865,6 @@ alias( visibility = ["//tensorflow:internal"], ) -test_suite( - name = "low_level_tests", - tests = [ - ":low_level_library_tests", - "//tensorflow/core/platform:low_level_library_tests", - ], -) - tf_cc_tests( name = "low_level_library_tests", size = "small", @@ -2470,7 +1885,6 @@ tf_cc_tests( "//tensorflow/core/lib/random:legacy_lib_random_tests", "//tensorflow/core/lib/strings:legacy_low_level_library_tests", ], - create_named_test_suite = True, deps = [ ":lib", ":lib_internal", @@ -2506,22 +1920,6 @@ tf_cc_test( ], ) -test_suite( - name = "platform_tests", - tests = [ - "//tensorflow/core/platform:abi_test", - "//tensorflow/core/platform:env_test", - "//tensorflow/core/platform:fake_python_env_test", - "//tensorflow/core/platform:file_system_test", - "//tensorflow/core/platform:numa_test", - "//tensorflow/core/platform:platform_strings_test", - "//tensorflow/core/platform:rocm_rocdl_path_test", - "//tensorflow/core/platform:setround_test", - "//tensorflow/core/platform:unbounded_work_queue_test", - "//tensorflow/core/platform:vmodule_test", - ], -) - tf_cc_test( name = "lib_jpeg_jpeg_mem_unittest", srcs = ["lib/jpeg/jpeg_mem_unittest.cc"], @@ -2574,27 +1972,6 @@ tf_cc_test( ], ) -tf_cc_test( - name = "framework_op_gen_lib_test", - size = "small", - srcs = ["//tensorflow/core/framework:op_gen_lib_test.cc"], - deps = [ - ":protos_all_cc", - ":test", - ":test_main", - "//tensorflow/core/framework:op_gen_lib", - ], -) - -test_suite( - name = "higher_level_tests", - tests = [ - ":core_higher_level_tests", - "//tensorflow/core/framework:higher_level_tests", - "//tensorflow/core/util:higher_level_tests", - ], -) - tf_cc_tests( name = "core_higher_level_tests", size = "small", @@ -2613,7 +1990,6 @@ tf_cc_tests( "//tensorflow/core/graph:validate_test.cc", "//tensorflow/core/util/sparse:higher_level_tests_group", ], - create_named_test_suite = True, linkopts = select({ "//tensorflow:macos": 
["-headerpad_max_install_names"], "//conditions:default": [], @@ -2650,22 +2026,6 @@ tf_cc_tests( ], ) -tf_cc_test( - name = "cudnn_rnn_ops_test_cc", - size = "small", - srcs = [ - "ops/cudnn_rnn_ops_test.cc", - ], - deps = [ - ":core", - ":framework", - ":lib", - ":test", - ":test_main", - ":testlib", - ], -) - tf_cc_test_mkl( name = "mkl_related_tests", size = "small", @@ -2742,223 +2102,6 @@ tf_cc_tests_gpu( ], ) -tf_cc_test_gpu( - name = "variant_op_copy_test", - size = "small", - srcs = ["//tensorflow/core/framework:variant_op_copy_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - tags = tf_cuda_tests_tags(), - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session", - ":framework", - ":framework_internal", - ":gpu_runtime", - ":lib", - ":lib_internal", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:client_session", - "//tensorflow/cc:ops", - "//tensorflow/cc:scope", - "//tensorflow/core/kernels:array", - "//third_party/eigen3", - ], -) - -tf_cc_test( - name = "framework_run_handler_util_test", - size = "small", - srcs = ["//tensorflow/core/framework:run_handler_util_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":framework_internal", - ":lib", - ":test", - ":test_main", - ], -) - -tf_cc_test( - name = "framework_run_handler_test", - size = "small", - srcs = ["//tensorflow/core/framework:run_handler_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":core_cpu", - ":direct_session_internal", - ":framework_internal", - ":lib", - ":lib_internal", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/core/framework:tensor_testutil", - "//tensorflow/core/kernels:cwise_op", - "//tensorflow/core/kernels:matmul_op", - "//third_party/eigen3", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/synchronization", - ], -) - -tf_cc_test( - name = "framework_op_segment_test", - size = "small", - srcs = ["//tensorflow/core/framework:op_segment_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/core/kernels:cwise_op", - "//tensorflow/core/kernels:ops_util", - "//third_party/eigen3", - ], -) - -tf_cc_test( - name = "ops_array_grad_test", - size = "small", - srcs = ["ops/array_grad_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/core/kernels:array", - "//tensorflow/core/kernels:cwise_op", - "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:math", - "//third_party/eigen3", - ], -) - -tf_cc_test( - name = "ops_math_grad_test", - size = "small", - srcs = ["ops/math_grad_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - tags = ["no_gpu"], - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/core/kernels:array", - 
"//tensorflow/core/kernels:data_flow", - "//tensorflow/core/kernels:function_ops", - "//tensorflow/core/kernels:math", - "//third_party/eigen3", - ], -) - -tf_cc_test( - name = "ops_remote_fused_graph_ops_test", - size = "small", - srcs = ["ops/remote_fused_graph_ops_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/core/kernels:remote_fused_graph_ops", - ], -) - -tf_cc_test( - name = "ops_tests", - size = "small", - srcs = [ - "ops/array_ops_test.cc", - "ops/candidate_sampling_ops_test.cc", - "ops/control_flow_ops_test.cc", - "ops/ctc_ops_test.cc", - "ops/data_flow_ops_test.cc", - "ops/functional_ops_test.cc", - "ops/image_ops_test.cc", - "ops/io_ops_test.cc", - "ops/linalg_ops_test.cc", - "ops/math_ops_test.cc", - "ops/nn_ops_test.cc", - "ops/parsing_ops_test.cc", - "ops/random_ops_test.cc", - "ops/rnn_ops_test.cc", - "ops/set_ops_test.cc", - "ops/shape_function_test.cc", - "ops/sparse_csr_matrix_ops_test.cc", - "ops/sparse_ops_test.cc", - "ops/spectral_ops_test.cc", - "ops/state_ops_test.cc", - "ops/string_ops_test.cc", - "ops/training_ops_test.cc", - ], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//third_party/eigen3", - ], -) - # Test data filegroup( name = "image_testdata", @@ -2988,50 +2131,11 @@ filegroup( # GIF data with optimization "lib/gif/testdata/optimized.gif", # BMP data - "lib/bmp/testdata/lena.bmp", - "lib/bmp/testdata/rgb_small.bmp", - "lib/bmp/testdata/rgb_small_255.bmp", - "lib/bmp/testdata/rgba_small.bmp", - "lib/bmp/testdata/rgba_small_255.bmp", - "lib/bmp/testdata/grayscale_small.bmp", - "lib/bmp/testdata/grayscale_small_3channels.bmp", - "lib/bmp/testdata/grayscale_small_4channels.bmp", + "//tensorflow/core/lib/bmp:bmp_testdata", ], visibility = ["//visibility:public"], ) -filegroup( - name = "lmdb_testdata", - testonly = 1, - srcs = [ - # A simple key-value store: - # 0 : 'b' - # 1 : 'b' - # ... - # 9 : 'b' - # Which is then overwritten with: - # 0 : 'a' - # 1 : 'b' - # ... - # 9 : 'j' - "lib/lmdb/testdata/data.mdb", - # LMDB, being a memory-mapped database, uses a different file format on - # big-endian systems. - "lib/lmdb/testdata/data_bigendian.mdb", - ], - visibility = ["//visibility:public"], -) - -alias( - name = "cuda_libdevice_path", - actual = "//tensorflow/core/platform:cuda_libdevice_path", -) - -# Normalize CORE_PROTO_SRCS to generate valid output file names. 
-PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ - "//google/protobuf/any.proto.h", -] - transitive_hdrs( name = "headers", visibility = ["//tensorflow:__subpackages__"], @@ -3040,7 +2144,7 @@ transitive_hdrs( ":framework", ":lib", ":protos_all_cc", - ":stream_executor", "//tensorflow/core/platform:platform_strings", + "//tensorflow/core/platform:stream_executor", ], ) diff --git a/tensorflow/core/api_def/BUILD b/tensorflow/core/api_def/BUILD index dfa0b78cb17..f9e2adaec6b 100644 --- a/tensorflow/core/api_def/BUILD +++ b/tensorflow/core/api_def/BUILD @@ -6,6 +6,8 @@ # :python_api_def # :java_api_def +load("//tensorflow:tensorflow.bzl", "filegroup") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "tf_cc_binary", @@ -37,9 +39,9 @@ filegroup( visibility = ["//tensorflow:internal"], ) -filegroup( +alias( name = "java_api_def", - srcs = glob(["java_api/*"]), + actual = "//tensorflow/core/api_def/java_api:java_api_def", visibility = ["//tensorflow:internal"], ) diff --git a/tensorflow/core/api_def/README.md b/tensorflow/core/api_def/README.md new file mode 100644 index 00000000000..76232442e8d --- /dev/null +++ b/tensorflow/core/api_def/README.md @@ -0,0 +1,4 @@ +This folder contains the ApiDef proto definitions of TensorFlow operations. + +The canonical source of documentation for these operations can be found in +the base_api/ directory. diff --git a/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt b/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt index 2184b644b23..dc018aec4aa 100644 --- a/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_Acos.pbtxt @@ -1,4 +1,11 @@ op { graph_op_name: "Acos" summary: "Computes acos of x element-wise." + description: <

See also: config.proto + * * @param value a serialized config proto - * @see */ public Options config(byte[] value) { config = value; diff --git a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java index 3c9a678cf56..97de99cb75e 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java +++ b/tensorflow/java/src/main/java/org/tensorflow/NativeLibrary.java @@ -51,7 +51,7 @@ final class NativeLibrary { // (1) The native library has already been statically loaded, OR // (2) The required native code has been statically linked (through a custom launcher), OR // (3) The native code is part of another library (such as an application-level library) - // that has already been loaded. For example, tensorflow/examples/android and + // that has already been loaded. For example, tensorflow/tools/android/test and // tensorflow/tools/android/inference_interface include the required native code in // differently named libraries. // diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index e80e32fe6cf..597f81194cd 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -1,6 +1,7 @@ load("//tensorflow:tensorflow.bzl", "if_not_windows", "tf_cc_test") -load("//tensorflow/lite:build_def.bzl", "if_tflite_experimental_runtime", "tflite_cc_shared_object", "tflite_copts", "tflite_experimental_runtime_linkopts") +load("//tensorflow/lite:build_def.bzl", "tflite_cc_shared_object", "tflite_copts") load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite") +load("//tensorflow:tensorflow.bzl", "get_compatible_with_portable") package( default_visibility = ["//visibility:public"], @@ -15,13 +16,6 @@ exports_files(glob([ "models/testdata/*", ])) -config_setting( - name = "enable_default_profiler", - values = { - "copt": "-DTFLITE_ENABLE_DEFAULT_PROFILER", - }, -) - config_setting( name = "gemmlowp_profiling", values = { @@ -43,18 +37,6 @@ config_setting( }, ) -config_setting( - name = "tflite_experimental_runtime_eager", - values = {"define": "tflite_experimental_runtime=eager"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "tflite_experimental_runtime_non_eager", - values = {"define": "tflite_experimental_runtime=non-eager"}, - visibility = ["//visibility:public"], -) - config_setting( name = "tf_lite_static_memory", values = { @@ -90,6 +72,7 @@ FRAMEWORK_LIB_HDRS = [ cc_library( name = "version", hdrs = ["version.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, # Note that we only use the header defines from :version_lib. 
deps = ["//tensorflow/core:version_lib"], @@ -107,6 +90,7 @@ cc_library( name = "arena_planner", srcs = ["arena_planner.cc"], hdrs = ["arena_planner.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = [ ":graph_info", @@ -137,6 +121,7 @@ cc_test( cc_library( name = "context", hdrs = ["context.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = ["//tensorflow/lite/c:common"], ) @@ -145,6 +130,7 @@ cc_library( name = "external_cpu_backend_context", srcs = ["external_cpu_backend_context.cc"], hdrs = ["external_cpu_backend_context.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = [ "//tensorflow/lite/c:common", @@ -154,6 +140,7 @@ cc_library( cc_library( name = "graph_info", hdrs = ["graph_info.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = ["//tensorflow/lite/c:common"], ) @@ -161,6 +148,7 @@ cc_library( cc_library( name = "memory_planner", hdrs = ["memory_planner.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = ["//tensorflow/lite/c:common"], ) @@ -169,6 +157,7 @@ cc_library( name = "simple_memory_arena", srcs = ["simple_memory_arena.cc"], hdrs = ["simple_memory_arena.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = ["//tensorflow/lite/c:common"], ) @@ -188,9 +177,16 @@ cc_library( "builtin_ops.h", "context_util.h", ], + compatible_with = get_compatible_with_portable(), deps = ["//tensorflow/lite/c:common"], ) +cc_library( + name = "builtin_ops", + hdrs = ["builtin_ops.h"], + compatible_with = get_compatible_with_portable(), +) + exports_files(["builtin_ops.h"]) cc_library( @@ -198,6 +194,7 @@ cc_library( hdrs = [ "string_type.h", ], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, ) @@ -219,6 +216,7 @@ cc_library( hdrs = [ "allocation.h", ], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = [ ":string", @@ -235,11 +233,10 @@ cc_library( "interpreter.cc", "interpreter_builder.cc", "model_builder.cc", - "mutable_op_resolver.cc", "optional_debug_tools.cc", - "stderr_reporter.cc", ], hdrs = FRAMEWORK_LIB_HDRS, + compatible_with = get_compatible_with_portable(), copts = tflite_copts() + TFLITE_DEFAULT_COPTS, visibility = [ "//tensorflow/lite:__subpackages__", @@ -249,28 +246,29 @@ cc_library( ":arena_planner", ":external_cpu_backend_context", ":graph_info", + ":kernel_api", ":memory_planner", ":minimal_logging", + ":mutable_op_resolver", ":shared_library", ":simple_memory_arena", + ":stderr_reporter", ":string", ":type_to_tflitetype", ":util", ":version", "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", + "//tensorflow/lite/core/api:verifier", "//tensorflow/lite/delegates:status", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/experimental/resource", "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/nnapi:nnapi_implementation", + "//tensorflow/lite/profiling:platform_profiler", "//tensorflow/lite/schema:schema_fbs", - ] + select({ - ":enable_default_profiler": [ - "//tensorflow/lite/profiling:platform_profiler", - ], - "//conditions:default": [], - }), + "//tensorflow/lite/schema:schema_utils", + ], alwayslink = 1, ) @@ -280,17 +278,13 @@ cc_library( srcs = [ ], hdrs = FRAMEWORK_LIB_HDRS, + compatible_with = get_compatible_with_portable(), copts = tflite_copts() + TFLITE_DEFAULT_COPTS, - defines = 
if_tflite_experimental_runtime( - if_eager = ["TFLITE_EXPERIMENTAL_RUNTIME_EAGER"], - if_non_eager = ["TFLITE_EXPERIMENTAL_RUNTIME_NON_EAGER"], - if_none = [], - ), deps = [ - ":framework_lib", ":allocation", ":arena_planner", ":external_cpu_backend_context", + ":framework_lib", ":graph_info", ":memory_planner", ":minimal_logging", @@ -301,17 +295,79 @@ cc_library( ":version", "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", + "//tensorflow/lite/core/api:verifier", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/experimental/resource", "//tensorflow/lite/nnapi:nnapi_implementation", "//tensorflow/lite/schema:schema_fbs", - ] + tflite_experimental_runtime_linkopts(), + ], +) + +cc_library( + name = "error_reporter", + hdrs = ["error_reporter.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + TFLITE_DEFAULT_COPTS, + visibility = [ + "//visibility:public", + ], + deps = [ + "//tensorflow/lite:stderr_reporter", + "//tensorflow/lite/core/api:error_reporter", + ], +) + +cc_library( + name = "stderr_reporter", + srcs = ["stderr_reporter.cc"], + hdrs = ["stderr_reporter.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + TFLITE_DEFAULT_COPTS, + visibility = [ + "//visibility:public", + ], + deps = [ + ":minimal_logging", + "//tensorflow/lite/c:common", + "//tensorflow/lite/core/api:error_reporter", + ], +) + +cc_library( + name = "op_resolver", + hdrs = ["op_resolver.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + TFLITE_DEFAULT_COPTS, + visibility = [ + "//visibility:public", + ], + deps = [ + "//tensorflow/lite:mutable_op_resolver", + "//tensorflow/lite/core/api:op_resolver", + ], +) + +cc_library( + name = "mutable_op_resolver", + srcs = ["mutable_op_resolver.cc"], + hdrs = ["mutable_op_resolver.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + TFLITE_DEFAULT_COPTS, + visibility = [ + "//visibility:public", + ], + deps = [ + ":util", + "//tensorflow/lite/core/api:op_resolver", + "//tensorflow/lite/schema:schema_fbs", + ], ) cc_library( name = "string_util", srcs = ["string_util.cc"], hdrs = ["string_util.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS, deps = [ ":string", @@ -356,6 +412,7 @@ cc_library( cc_library( name = "tflite_with_xnnpack_default", + compatible_with = get_compatible_with_portable(), visibility = ["//visibility:private"], # TODO(b/151246885): put ":tflite_with_xnnpack_enabled" to macos/windows # once we have a good testing coverage on these two platforms. 
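The hunks above split error_reporter, stderr_reporter, op_resolver, and mutable_op_resolver out of the monolithic framework library into publicly visible targets, so clients can depend on only the pieces they need. Below is a minimal, hedged C++ sketch of such a client; the "add.bin" model path and the choice of ADD as the only registered builtin are illustrative assumptions, not part of this change.

// Hedged sketch: a client that uses only the fine-grained targets added
// above (mutable_op_resolver, stderr_reporter) plus the model/interpreter
// libraries. "add.bin" and the single ADD registration are assumptions.
#include <memory>

#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/builtin_op_kernels.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/mutable_op_resolver.h"
#include "tensorflow/lite/stderr_reporter.h"

int main() {
  // Errors go through the stderr reporter that now lives in its own target.
  tflite::ErrorReporter* reporter = tflite::DefaultErrorReporter();

  // Register only the builtins the model actually uses.
  tflite::MutableOpResolver resolver;
  resolver.AddBuiltin(tflite::BuiltinOperator_ADD,
                      tflite::ops::builtin::Register_ADD());

  auto model = tflite::FlatBufferModel::BuildFromFile("add.bin", reporter);
  if (!model) return 1;

  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
  if (!interpreter || interpreter->AllocateTensors() != kTfLiteOk) return 1;
  return interpreter->Invoke() == kTfLiteOk ? 0 : 1;
}

In Bazel terms this corresponds to depending on //tensorflow/lite:mutable_op_resolver and //tensorflow/lite:stderr_reporter rather than the full framework target, which appears to be the motivation for giving these targets public visibility.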
@@ -373,6 +430,7 @@ cc_library( "core/macros.h", "tflite_with_xnnpack_optional.h", ], + compatible_with = get_compatible_with_portable(), copts = tflite_copts() + TFLITE_DEFAULT_COPTS, deps = [ "//tensorflow/lite/c:common", @@ -478,8 +536,10 @@ cc_test( data = [ "testdata/0_subgraphs.bin", "testdata/2_subgraphs.bin", + "testdata/add_shared_tensors.bin", "testdata/empty_model.bin", "testdata/multi_add_flex.bin", + "testdata/segment_sum_invalid_buffer.bin", "testdata/sparse_tensor.bin", "testdata/test_min_runtime.bin", "testdata/test_model.bin", @@ -564,10 +624,20 @@ cc_test( ], ) +cc_test( + name = "stderr_reporter_test", + srcs = ["stderr_reporter_test.cc"], + deps = [ + ":stderr_reporter", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "util", srcs = ["util.cc"], hdrs = ["util.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS + tflite_copts(), deps = [ ":kernel_api", @@ -611,6 +681,7 @@ cc_library( ], }), hdrs = ["minimal_logging.h"], + compatible_with = get_compatible_with_portable(), copts = TFLITE_DEFAULT_COPTS + tflite_copts(), linkopts = select({ "//tensorflow:android": ["-llog"], @@ -631,6 +702,7 @@ cc_library( "type_to_tflitetype.h", ], }), + compatible_with = get_compatible_with_portable(), deps = ["//tensorflow/lite/c:common"], ) @@ -660,6 +732,7 @@ cc_test( cc_library( name = "shared_library", hdrs = ["shared_library.h"], + compatible_with = get_compatible_with_portable(), linkopts = if_not_windows(["-ldl"]), ) @@ -668,6 +741,13 @@ cc_library( hdrs = ["core/macros.h"], ) +cc_library( + name = "stateful_error_reporter", + hdrs = ["stateful_error_reporter.h"], + compatible_with = get_compatible_with_portable(), + deps = ["//tensorflow/lite/core/api"], +) + # Shared lib target for convenience, pulls in the core runtime and builtin ops. # Note: This target is not yet finalized, and the exact set of exported (C/C++) # APIs is subject to change. The output library name is platform dependent: diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt index cfd8ebfc141..a75728e8a9d 100644 --- a/tensorflow/lite/CMakeLists.txt +++ b/tensorflow/lite/CMakeLists.txt @@ -20,8 +20,6 @@ # This has only been tested on Windows, Linux and macOS. # # The following are not currently supported: -# - GPU acceleration -# - Android # - iOS # - Micro backend # - Tests @@ -38,25 +36,35 @@ set(TENSORFLOW_SOURCE_DIR "" CACHE PATH "Directory that contains the TensorFlow project" ) if(NOT TENSORFLOW_SOURCE_DIR) - set(TENSORFLOW_SOURCE_DIR "${CMAKE_SOURCE_DIR}/../../") + get_filename_component(TENSORFLOW_SOURCE_DIR + "${CMAKE_CURRENT_LIST_DIR}/../../" + ABSOLUTE + ) endif() set(TF_SOURCE_DIR "${TENSORFLOW_SOURCE_DIR}/tensorflow") -set(TFLITE_SOURCE_DIR "${CMAKE_SOURCE_DIR}") -set(CMAKE_MODULE_PATH "${TFLITE_SOURCE_DIR}/tools/cmake/modules" ${CMAKE_MODULE_PATH}) -set(CMAKE_PREFIX_PATH "${TFLITE_SOURCE_DIR}/tools/cmake/modules" ${CMAKE_PREFIX_PATH}) +set(TFLITE_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}") +set(CMAKE_MODULE_PATH + "${TFLITE_SOURCE_DIR}/tools/cmake/modules" + ${CMAKE_MODULE_PATH} +) +set(CMAKE_PREFIX_PATH + "${TFLITE_SOURCE_DIR}/tools/cmake/modules" + ${CMAKE_PREFIX_PATH} +) +# b/168750039: To workaround absl module not found error on Android build. +set(absl_DIR ${CMAKE_MODULE_PATH}) option(TFLITE_ENABLE_RUY "Enable experimental RUY integration" OFF) option(TFLITE_ENABLE_RESOURCE "Enable experimental support for resources" ON) option(TFLITE_ENABLE_NNAPI "Enable NNAPI (Android only)." 
ON) option(TFLITE_ENABLE_MMAP "Enable MMAP (unsupported on Windows)" ON) -option(TFLITE_ENABLE_GPU "Enable GPU (not supported)" OFF) +option(TFLITE_ENABLE_GPU "Enable GPU" OFF) # This must be enabled when converting from TF models with SELECT_TF_OPS # enabled. # https://www.tensorflow.org/lite/guide/ops_select#converting_the_model # This is currently not supported. option(TFLITE_ENABLE_FLEX "Enable SELECT_TF_OPS" OFF) # TODO: Add support -option(TFLITE_ENABLE_XNNPACK "Enable XNNPACK backend" OFF) # TODO: Add XNNPACK -option(TFLITE_ENABLE_PROFILING "Enable profiling" OFF) +option(TFLITE_ENABLE_XNNPACK "Enable XNNPACK backend" ON) set(CMAKE_CXX_STANDARD 14) # Some components require C++14. set(CMAKE_CXX_STANDARD_REQUIRED ON) set(_TFLITE_ENABLE_NNAPI "${TFLITE_ENABLE_NNAPI}") @@ -120,40 +128,9 @@ find_package(gemmlowp REQUIRED) find_package(neon2sse REQUIRED) find_package(ruy REQUIRED) # Generate TensorFlow Lite FlatBuffer code. -# This is not currently neccessary since the generated code is checked into -# the repository but it would likely be preferable to do this in future. -# NOTE: This will not work for cross compilation (e.g for iOS, Android etc.) -# as flatc needs to be compiled with the host toolchain and this currently -# builds with the target toolchain. Instead this should recursively call -# cmake with the default host toolchain to build flatc. -set(TFLITE_FLATBUFFERS_SCHEMAS "${TFLITE_SOURCE_DIR}/schema/schema.fbs") -set(TFLITE_FLATBUFFERS_GEN_DIR - "${CMAKE_BINARY_DIR}/flatbuffers_generated/" -) -set(TFLITE_FLATBUFFERS_HDRS "") -foreach(INPUT_SCHEMA ${TFLITE_FLATBUFFERS_SCHEMAS}) - file(RELATIVE_PATH FILENAME "${TENSORFLOW_SOURCE_DIR}" "${INPUT_SCHEMA}") - get_filename_component(OUTPUT_DIR - "${TFLITE_FLATBUFFERS_GEN_DIR}/${FILENAME}" DIRECTORY - ) - get_filename_component(OUTPUT_BASENAME - "${FILENAME}" NAME_WE - ) - set(OUTPUT_FILENAME "${OUTPUT_DIR}/${OUTPUT_BASENAME}_generated.h") - list(APPEND TFLITE_FLATBUFFERS_HDRS "${OUTPUT_FILENAME}") - add_custom_command( - OUTPUT "${OUTPUT_FILENAME}" - COMMAND flatc - --cpp - --gen-mutable - --gen-object-api - --reflect-names - -I "${TENSORFLOW_SOURCE_DIR}" - -o "${OUTPUT_DIR}" - "${INPUT_SCHEMA}" - DEPENDS - "${INPUT_SCHEMA}") -endforeach() +# We used to have an actual compilation logic with flatc but decided to use +# schema_generated.h since flatc doesn't work with cross compilation. +set(TFLITE_FLATBUFFERS_SCHEMA_DIR "${TFLITE_SOURCE_DIR}/schema") set(TF_TARGET_PRIVATE_OPTIONS "") if(CMAKE_CXX_COMPILER_ID MATCHES "Clang$") # TensorFlow uses a heap of deprecated proto fields so surpress these @@ -175,6 +152,16 @@ if(CMAKE_SYSTEM_NAME MATCHES "Windows") # use of std::min std::max. # Use NOGDI to ERROR macro which breaks TensorFlow logging. list(APPEND TFLITE_TARGET_PRIVATE_OPTIONS "-DNOMINMAX" "-DNOGDI") + # lite/kernels/conv.cc has more than 64k sections so enable /bigobj to + # support compilation with MSVC2015. + if(MSVC) + list(APPEND TFLITE_TARGET_PRIVATE_OPTIONS "/bigobj") + elseif(CMAKE_COMPILER_IS_GNUCXX) + list(APPEND TFLITE_TARGET_PRIVATE_OPTIONS "-Wa,-mbig-obj") + endif() +endif() +if(CMAKE_SYSTEM_NAME MATCHES "Android") + find_library(ANDROID_LOG_LIB log) endif() # Build a list of source files to compile into the TF Lite library. populate_tflite_source_vars("." TFLITE_SRCS) @@ -203,9 +190,60 @@ if(TFLITE_ENABLE_FLEX) ) endif() if(TFLITE_ENABLE_GPU) - # Implementation is under delegates/gpu. 
- message(FATAL_ERROR - "GPU acceleration is not currently supported in CMake builds" + find_package(opencl_headers REQUIRED) + find_package(vulkan_headers REQUIRED) + populate_tflite_source_vars( + "delegates/gpu/cl" TFLITE_DELEGATES_GPU_CL_SRCS + FILTER "(_test|gl_interop|egl_sync)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/cl/kernels" TFLITE_DELEGATES_GPU_CL_KERNELS_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/cl/kernels/special" + TFLITE_DELEGATES_GPU_CL_KERNELS_SPECIAL_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/cl/selectors" TFLITE_DELEGATES_GPU_CL_SELECTORS_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/common" TFLITE_DELEGATES_GPU_COMMON_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/common/default" TFLITE_DELEGATES_GPU_COMMON_DEFAULT_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/common/memory_management" + TFLITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + populate_tflite_source_vars( + "delegates/gpu/common/transformations" + TFLITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_SRCS + FILTER "(_test)\\.(cc|h)$" + ) + list(APPEND TFLITE_DELEGATES_GPU_SRCS + ${TFLITE_SOURCE_DIR}/delegates/gpu/api.cc + ${TFLITE_SOURCE_DIR}/delegates/gpu/delegate.cc + ${TFLITE_DELEGATES_GPU_CL_SRCS} + ${TFLITE_DELEGATES_GPU_CL_KERNELS_SRCS} + ${TFLITE_DELEGATES_GPU_CL_KERNELS_SPECIAL_SRCS} + ${TFLITE_DELEGATES_GPU_CL_SELECTORS_SRCS} + ${TFLITE_SOURCE_DIR}/delegates/gpu/cl/selectors/default/default_selector.cc + ${TFLITE_DELEGATES_GPU_COMMON_SRCS} + ${TFLITE_DELEGATES_GPU_COMMON_DEFAULT_SRCS} + ${TFLITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_SRCS} + ${TFLITE_DELEGATES_GPU_COMMON_TRANSFORMATIONS_SRCS} + ) + list(APPEND TFLITE_TARGET_PUBLIC_OPTIONS "-DCL_DELEGATE_NO_GL" "-DEGL_NO_X11") + list(APPEND TFLITE_TARGET_DEPENDENCIES + absl::any + absl::flat_hash_map ) endif() if(_TFLITE_ENABLE_NNAPI) @@ -225,8 +263,13 @@ else() ) endif() if(TFLITE_ENABLE_XNNPACK) + find_package(xnnpack REQUIRED) populate_tflite_source_vars("delegates/xnnpack" TFLITE_DELEGATES_XNNPACK_SRCS + FILTER ".*(_test|_tester)\\.(cc|h)" + ) + list(APPEND TFLITE_TARGET_DEPENDENCIES + XNNPACK ) endif() if (TFLITE_ENABLE_RESOURCE) @@ -270,31 +313,30 @@ populate_tflite_source_vars("kernels/internal/reference/integer_ops" populate_tflite_source_vars("kernels/internal/reference/sparse_ops" TFLITE_KERNEL_INTERNAL_REF_SPARSE_OPS_SRCS ) -if(TFLITE_ENABLE_PROFILING) - populate_tflite_source_vars("profiling" TFLITE_KERNEL_PROFILING_SRCS) -endif() -populate_tflite_source_vars("tools/optimize" TFLITE_TOOLS_OPTIMIZE_SRCS) -populate_tflite_source_vars("tools/optimize/calibration" - TFLITE_TOOLS_OPTIMIZE_CALIBRATION_SRCS + +# Common include directories +set(TFLITE_INCLUDE_DIRS + "${TENSORFLOW_SOURCE_DIR}" + "${TFLITE_FLATBUFFERS_SCHEMA_DIR}" ) -populate_tflite_source_vars("tools/optimize/calibration/builtin_logging_ops" - TFLITE_TOOLS_OPTIMIZE_CALIBRATION_OPS_SRCS +include_directories( + BEFORE + ${TFLITE_INCLUDE_DIRS} ) -populate_tflite_source_vars("tools/optimize/sparsity" - TFLITE_TOOLS_OPTIMIZE_SPARSITY_SRCS -) -add_library(tensorflowlite + +# TFLite library +add_library(tensorflow-lite ${TFLITE_CORE_API_SRCS} ${TFLITE_CORE_SRCS} ${TFLITE_C_SRCS} ${TFLITE_DELEGATES_FLEX_SRCS} + ${TFLITE_DELEGATES_GPU_SRCS} ${TFLITE_DELEGATES_NNAPI_SRCS} ${TFLITE_DELEGATES_SRCS} ${TFLITE_DELEGATES_XNNPACK_SRCS} 
${TFLITE_EXPERIMENTAL_RESOURCE_SRCS} ${TFLITE_EXPERIMENTAL_RUY_PROFILER_SRCS} ${TFLITE_EXPERIMENTAL_RUY_SRCS} - ${TFLITE_FLATBUFFERS_HDRS} ${TFLITE_KERNEL_INTERNAL_OPT_INTEGER_OPS_SRCS} ${TFLITE_KERNEL_INTERNAL_OPT_SPARSE_OPS_SRCS} ${TFLITE_KERNEL_INTERNAL_OPT_SRCS} @@ -302,16 +344,18 @@ add_library(tensorflowlite ${TFLITE_KERNEL_INTERNAL_REF_SPARSE_OPS_SRCS} ${TFLITE_KERNEL_INTERNAL_REF_SRCS} ${TFLITE_KERNEL_INTERNAL_SRCS} - ${TFLITE_KERNEL_PROFILING_SRCS} ${TFLITE_KERNEL_SRCS} ${TFLITE_NNAPI_SRCS} ${TFLITE_SRCS} - ${TFLITE_TOOLS_OPTIMIZE_CALIBRATION_OPS_SRCS} - ${TFLITE_TOOLS_OPTIMIZE_CALIBRATION_SRCS} - ${TFLITE_TOOLS_OPTIMIZE_SPARSITY_SRCS} - ${TFLITE_TOOLS_OPTIMIZE_SRCS} + ${TFLITE_SOURCE_DIR}/profiling/platform_profiler.cc + ${TFLITE_SOURCE_DIR}/schema/schema_utils.cc + ${TFLITE_SOURCE_DIR}/tools/optimize/sparsity/format_converter.cc ) -target_link_libraries(tensorflowlite +target_include_directories(tensorflow-lite + PUBLIC + ${TFLITE_INCLUDE_DIRS} +) +target_link_libraries(tensorflow-lite PUBLIC Eigen3::Eigen NEON_2_SSE @@ -328,14 +372,80 @@ target_link_libraries(tensorflowlite ruy ${TFLITE_TARGET_DEPENDENCIES} ) -target_include_directories(tensorflowlite - PUBLIC - "${TENSORFLOW_SOURCE_DIR}" - PRIVATE - "${TFLITE_FLATBUFFERS_GEN_DIR}" -) -target_compile_options(tensorflowlite +target_compile_options(tensorflow-lite PUBLIC ${TFLITE_TARGET_PUBLIC_OPTIONS} PRIVATE ${TFLITE_TARGET_PRIVATE_OPTIONS} ) -add_library(tensorflow::tensorflowlite ALIAS tensorflowlite) +add_library(tensorflow::tensorflowlite ALIAS tensorflow-lite) + +# Benchmark Tool +populate_source_vars("${TFLITE_SOURCE_DIR}/tools/benchmark" + TFLITE_BENCHMARK_SRCS + FILTER "(_test|_plus_flex_main|_performance_options.*)\\.cc$" +) +list(APPEND TFLITE_BENCHMARK_SRCS + ${TF_SOURCE_DIR}/core/util/stats_calculator.cc + ${TFLITE_SOURCE_DIR}/profiling/memory_info.cc + ${TFLITE_SOURCE_DIR}/profiling/platform_profiler.cc + ${TFLITE_SOURCE_DIR}/profiling/profile_summarizer.cc + ${TFLITE_SOURCE_DIR}/profiling/profile_summary_formatter.cc + ${TFLITE_SOURCE_DIR}/profiling/time.cc + ${TFLITE_SOURCE_DIR}/tools/command_line_flags.cc + ${TFLITE_SOURCE_DIR}/tools/delegates/default_execution_provider.cc + ${TFLITE_SOURCE_DIR}/tools/evaluation/utils.cc + ${TFLITE_SOURCE_DIR}/tools/optimize/sparsity/format_converter.cc + ${TFLITE_SOURCE_DIR}/tools/tool_params.cc +) + +list(APPEND TFLITE_BENCHMARK_LIBS + tensorflow-lite + ${CMAKE_DL_LIBS} +) + +# TODO(b/171007016): Enable performance options on Windows. 
+if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Windows") + list(APPEND TFLITE_BENCHMARK_SRCS + ${TFLITE_SOURCE_DIR}/tools/benchmark/benchmark_performance_options.cc + ) +endif() + +if(TFLITE_ENABLE_XNNPACK) + list(APPEND TFLITE_BENCHMARK_SRCS + ${TFLITE_SOURCE_DIR}/tools/delegates/xnnpack_delegate_provider.cc + ) +else() + set(TFLITE_BENCHMARK_CC_OPTIONS "-DTFLITE_WITHOUT_XNNPACK") +endif() # TFLITE_ENABLE_XNNPACK + +if(CMAKE_SYSTEM_NAME MATCHES "Android") + list(APPEND TFLITE_BENCHMARK_SRCS + ${TFLITE_SOURCE_DIR}/profiling/atrace_profiler.cc + ) + if(_TFLITE_ENABLE_NNAPI) + list(APPEND TFLITE_BENCHMARK_SRCS + ${TFLITE_SOURCE_DIR}/tools/delegates/nnapi_delegate_provider.cc + ) + endif() # _TFLITE_ENABLE_NNAPI + list(APPEND TFLITE_BENCHMARK_LIBS + ${ANDROID_LOG_LIB} + absl::strings + ) +endif() # Android + +if(TFLITE_ENABLE_GPU) + list(APPEND TFLITE_BENCHMARK_SRCS + ${TFLITE_SOURCE_DIR}/tools/delegates/gpu_delegate_provider.cc + ) +endif() # TFLITE_ENABLE_GPU + +add_executable(benchmark_model + EXCLUDE_FROM_ALL + ${TFLITE_BENCHMARK_SRCS} +) +target_compile_options(benchmark_model + PRIVATE + ${TFLITE_BENCHMARK_CC_OPTIONS} +) +target_link_libraries(benchmark_model + ${TFLITE_BENCHMARK_LIBS} +) diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index dd5e3777fc1..b134a5de044 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -140,7 +140,7 @@ TfLiteStatus ArenaPlanner::PlanAllocations() { } // Count references to node input tensors. - for (size_t i = 0; i < graph_info_->num_nodes(); ++i) { + for (size_t i = 0; i < graph_info_->num_execution_nodes(); ++i) { const TfLiteNode& node = graph_info_->node(i); TfLiteIntArray* node_inputs = node.inputs; for (int j = 0; j < node_inputs->size; ++j) { @@ -158,7 +158,7 @@ TfLiteStatus ArenaPlanner::PlanAllocations() { } } // Go through the graph in execution order. - for (size_t i = 0; i < graph_info_->num_nodes(); ++i) { + for (size_t i = 0; i < graph_info_->num_execution_nodes(); ++i) { const TfLiteNode& node = graph_info_->node(i); // First queue output tensors for allocation. @@ -197,8 +197,8 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { dealloc_node_.resize(graph_info_->num_tensors(), kNodeNotAssigned); allocs_.resize(graph_info_->num_tensors()); // Set allocation and deallocation for temporary tensors. 
- for (size_t i = first_node; - i <= static_cast(last_node) && i < graph_info_->num_nodes(); + for (size_t i = first_node; i <= static_cast(last_node) && + i < graph_info_->num_execution_nodes(); ++i) { const TfLiteNode& node = graph_info_->node(i); TfLiteIntArray* node_temporaries = node.temporaries; diff --git a/tensorflow/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc index 813e10082a3..47ecc68cf40 100644 --- a/tensorflow/lite/arena_planner_test.cc +++ b/tensorflow/lite/arena_planner_test.cc @@ -134,7 +134,8 @@ class TestGraphInfo : public GraphInfo { TfLiteTensor* tensor(size_t index) override { return &graph_->tensors()->at(index); } - size_t num_nodes() const override { return graph_->nodes().size(); } + size_t num_execution_nodes() const override { return graph_->nodes().size(); } + size_t num_total_nodes() const override { return graph_->nodes().size(); } const TfLiteNode& node(size_t index) const override { return graph_->nodes()[index]; } diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index bdddac82d5b..0f731c43577 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -169,6 +169,33 @@ def tflite_cc_shared_object( def tf_to_tflite(name, src, options, out): """Convert a frozen tensorflow graphdef to TF Lite's flatbuffer. + Args: + name: Name of rule. + src: name of the input graphdef file. + options: options passed to TFLite Converter. + out: name of the output flatbuffer file. + """ + + toco_cmdline = " ".join([ + "$(location //tensorflow/lite/python:tflite_convert)", + "--experimental_new_converter", + ("--graph_def_file=$(location %s)" % src), + ("--output_file=$(location %s)" % out), + ] + options) + native.genrule( + name = name, + srcs = [src], + outs = [out], + cmd = toco_cmdline, + tools = ["//tensorflow/lite/python:tflite_convert"] + tf_binary_additional_srcs(), + ) + +def DEPRECATED_tf_to_tflite(name, src, options, out): + """DEPRECATED Convert a frozen tensorflow graphdef to TF Lite's flatbuffer, using toco. + + Please use tf_to_tflite instead. + TODO(b/138396996): Migrate away from this deprecated rule. + Args: name: Name of rule. src: name of the input graphdef file. @@ -742,27 +769,6 @@ def gen_model_coverage_test(src, model_name, data, failure_type, tags, size = "m ] + flex_dep(target_op_sets), ) -def if_tflite_experimental_runtime(if_eager, if_non_eager, if_none = []): - return select({ - "//tensorflow/lite:tflite_experimental_runtime_eager": if_eager, - "//tensorflow/lite:tflite_experimental_runtime_non_eager": if_non_eager, - "//conditions:default": if_none, - }) - -def tflite_experimental_runtime_linkopts(if_eager = [], if_non_eager = [], if_none = []): - return if_tflite_experimental_runtime( - if_eager = [ - # "//tensorflow/lite/experimental/tf_runtime:eager_interpreter", - # "//tensorflow/lite/experimental/tf_runtime:eager_model", - # "//tensorflow/lite/experimental/tf_runtime:subgraph", - ] + if_eager, - if_non_eager = [ - # "//tensorflow/lite/experimental/tf_runtime:interpreter", - # "//tensorflow/lite/experimental/tf_runtime:model", - ] + if_non_eager, - if_none = [] + if_none, - ) - def tflite_custom_cc_library( name, models = [], diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h index 85140289ac1..a37607f6260 100644 --- a/tensorflow/lite/builtin_ops.h +++ b/tensorflow/lite/builtin_ops.h @@ -24,7 +24,8 @@ extern "C" { #endif // __cplusplus // The enum for builtin operators. -// Note: CUSTOM and DELEGATE are 2 special ops which are not real built-in ops. 
+// Note: CUSTOM, DELEGATE, and PLACEHOLDER_FOR_GREATER_OP_CODES are 3 special +// ops which are not real built-in ops. typedef enum { kTfLiteBuiltinAdd = 0, kTfLiteBuiltinAveragePool2d = 1, @@ -153,6 +154,7 @@ typedef enum { kTfLiteBuiltinDensify = 124, kTfLiteBuiltinSegmentSum = 125, kTfLiteBuiltinBatchMatmul = 126, + kTfLiteBuiltinPlaceholderForGreaterOpCodes = 127, } TfLiteBuiltinOperator; #ifdef __cplusplus diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD index 5ac6d7881ac..e8db0dcf440 100644 --- a/tensorflow/lite/c/BUILD +++ b/tensorflow/lite/c/BUILD @@ -3,6 +3,7 @@ load( "tflite_cc_shared_object", "tflite_copts", ) +load("//tensorflow:tensorflow.bzl", "get_compatible_with_portable") package( default_visibility = ["//visibility:public"], @@ -46,6 +47,7 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":common", + "//tensorflow/lite:builtin_ops", "//tensorflow/lite:framework", "//tensorflow/lite/core/api", ], @@ -59,11 +61,12 @@ cc_library( deps = [ ":c_api_internal", ":common", + "//tensorflow/lite:builtin_ops", "//tensorflow/lite:framework", "//tensorflow/lite:version", - "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/kernels/internal:compatibility", ], alwayslink = 1, ) @@ -123,6 +126,8 @@ cc_library( "builtin_op_data.h", "common.h", ], + compatible_with = get_compatible_with_portable(), + deps = ["//tensorflow/lite:builtin_ops"], alwayslink = 1, ) diff --git a/tensorflow/lite/c/c_api.cc b/tensorflow/lite/c/c_api.cc index 4afd413ba9c..205c665d08b 100644 --- a/tensorflow/lite/c/c_api.cc +++ b/tensorflow/lite/c/c_api.cc @@ -16,10 +16,12 @@ limitations under the License. #include +#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" #include "tensorflow/lite/error_reporter.h" #include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/version.h" @@ -31,21 +33,55 @@ extern "C" { namespace { class CallbackErrorReporter : public tflite::ErrorReporter { public: - using ErrorCallback = void (*)(void* user_data, const char* format, - va_list args); - - CallbackErrorReporter(ErrorCallback callback, void* user_data) - : callback_(callback), user_data_(user_data) {} + explicit CallbackErrorReporter(TfLiteErrorReporterCallback callback) + : callback_(callback) {} int Report(const char* format, va_list args) override { - callback_(user_data_, format, args); + callback_.error_reporter(callback_.user_data, format, args); return 0; } private: - ErrorCallback callback_; - void* user_data_; + TfLiteErrorReporterCallback callback_; }; + +/// `CallbackOpResolver` is a (C++) `tflite::OpResolver` that forwards the +/// methods to (C ABI) callback functions from a `TfLiteOpResolverCallbacks` +/// struct. +/// +/// The SetCallbacks method must be called before calling any of the FindOp +/// methods. 
+class CallbackOpResolver : public ::tflite::OpResolver { + public: + CallbackOpResolver() {} + void SetCallbacks( + const struct TfLiteOpResolverCallbacks& op_resolver_callbacks) { + op_resolver_callbacks_ = op_resolver_callbacks; + } + const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, + int version) const override { + if (op_resolver_callbacks_.find_builtin_op == nullptr) { + return nullptr; + } + return op_resolver_callbacks_.find_builtin_op( + op_resolver_callbacks_.user_data, + static_cast(op), version); + } + const TfLiteRegistration* FindOp(const char* op, int version) const override { + if (op_resolver_callbacks_.find_custom_op == nullptr) { + return nullptr; + } + return op_resolver_callbacks_.find_custom_op( + op_resolver_callbacks_.user_data, op, version); + } + + private: + CallbackOpResolver(const CallbackOpResolver&) = delete; + CallbackOpResolver& operator=(const CallbackOpResolver&) = delete; + + struct TfLiteOpResolverCallbacks op_resolver_callbacks_ = {}; +}; + } // namespace // LINT.IfChange @@ -89,62 +125,16 @@ void TfLiteInterpreterOptionsSetErrorReporter( TfLiteInterpreterOptions* options, void (*reporter)(void* user_data, const char* format, va_list args), void* user_data) { - options->error_reporter = reporter; - options->error_reporter_user_data = user_data; + options->error_reporter_callback.error_reporter = reporter; + options->error_reporter_callback.user_data = user_data; } TfLiteInterpreter* TfLiteInterpreterCreate( const TfLiteModel* model, const TfLiteInterpreterOptions* optional_options) { - if (!model || !model->impl) { - return nullptr; - } - - std::unique_ptr optional_error_reporter; - if (optional_options && optional_options->error_reporter != nullptr) { - optional_error_reporter.reset( - new CallbackErrorReporter(optional_options->error_reporter, - optional_options->error_reporter_user_data)); - } - - // TODO(b/111881878): Allow use of C API without pulling in all builtin ops. tflite::ops::builtin::BuiltinOpResolver resolver; - if (optional_options) { - resolver.AddAll(optional_options->op_resolver); - } - tflite::ErrorReporter* error_reporter = optional_error_reporter - ? 
optional_error_reporter.get() - : tflite::DefaultErrorReporter(); - tflite::InterpreterBuilder builder(model->impl->GetModel(), resolver, - error_reporter); - - std::unique_ptr interpreter; - if (builder(&interpreter) != kTfLiteOk) { - return nullptr; - } - - if (optional_options) { - if (optional_options->num_threads != - TfLiteInterpreterOptions::kDefaultNumThreads) { - interpreter->SetNumThreads(optional_options->num_threads); - } - - if (optional_options->use_nnapi) { - if (interpreter->ModifyGraphWithDelegate(tflite::NnApiDelegate()) != - kTfLiteOk) { - return nullptr; - } - } - - for (auto* delegate : optional_options->delegates) { - if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) { - return nullptr; - } - } - } - - return new TfLiteInterpreter{model->impl, std::move(optional_error_reporter), - std::move(interpreter)}; + return tflite::internal::InterpreterCreateWithOpResolver( + model, optional_options, &resolver); } void TfLiteInterpreterDelete(TfLiteInterpreter* interpreter) { @@ -240,3 +230,77 @@ TfLiteStatus TfLiteTensorCopyToBuffer(const TfLiteTensor* tensor, #ifdef __cplusplus } // extern "C" #endif // __cplusplus + +namespace tflite { +namespace internal { + +TfLiteInterpreter* InterpreterCreateWithOpResolver( + const TfLiteModel* model, const TfLiteInterpreterOptions* optional_options, + tflite::MutableOpResolver* mutable_resolver) { + TFLITE_DCHECK_NE(mutable_resolver, nullptr); + if (!model || !model->impl) { + return nullptr; + } + + std::unique_ptr optional_error_reporter; + if (optional_options && + optional_options->error_reporter_callback.error_reporter != nullptr) { + optional_error_reporter.reset( + new CallbackErrorReporter(optional_options->error_reporter_callback)); + } + + // By default, we use the provided mutable_op_resolver, adding any builtin or + // custom ops registered with `TfLiteInterpreterOptionsAddBuiltinOp` and/or + // `TfLiteInterpreterOptionsAddCustomOp`. + tflite::OpResolver* op_resolver = mutable_resolver; + if (optional_options) { + mutable_resolver->AddAll(optional_options->mutable_op_resolver); + } + // However, if `TfLiteInterpreterOptionsSetOpResolver` has been called with + // a non-null callback parameter, then we instead use a + // `CallbackOpResolver` that will forward to the callbacks provided there. + CallbackOpResolver callback_op_resolver; + if (optional_options && + (optional_options->op_resolver_callbacks.find_builtin_op != nullptr || + optional_options->op_resolver_callbacks.find_custom_op != nullptr)) { + callback_op_resolver.SetCallbacks(optional_options->op_resolver_callbacks); + op_resolver = &callback_op_resolver; + } + + tflite::ErrorReporter* error_reporter = optional_error_reporter + ? 
optional_error_reporter.get() + : tflite::DefaultErrorReporter(); + tflite::InterpreterBuilder builder(model->impl->GetModel(), *op_resolver, + error_reporter); + + std::unique_ptr interpreter; + if (builder(&interpreter) != kTfLiteOk) { + return nullptr; + } + + if (optional_options) { + if (optional_options->num_threads != + TfLiteInterpreterOptions::kDefaultNumThreads) { + interpreter->SetNumThreads(optional_options->num_threads); + } + + if (optional_options->use_nnapi) { + if (interpreter->ModifyGraphWithDelegate(tflite::NnApiDelegate()) != + kTfLiteOk) { + return nullptr; + } + } + + for (auto* delegate : optional_options->delegates) { + if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) { + return nullptr; + } + } + } + + return new TfLiteInterpreter{model->impl, std::move(optional_error_reporter), + std::move(interpreter)}; +} + +} // namespace internal +} // namespace tflite diff --git a/tensorflow/lite/c/c_api.h b/tensorflow/lite/c/c_api.h index 880b80e69b4..152bcf986fe 100644 --- a/tensorflow/lite/c/c_api.h +++ b/tensorflow/lite/c/c_api.h @@ -188,7 +188,7 @@ TFL_CAPI_EXPORT extern int32_t TfLiteInterpreterGetOutputTensorCount( const TfLiteInterpreter* interpreter); // Returns the tensor associated with the output index. -// REQUIRES: 0 <= input_index < TfLiteInterpreterGetOutputTensorCount(tensor) +// REQUIRES: 0 <= output_index < TfLiteInterpreterGetOutputTensorCount(tensor) // // NOTE: The shape and underlying data buffer for output tensors may be not // be available until after the output tensor has been both sized and allocated. diff --git a/tensorflow/lite/c/c_api_experimental.cc b/tensorflow/lite/c/c_api_experimental.cc index cff1b3d1530..23a5ca7a275 100644 --- a/tensorflow/lite/c/c_api_experimental.cc +++ b/tensorflow/lite/c/c_api_experimental.cc @@ -23,7 +23,6 @@ limitations under the License. 
#include "tensorflow/lite/c/c_api.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/mutable_op_resolver.h" #ifdef __cplusplus extern "C" { @@ -38,8 +37,17 @@ void TfLiteInterpreterOptionsAddBuiltinOp( TfLiteInterpreterOptions* options, TfLiteBuiltinOperator op, const TfLiteRegistration* registration, int32_t min_version, int32_t max_version) { - options->op_resolver.AddBuiltin(static_cast(op), - registration, min_version, max_version); + options->mutable_op_resolver.AddBuiltin( + static_cast(op), registration, min_version, + max_version); +} + +TfLiteInterpreter* TfLiteInterpreterCreateWithSelectedOps( + const TfLiteModel* model, + const TfLiteInterpreterOptions* optional_options) { + tflite::MutableOpResolver resolver; + return tflite::internal::InterpreterCreateWithOpResolver( + model, optional_options, &resolver); } void TfLiteInterpreterOptionsAddCustomOp(TfLiteInterpreterOptions* options, @@ -47,7 +55,21 @@ void TfLiteInterpreterOptionsAddCustomOp(TfLiteInterpreterOptions* options, const TfLiteRegistration* registration, int32_t min_version, int32_t max_version) { - options->op_resolver.AddCustom(name, registration, min_version, max_version); + options->mutable_op_resolver.AddCustom(name, registration, min_version, + max_version); +} + +void TfLiteInterpreterOptionsSetOpResolver( + TfLiteInterpreterOptions* options, + const TfLiteRegistration* (*find_builtin_op)(void* user_data, + TfLiteBuiltinOperator op, + int version), + const TfLiteRegistration* (*find_custom_op)(void* user_data, const char* op, + int version), + void* op_resolver_user_data) { + options->op_resolver_callbacks.find_builtin_op = find_builtin_op; + options->op_resolver_callbacks.find_custom_op = find_custom_op; + options->op_resolver_callbacks.user_data = op_resolver_user_data; } void TfLiteInterpreterOptionsSetUseNNAPI(TfLiteInterpreterOptions* options, diff --git a/tensorflow/lite/c/c_api_experimental.h b/tensorflow/lite/c/c_api_experimental.h index 0398c385874..bfbdd9c8fdd 100644 --- a/tensorflow/lite/c/c_api_experimental.h +++ b/tensorflow/lite/c/c_api_experimental.h @@ -23,33 +23,99 @@ limitations under the License. extern "C" { #endif // __cplusplus -// Resets all variable tensors to zero. +/// Resets all variable tensors to zero. +/// +/// WARNING: This is an experimental API and subject to change. TFL_CAPI_EXPORT extern TfLiteStatus TfLiteInterpreterResetVariableTensors( TfLiteInterpreter* interpreter); -// Adds an op registration for a builtin operator. -// -// NOTE: The interpreter will make a copy of `registration` internally, so the -// caller should ensure that its contents (function pointers, etc...) remain -// valid for the duration of the interpreter's lifetime. A common practice is -// making the provided TfLiteRegistration instance static. +/// Adds an op registration for a builtin operator. +/// +/// Op registrations are used to map ops referenced in the flatbuffer model +/// to executable function pointers (`TfLiteRegistration`s). +/// +/// NOTE: The interpreter will make a shallow copy of `registration` internally, +/// so the caller should ensure that its contents (function pointers, etc...) +/// remain valid for the duration of the interpreter's lifetime. A common +/// practice is making the provided `TfLiteRegistration` instance static. +/// +/// Code that uses this function should NOT call +/// `TfLiteInterpreterOptionsSetOpResolver' on the same options object. 
+/// +/// WARNING: This is an experimental API and subject to change. TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddBuiltinOp( TfLiteInterpreterOptions* options, TfLiteBuiltinOperator op, const TfLiteRegistration* registration, int32_t min_version, int32_t max_version); -// Adds an op registration for a custom operator. -// -// NOTE: The interpreter will make a copy of `registration` internally, so the -// caller should ensure that its contents (function pointers, etc...) remain -// valid for the duration of any created interpreter's lifetime. A common -// practice is making the provided TfLiteRegistration instance static. +/// Adds an op registration for a custom operator. +/// +/// Op registrations are used to map ops referenced in the flatbuffer model +/// to executable function pointers (`TfLiteRegistration`s). +/// +/// NOTE: The interpreter will make a shallow copy of `registration` internally, +/// so the caller should ensure that its contents (function pointers, etc...) +/// remain valid for the duration of any created interpreter's lifetime. A +/// common practice is making the provided `TfLiteRegistration` instance static. +/// +/// Code that uses this function should NOT call +/// `TfLiteInterpreterOptionsSetOpResolver' on the same options object. +/// +/// WARNING: This is an experimental API and subject to change. TFL_CAPI_EXPORT void TfLiteInterpreterOptionsAddCustomOp( TfLiteInterpreterOptions* options, const char* name, const TfLiteRegistration* registration, int32_t min_version, int32_t max_version); -// Enable or disable the NN API for the interpreter (true to enable). +/// Registers callbacks for resolving builtin or custom operators. +/// +/// The `TfLiteInterpreterOptionsSetOpResolver` function provides an alternative +/// method for registering builtin ops and/or custom ops, by providing operator +/// resolver callbacks. Unlike using `TfLiteInterpreterOptionsAddBuiltinOp` +/// and/or `TfLiteInterpreterOptionsAddAddCustomOp`, these let you register all +/// the operators in a single call. +/// +/// Code that uses this function should NOT call +/// `TfLiteInterpreterOptionsAddBuiltin' or +/// `TfLiteInterpreterOptionsAddCustomOp' on the same options object. +/// +/// WARNING: This is an experimental API and subject to change. +void TfLiteInterpreterOptionsSetOpResolver( + TfLiteInterpreterOptions* options, + const TfLiteRegistration* (*find_builtin_op)(void* user_data, + TfLiteBuiltinOperator op, + int version), + const TfLiteRegistration* (*find_custom_op)(void* user_data, + const char* custom_op, + int version), + void* op_resolver_user_data); + +/// Returns a new interpreter using the provided model and options, or null on +/// failure, where the model uses only the operators explicitly added to the +/// options. This is the same as `TFLiteInterpreterCreate` from `c_api.h`, +/// except that the only operators that are supported are the ones registered +/// in `options` via calls to `TfLiteInterpreterOptionsSetOpResolver`, +/// `TfLiteInterpreterOptionsAddBuiltinOp`, and/or +/// `TfLiteInterpreterOptionsAddCustomOp`. +/// +/// * `model` must be a valid model instance. The caller retains ownership of +/// the object, and can destroy it immediately after creating the interpreter; +/// the interpreter will maintain its own reference to the underlying model +/// data. +/// * `options` should not be null. The caller retains ownership of the object, +/// and can safely destroy it immediately after creating the interpreter. 
+/// +/// NOTE: The client *must* explicitly allocate tensors before attempting to +/// access input tensor data or invoke the interpreter. +/// +/// WARNING: This is an experimental API and subject to change. +TFL_CAPI_EXPORT extern TfLiteInterpreter* +TfLiteInterpreterCreateWithSelectedOps(const TfLiteModel* model, + const TfLiteInterpreterOptions* options); + +/// Enable or disable the NN API for the interpreter (true to enable). +/// +/// WARNING: This is an experimental API and subject to change. TFL_CAPI_EXPORT extern void TfLiteInterpreterOptionsSetUseNNAPI( TfLiteInterpreterOptions* options, bool enable); diff --git a/tensorflow/lite/c/c_api_experimental_test.cc b/tensorflow/lite/c/c_api_experimental_test.cc index 18bc7bb0397..4de137ec0e6 100644 --- a/tensorflow/lite/c/c_api_experimental_test.cc +++ b/tensorflow/lite/c/c_api_experimental_test.cc @@ -23,8 +23,8 @@ limitations under the License. namespace { -TfLiteRegistration* GetDummyRegistration() { - static TfLiteRegistration registration = { +const TfLiteRegistration* GetDummyRegistration() { + static const TfLiteRegistration registration = { /*init=*/nullptr, /*free=*/nullptr, /*prepare=*/nullptr, @@ -53,6 +53,112 @@ TEST(CApiExperimentalTest, Smoke) { TfLiteModelDelete(model); } +// Test using TfLiteInterpreterCreateWithSelectedOps. +TEST(CApiExperimentalTest, SelectedBuiltins) { + TfLiteModel* model = + TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); + + TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + TfLiteInterpreterOptionsAddBuiltinOp(options, kTfLiteBuiltinAdd, + GetDummyRegistration(), 1, 1); + + TfLiteInterpreter* interpreter = + TfLiteInterpreterCreateWithSelectedOps(model, options); + ASSERT_NE(interpreter, nullptr); + ASSERT_EQ(TfLiteInterpreterAllocateTensors(interpreter), kTfLiteOk); + EXPECT_EQ(TfLiteInterpreterResetVariableTensors(interpreter), kTfLiteOk); + EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk); + + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); +} + +// Test that when using TfLiteInterpreterCreateWithSelectedOps, +// we do NOT get the standard builtin operators by default. +TEST(CApiExperimentalTest, MissingBuiltin) { + TfLiteModel* model = + TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); + + // Install a custom error reporter into the interpreter by way of options. + tflite::TestErrorReporter reporter; + TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + TfLiteInterpreterOptionsSetErrorReporter( + options, + [](void* user_data, const char* format, va_list args) { + reinterpret_cast(user_data)->Report(format, + args); + }, + &reporter); + + // Create an interpreter with no builtins at all. + TfLiteInterpreter* interpreter = + TfLiteInterpreterCreateWithSelectedOps(model, options); + + // Check that interpreter creation failed, because the model contain a buitin + // op that wasn't supported, and that we got the expected error messages. 
+ ASSERT_EQ(interpreter, nullptr); + EXPECT_EQ(reporter.error_messages(), + "Didn't find op for builtin opcode 'ADD' version '1'\n" + "Registration failed.\n"); + EXPECT_EQ(reporter.num_calls(), 2); + + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); +} + +struct OpResolverData { + bool called_for_add = false; +}; + +const TfLiteRegistration* MyFindBuiltinOp(void* user_data, + TfLiteBuiltinOperator op, + int version) { + OpResolverData* my_data = static_cast(user_data); + if (op == kTfLiteBuiltinAdd && version == 1) { + my_data->called_for_add = true; + return GetDummyRegistration(); + } + return nullptr; +} + +const TfLiteRegistration* MyFindCustomOp(void*, const char* custom_op, + int version) { + if (absl::string_view(custom_op) == "foo" && version == 1) { + return GetDummyRegistration(); + } + return nullptr; +} + +// Test using TfLiteInterpreterCreateWithSelectedOps. +TEST(CApiExperimentalTest, SetOpResolver) { + TfLiteModel* model = + TfLiteModelCreateFromFile("tensorflow/lite/testdata/add.bin"); + ASSERT_NE(model, nullptr); + + TfLiteInterpreterOptions* options = TfLiteInterpreterOptionsCreate(); + + OpResolverData my_data; + TfLiteInterpreterOptionsSetOpResolver(options, MyFindBuiltinOp, + MyFindCustomOp, &my_data); + EXPECT_FALSE(my_data.called_for_add); + + TfLiteInterpreter* interpreter = + TfLiteInterpreterCreateWithSelectedOps(model, options); + ASSERT_NE(interpreter, nullptr); + ASSERT_EQ(TfLiteInterpreterAllocateTensors(interpreter), kTfLiteOk); + EXPECT_EQ(TfLiteInterpreterResetVariableTensors(interpreter), kTfLiteOk); + EXPECT_EQ(TfLiteInterpreterInvoke(interpreter), kTfLiteOk); + EXPECT_TRUE(my_data.called_for_add); + + TfLiteInterpreterDelete(interpreter); + TfLiteInterpreterOptionsDelete(options); + TfLiteModelDelete(model); +} + } // namespace int main(int argc, char** argv) { diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h index f13712362a6..ee07e3e06a5 100644 --- a/tensorflow/lite/c/c_api_internal.h +++ b/tensorflow/lite/c/c_api_internal.h @@ -20,13 +20,15 @@ limitations under the License. #include #include +#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/mutable_op_resolver.h" -// Internal structures used by the C API. These are likely to change and should -// not be depended on directly by any C API clients. +// Internal structures and subroutines used by the C API. These are likely to +// change and should not be depended on directly by any C API clients. // // NOTE: This header does not follow C conventions and does not define a C API. // It is effectively an (internal) implementation detail of the C API. @@ -36,20 +38,54 @@ struct TfLiteModel { std::shared_ptr impl; }; +// The `TfLiteOpResolver` struct is an abstract callback interface that +// contains function pointers for callbacks that return a +// `TfLiteRegistration` given an op code or custom op name. This mechanism is +// used to map ops referenced in the flatbuffer model to executable function +// pointers (`TfLiteRegistration`s). +// This struct mirrors the tflite::OpResolver C++ abstract base class. +struct TfLiteOpResolverCallbacks { + // Opaque data that gets passed down to the callback functions. 
+ void* user_data = nullptr; + + // Callback that finds the op registration for a builtin operator by enum + // code. The `user_data` parameter will be set to the + // `op_resolver_user_data` value that was passed to + // `TfLiteInterpreterOptionsSetOpResolver`. + const TfLiteRegistration* (*find_builtin_op)(void* user_data, + TfLiteBuiltinOperator op, + int version); + // Callback that finds the op registration of a custom operator by op name. + // The `user_data` parameter will be set to the `op_resolver_user_data` value + // that was passed to `TfLiteInterpreterOptionsSetOpResolver`. + const TfLiteRegistration* (*find_custom_op)(void* user_data, const char* op, + int version); +}; + +// This struct mirrors the tflite::ErrorResolver C++ abstract base class. +struct TfLiteErrorReporterCallback { + // Opaque data that gets passed down to the callback function. + void* user_data = nullptr; + + // Callback function that reports an error. + void (*error_reporter)(void* user_data, const char* format, + va_list args) = nullptr; +}; + struct TfLiteInterpreterOptions { enum { kDefaultNumThreads = -1, }; int num_threads = kDefaultNumThreads; - tflite::MutableOpResolver op_resolver; + tflite::MutableOpResolver mutable_op_resolver; - void (*error_reporter)(void* user_data, const char* format, - va_list args) = nullptr; - void* error_reporter_user_data = nullptr; + TfLiteOpResolverCallbacks op_resolver_callbacks = {}; std::vector delegates; + TfLiteErrorReporterCallback error_reporter_callback; + bool use_nnapi = false; }; @@ -60,10 +96,38 @@ struct TfLiteInterpreter { // The interpreter does not take ownership of the provided ErrorReporter // instance, so we ensure its validity here. Note that the interpreter may use - // the reporter in its destructor, so it should be declared first. + // the reporter in its destructor, so the reporter should be declared first. std::unique_ptr optional_error_reporter; std::unique_ptr impl; }; +namespace tflite { +namespace internal { + +// This adds the builtin and/or custom operators specified in options in +// `optional_options` (if any) to `mutable_resolver`, and then returns a newly +// created TfLiteInterpreter using `mutable_op_resolver` as the default +// OpResolver, and using any other options in `optional_options`, and using +// the provided `model`. +// +// * `model` must be a valid model instance. The caller retains ownership of the +// object, and can destroy it immediately after creating the interpreter; the +// interpreter will maintain its own reference to the underlying model data. +// * `optional_options` may be null. The caller retains ownership of the object, +// and can safely destroy it immediately after creating the interpreter. +// * `mutable_resolver` must not be null. The caller retains ownership of the +// MutableOpResolver object, and can safely destroy it immediately after +// creating the interpreter. +// +// NOTE: The client *must* explicitly allocate tensors before attempting to +// access input tensor data or invoke the interpreter. 
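The callbacks struct above mirrors tflite::OpResolver, so a thin adapter is enough to plug the C callbacks into the C++ interpreter machinery. A minimal sketch of such an adapter follows; the class name and wiring are illustrative assumptions rather than the implementation in this change (the C builtin-operator enum and the schema enum are generated from the same source, so the cast below is value-preserving).

#include "tensorflow/lite/builtin_ops.h"
#include "tensorflow/lite/core/api/op_resolver.h"

// Illustrative adapter: exposes TfLiteOpResolverCallbacks through the
// tflite::OpResolver abstract interface consumed by InterpreterBuilder.
class CallbackOpResolverSketch : public ::tflite::OpResolver {
 public:
  explicit CallbackOpResolverSketch(const TfLiteOpResolverCallbacks& callbacks)
      : callbacks_(callbacks) {}

  const TfLiteRegistration* FindOp(::tflite::BuiltinOperator op,
                                   int version) const override {
    if (callbacks_.find_builtin_op == nullptr) return nullptr;
    return callbacks_.find_builtin_op(
        callbacks_.user_data, static_cast<TfLiteBuiltinOperator>(op), version);
  }

  const TfLiteRegistration* FindOp(const char* op, int version) const override {
    if (callbacks_.find_custom_op == nullptr) return nullptr;
    return callbacks_.find_custom_op(callbacks_.user_data, op, version);
  }

 private:
  TfLiteOpResolverCallbacks callbacks_;
};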
+ +TfLiteInterpreter* InterpreterCreateWithOpResolver( + const TfLiteModel* model, const TfLiteInterpreterOptions* optional_options, + tflite::MutableOpResolver* mutable_resolver); + +} // namespace internal +} // namespace tflite + #endif // TENSORFLOW_LITE_C_C_API_INTERNAL_H_ diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index d320a90d005..8917c254825 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -226,6 +226,17 @@ void TfLiteFloatArrayFree(TfLiteFloatArray* a); } \ } while (0) +#define TF_LITE_ENSURE_NEAR(context, a, b, epsilon) \ + do { \ + auto delta = ((a) > (b)) ? ((a) - (b)) : ((b) - (a)); \ + if (delta > epsilon) { \ + TF_LITE_KERNEL_LOG((context), "%s:%d %s not near %s (%f != %f)", \ + __FILE__, __LINE__, #a, #b, static_cast(a), \ + static_cast(b)); \ + return kTfLiteError; \ + } \ + } while (0) + #define TF_LITE_ENSURE_OK(context, status) \ do { \ const TfLiteStatus s = (status); \ @@ -410,7 +421,7 @@ typedef struct TfLiteCustomAllocation { size_t bytes; } TfLiteCustomAllocation; -// An tensor in the interpreter system which is a wrapper around a buffer of +// A tensor in the interpreter system which is a wrapper around a buffer of // data including a dimensionality (or NULL if not currently defined). #ifndef TF_LITE_STATIC_MEMORY typedef struct TfLiteTensor { diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD index a1e6fc41cd9..38b2e295da2 100644 --- a/tensorflow/lite/core/api/BUILD +++ b/tensorflow/lite/core/api/BUILD @@ -1,17 +1,16 @@ load("//tensorflow/lite:build_def.bzl", "tflite_copts") load("//tensorflow/lite/micro:build_def.bzl", "micro_copts") +load("//tensorflow:tensorflow.bzl", "get_compatible_with_portable") package( - default_visibility = ["//visibility:public"], + default_visibility = ["//visibility:private"], licenses = ["notice"], # Apache 2.0 ) cc_library( name = "api", srcs = [ - "error_reporter.cc", "flatbuffer_conversions.cc", - "op_resolver.cc", "tensor_utils.cc", ], hdrs = [ @@ -21,17 +20,67 @@ cc_library( "profiler.h", "tensor_utils.h", ], + compatible_with = get_compatible_with_portable(), copts = tflite_copts() + micro_copts(), + visibility = ["//visibility:public"], deps = [ + ":error_reporter", + ":op_resolver", "@flatbuffers//:runtime_cc", "//tensorflow/lite/c:common", # TODO(b/158301698): consider moving internal:compatibility to a more # central location. "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/schema:schema_utils", ], ) +# We define separate targets for "op_resolver" and "error_reporter", +# even though those headers are also exported by the "api" target, +# so that targets which only want to depend on these small abstract base +# class modules can express more fine-grained dependencies without +# pulling in tensor_utils and flatbuffer_conversions. 
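TF_LITE_ENSURE_NEAR, added to common.h above, logs through TF_LITE_KERNEL_LOG and returns kTfLiteError from the calling kernel when two values differ by more than the given epsilon. A minimal sketch of a kernel Prepare using it; the pass-through op and the scale check are illustrative assumptions, not code from this patch.

#include "tensorflow/lite/c/common.h"

// Hypothetical Prepare for a pass-through kernel: require the input and output
// quantization scales to agree within a small tolerance, then mirror the shape.
TfLiteStatus PassthroughPrepare(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = &context->tensors[node->inputs->data[0]];
  TfLiteTensor* output = &context->tensors[node->outputs->data[0]];
  TF_LITE_ENSURE_NEAR(context, input->params.scale, output->params.scale,
                      1e-5f);
  return context->ResizeTensor(context, output,
                               TfLiteIntArrayCopy(input->dims));
}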
+ +cc_library( + name = "op_resolver", + srcs = ["op_resolver.cc"], + hdrs = ["op_resolver.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + micro_copts(), + visibility = [ + "//visibility:public", + ], + deps = [ + ":error_reporter", + "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/schema:schema_utils", + "@flatbuffers//:runtime_cc", + ], +) + +cc_library( + name = "error_reporter", + srcs = ["error_reporter.cc"], + hdrs = ["error_reporter.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + micro_copts(), + visibility = [ + "//visibility:public", + ], + deps = [], +) + +cc_library( + name = "verifier", + hdrs = ["verifier.h"], + compatible_with = get_compatible_with_portable(), + copts = tflite_copts() + micro_copts(), + visibility = ["//visibility:public"], + deps = [":error_reporter"], +) + cc_test( name = "error_reporter_test", size = "small", @@ -48,6 +97,7 @@ cc_test( srcs = ["op_resolver_test.cc"], deps = [ ":api", + "//tensorflow/lite/schema:schema_utils", "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 5d2936f3636..77621c3f2fd 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -333,6 +333,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseReshape(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_RESIZE_BILINEAR: { + return ParseResizeBilinear(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: { return ParseResizeNearestNeighbor(op, error_reporter, allocator, builtin_data); @@ -346,6 +350,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseRsqrt(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_SHAPE: { + return ParseShape(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_SIN: { return ParseSin(op, error_reporter, allocator, builtin_data); } @@ -358,6 +366,10 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, return ParseSplit(op, error_reporter, allocator, builtin_data); } + case BuiltinOperator_SPLIT_V: { + return ParseSplitV(op, error_reporter, allocator, builtin_data); + } + case BuiltinOperator_SQRT: { return ParseSqrt(op, error_reporter, allocator, builtin_data); } @@ -560,22 +572,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_RESIZE_BILINEAR: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* schema_params = - op->builtin_options_as_ResizeBilinearOptions()) { - params->align_corners = schema_params->align_corners(); - params->half_pixel_centers = schema_params->half_pixel_centers(); - } else { - // Some older models did not populate the ResizeBilinearOptions field in - // the flatbuffer, so ensure it's set to a sensible default. 
- params->align_corners = false; - params->half_pixel_centers = false; - } - *builtin_data = params.release(); - return kTfLiteOk; - } case BuiltinOperator_SKIP_GRAM: { auto params = safe_allocator.Allocate(); TF_LITE_ENSURE(error_reporter, params != nullptr); @@ -619,15 +615,7 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_SPLIT_V: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* schema_params = op->builtin_options_as_SplitVOptions()) { - params->num_splits = schema_params->num_splits(); - } - *builtin_data = params.release(); - return kTfLiteOk; - } + case BuiltinOperator_SQUEEZE: { auto params = safe_allocator.Allocate(); TF_LITE_ENSURE(error_reporter, params != nullptr); @@ -667,16 +655,6 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, *builtin_data = params.release(); return kTfLiteOk; } - case BuiltinOperator_SHAPE: { - auto params = safe_allocator.Allocate(); - TF_LITE_ENSURE(error_reporter, params != nullptr); - if (const auto* schema_params = op->builtin_options_as_ShapeOptions()) { - TF_LITE_ENSURE_STATUS(ConvertTensorType( - schema_params->out_type(), ¶ms->out_type, error_reporter)); - } - *builtin_data = params.release(); - return kTfLiteOk; - } case BuiltinOperator_DELEGATE: { // TODO(ycling): Revisit when supporting saving delegated models. TF_LITE_REPORT_ERROR(error_reporter, @@ -825,6 +803,8 @@ TfLiteStatus ParseOpDataTfLite(const Operator* op, BuiltinOperator op_type, case BuiltinOperator_DENSIFY: case BuiltinOperator_SEGMENT_SUM: return kTfLiteOk; + case BuiltinOperator_PLACEHOLDER_FOR_GREATER_OP_CODES: + return kTfLiteError; } return kTfLiteError; } // NOLINT[readability/fn_size] @@ -1475,6 +1455,33 @@ TfLiteStatus ParseReshape(const Operator* op, ErrorReporter* error_reporter, return kTfLiteOk; } +TfLiteStatus ParseResizeBilinear(const Operator* op, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data) { + CheckParsePointerParams(op, error_reporter, allocator, builtin_data); + + SafeBuiltinDataAllocator safe_allocator(allocator); + std::unique_ptr + params = safe_allocator.Allocate(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + + const ResizeBilinearOptions* schema_params = + op->builtin_options_as_ResizeBilinearOptions(); + + if (schema_params != nullptr) { + params->align_corners = schema_params->align_corners(); + params->half_pixel_centers = schema_params->half_pixel_centers(); + } else { + params->align_corners = false; + params->half_pixel_centers = false; + } + + *builtin_data = params.release(); + return kTfLiteOk; +} + TfLiteStatus ParseResizeNearestNeighbor(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, @@ -1518,6 +1525,29 @@ TfLiteStatus ParseRsqrt(const Operator*, ErrorReporter*, BuiltinDataAllocator*, return kTfLiteOk; } +TfLiteStatus ParseShape(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data) { + SafeBuiltinDataAllocator safe_allocator(allocator); + std::unique_ptr + params = safe_allocator.Allocate(); + TF_LITE_ENSURE(error_reporter, params != nullptr); + + const ShapeOptions* schema_params = op->builtin_options_as_ShapeOptions(); + + if (schema_params != nullptr) { + TF_LITE_ENSURE_STATUS(ConvertTensorType(schema_params->out_type(), + ¶ms->out_type, error_reporter)); + } else { + // TODO(b/157480169): We should 
either return kTfLiteError or fill in some
+    // reasonable defaults in the params struct. We are not doing so until we
+    // better understand the ramifications of changing the legacy behavior.
+  }
+
+  *builtin_data = params.release();
+  return kTfLiteOk;
+}
+
 // We have this parse function instead of directly returning kTfLiteOk from the
 // switch-case in ParseOpData because this function is used as part of the
 // selective registration for the OpResolver implementation in micro.
@@ -1575,6 +1605,30 @@ TfLiteStatus ParseSplit(const Operator* op, ErrorReporter* error_reporter,
   return kTfLiteOk;
 }
 
+TfLiteStatus ParseSplitV(const Operator* op, ErrorReporter* error_reporter,
+                         BuiltinDataAllocator* allocator, void** builtin_data) {
+  CheckParsePointerParams(op, error_reporter, allocator, builtin_data);
+  SafeBuiltinDataAllocator safe_allocator(allocator);
+
+  std::unique_ptr<TfLiteSplitVParams, SafeBuiltinDataAllocator::BuiltinDataDeleter>
+      params = safe_allocator.Allocate<TfLiteSplitVParams>();
+  TF_LITE_ENSURE(error_reporter, params != nullptr);
+
+  const SplitVOptions* schema_params = op->builtin_options_as_SplitVOptions();
+
+  if (schema_params != nullptr) {
+    params->num_splits = schema_params->num_splits();
+  } else {
+    // TODO(b/157480169): We should either return kTfLiteError or fill in some
+    // reasonable defaults in the params struct. We are not doing so until we
+    // better understand the ramifications of changing the legacy behavior.
+  }
+
+  *builtin_data = params.release();
+  return kTfLiteOk;
+}
+
 // We have this parse function instead of directly returning kTfLiteOk from the
 // switch-case in ParseOpData because this function is used as part of the
 // selective registration for the OpResolver implementation in micro.
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h
index aaeb98c0a2e..136809977c9 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.h
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.h
@@ -45,7 +45,7 @@ class BuiltinDataAllocator {
   // platform targets support that properly.
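Factoring ParseResizeBilinear, ParseShape and ParseSplitV out of the big switch lets callers (notably the selective-registration OpResolver in micro, as the comment notes) parse a single op's builtin data without pulling in the full ParseOpData dispatch. A rough usage sketch; the MallocDataAllocator below is an illustrative stand-in for whatever BuiltinDataAllocator the caller supplies, with the Allocate/Deallocate signatures inferred from this header.

#include <cstdlib>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/core/api/flatbuffer_conversions.h"

// Illustrative allocator: hands out plain heap memory for parsed param structs.
class MallocDataAllocator : public tflite::BuiltinDataAllocator {
 public:
  void* Allocate(size_t size, size_t alignment_hint) override {
    return std::malloc(size);
  }
  void Deallocate(void* data) override { std::free(data); }
};

// Parses the SplitV options of `op` (a flatbuffer Operator) into a heap-owned
// TfLiteSplitVParams. The caller takes ownership of *out.
TfLiteStatus GetSplitVParams(const tflite::Operator* op,
                             tflite::ErrorReporter* reporter,
                             TfLiteSplitVParams** out) {
  MallocDataAllocator allocator;
  void* builtin_data = nullptr;
  if (tflite::ParseSplitV(op, reporter, &allocator, &builtin_data) !=
      kTfLiteOk) {
    return kTfLiteError;
  }
  *out = reinterpret_cast<TfLiteSplitVParams*>(builtin_data);
  return kTfLiteOk;
}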
static_assert(std::is_pod::value, "Builtin data structure must be POD."); void* allocated_memory = this->Allocate(sizeof(T), alignof(T)); - return new (allocated_memory) T; + return new (allocated_memory) T(); } virtual ~BuiltinDataAllocator() {} @@ -205,6 +205,11 @@ TfLiteStatus ParseRelu6(const Operator* op, ErrorReporter* error_reporter, TfLiteStatus ParseReshape(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseResizeBilinear(const Operator* op, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + TfLiteStatus ParseResizeNearestNeighbor(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, @@ -216,6 +221,9 @@ TfLiteStatus ParseRound(const Operator* op, ErrorReporter* error_reporter, TfLiteStatus ParseRsqrt(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseShape(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseSin(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); @@ -225,6 +233,9 @@ TfLiteStatus ParseSoftmax(const Operator* op, ErrorReporter* error_reporter, TfLiteStatus ParseSplit(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); +TfLiteStatus ParseSplitV(const Operator* op, ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, void** builtin_data); + TfLiteStatus ParseSqrt(const Operator* op, ErrorReporter* error_reporter, BuiltinDataAllocator* allocator, void** builtin_data); diff --git a/tensorflow/lite/core/api/op_resolver.cc b/tensorflow/lite/core/api/op_resolver.cc index c239d9ed23e..c5dffb63549 100644 --- a/tensorflow/lite/core/api/op_resolver.cc +++ b/tensorflow/lite/core/api/op_resolver.cc @@ -18,6 +18,7 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/schema/schema_utils.h" namespace tflite { @@ -26,7 +27,7 @@ TfLiteStatus GetRegistrationFromOpCode( ErrorReporter* error_reporter, const TfLiteRegistration** registration) { TfLiteStatus status = kTfLiteOk; *registration = nullptr; - auto builtin_code = opcode->builtin_code(); + auto builtin_code = GetBuiltinCode(opcode); int version = opcode->version(); if (builtin_code > BuiltinOperator_MAX || diff --git a/tensorflow/lite/core/api/op_resolver_test.cc b/tensorflow/lite/core/api/op_resolver_test.cc index 4dfca5c971a..44acc92ba8c 100644 --- a/tensorflow/lite/core/api/op_resolver_test.cc +++ b/tensorflow/lite/core/api/op_resolver_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/lite/schema/schema_utils.h" namespace tflite { namespace { diff --git a/tensorflow/lite/core/api/verifier.h b/tensorflow/lite/core/api/verifier.h new file mode 100644 index 00000000000..ca1cfb044bd --- /dev/null +++ b/tensorflow/lite/core/api/verifier.h @@ -0,0 +1,38 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +/// \file +/// Abstract interface for verifying a model. +#ifndef TENSORFLOW_LITE_CORE_API_VERIFIER_H_ +#define TENSORFLOW_LITE_CORE_API_VERIFIER_H_ + +#include "tensorflow/lite/core/api/error_reporter.h" + +namespace tflite { + +/// Abstract interface that verifies whether a given model is legit. +/// It facilitates the use-case to verify and build a model without loading it +/// twice. +/// (See also "tensorflow/lite/tools/verifier.h".) +class TfLiteVerifier { + public: + /// Returns true if the model is legit. + virtual bool Verify(const char* data, int length, + ErrorReporter* reporter) = 0; + virtual ~TfLiteVerifier() {} +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_CORE_API_VERIFIER_H_ diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index ecdb04c8b3c..2b9246a1100 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/lite/arena_planner.h" +#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/context_util.h" #include "tensorflow/lite/core/api/tensor_utils.h" @@ -30,8 +31,6 @@ limitations under the License. namespace tflite { -namespace impl { - namespace { struct TfLiteQuantizationDeleter { @@ -87,6 +86,7 @@ template bool HasDynamicTensorImpl(const TfLiteContext& context, const TensorIntArray& int_array) { for (int i : int_array) { + if (i == kTfLiteOptionalTensor) continue; const TfLiteTensor& tensor = context.tensors[i]; if (tensor.allocation_type == kTfLiteDynamic) { return true; @@ -167,9 +167,10 @@ class InterpreterInfo : public GraphInfo { TfLiteTensor* tensor(size_t index) override { return &subgraph_->tensors()[index]; } - size_t num_nodes() const override { + size_t num_execution_nodes() const override { return subgraph_->execution_plan().size(); } + size_t num_total_nodes() const override { return subgraph_->nodes_size(); } const TfLiteNode& node(size_t index) const override { int node_index = subgraph_->execution_plan()[index]; return subgraph_->nodes_and_registration()[node_index].first; @@ -582,6 +583,33 @@ TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices, return kTfLiteOk; } +// We have two arrays and we need to check that elements from one array don't +// show up in the other. We could sort both arrays and then iterate with two +// pointers from start to finish always increasing the smaller one but since +// these arrays are usually short (<25 elements for inputs, usually <3 for +// outputs), this might be slower than the naive approach (if arrays have size n +// and m, with n >> m ~ O(1), first approach is O(nlogn) whereas the other is +// O(n)). Plus, sorting the input and output arrays might not be something we +// want as it destroys ordering of elements. +// +// If it turns out that this is an issue, we can switch to the other algorithm. 
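For reference, the sort-based alternative mentioned above would look roughly like the sketch below (sorting copies so the caller's ordering is preserved); it only pays off if the input/output lists ever grow well beyond their current handful of elements, and it is illustrative rather than part of this change.

#include <algorithm>
#include <vector>

// O(n log n + m log m) overlap check between two index arrays.
bool HaveOverlappingIndices(const int* input_indices, int num_inputs,
                            const int* output_indices, int num_outputs) {
  std::vector<int> inputs(input_indices, input_indices + num_inputs);
  std::vector<int> outputs(output_indices, output_indices + num_outputs);
  std::sort(inputs.begin(), inputs.end());
  std::sort(outputs.begin(), outputs.end());
  size_t i = 0, j = 0;
  while (i < inputs.size() && j < outputs.size()) {
    if (inputs[i] == outputs[j]) return true;
    if (inputs[i] < outputs[j]) {
      ++i;
    } else {
      ++j;
    }
  }
  return false;
}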
+TfLiteStatus Subgraph::CheckInputAndOutputForOverlap(const int* input_indices, + int num_inputs, + const int* output_indices, + int num_outputs) { + for (int i = 0; i < num_inputs; i++) { + for (int j = 0; j < num_outputs; j++) { + if (input_indices[i] == output_indices[j]) { + ReportError("Tensor %d is both input %d and output %d\n", + input_indices[i], i, j); + consistent_ = false; + return kTfLiteError; + } + } + } + return kTfLiteOk; +} + namespace { // Multiply two sizes and return true if overflow occurred; // This is based off tensorflow/overflow.h but is simpler as we already @@ -674,13 +702,17 @@ TfLiteStatus Subgraph::ResetVariableTensors() { continue; } - // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be - // allocated after the initial `PrepareOpsAndTensors()` is called. - TF_LITE_ENSURE_EQ(&context_, tensor.allocation_type, - kTfLiteArenaRwPersistent); - TF_LITE_ENSURE(&context_, tensor.data.raw != nullptr); - - tflite::ResetVariableTensor(&tensor); + if (tensor.allocation_type == kTfLiteArenaRwPersistent) { + // If variable tensors allocation type is `kTfLiteArenaRwPersistent`, then + // they must be allocated after the initial `PrepareOpsAndTensors()` is + // called. + TF_LITE_ENSURE(&context_, tensor.data.raw != nullptr); + tflite::ResetVariableTensor(&tensor); + } else { + // If variable tensors allocation type is not `kTfLiteArenaRwPersistent`, + // then it can only be `kTfLiteCustom` in which case, we do not reset it. + TF_LITE_ENSURE_EQ(&context_, tensor.allocation_type, kTfLiteCustom); + } } return kTfLiteOk; } @@ -704,6 +736,16 @@ TfLiteStatus Subgraph::AddNodeWithParameters( &context_, CheckTensorIndices("node outputs", outputs.data(), outputs.size())); + // For builtin ops, inputs and outputs must not overlap. Custom ops must do + // this check by themselves if they don't support overlapping tensors. This + // distinction is to allow custom ops to just forward a tensor, reusing it as + // both input and output. + if (builtin_data != nullptr) { + TF_LITE_ENSURE_OK(&context_, CheckInputAndOutputForOverlap( + inputs.data(), inputs.size(), + outputs.data(), outputs.size())); + } + int new_node_index = nodes_and_registration_.size(); if (node_index) *node_index = new_node_index; nodes_and_registration_.resize(nodes_and_registration_.size() + 1); @@ -990,6 +1032,19 @@ TfLiteStatus Subgraph::Invoke() { tensor->data_is_stale) { TF_LITE_ENSURE_STATUS(EnsureTensorDataIsReadable(tensor_index)); } + if (tensor->data.raw == nullptr && tensor->bytes > 0) { + if (registration.builtin_code == kTfLiteBuiltinReshape && i == 1) { + // In general, having a tensor here with no buffer will be an error. + // However, for the reshape operator, the second input tensor is only + // used for the shape, not for the data. Thus, null buffer is ok. + continue; + } else { + // In all other cases, we need to return an error as otherwise we will + // trigger a null pointer dereference (likely). + ReportError("Input tensor %d lacks data", tensor_index); + return kTfLiteError; + } + } } if (check_cancelled_func_ != nullptr && @@ -1341,6 +1396,48 @@ TfLiteStatus Subgraph::UndoAllDelegates() { execution_plan_ = pre_delegation_execution_plan_; pre_delegation_execution_plan_.clear(); + // Handling FP16 delegation (if applies). + // + // First pass through execution plan to remember mapping of FP16 + // dequantizations in the graph. 
+ // This is required because delegates that support FP16 could remap supported + // nodes' inputs to point to their fp16 versions (if delegate supports fp16 + // acceleration). This remapping is performed in FP16GraphPartitionHelper in + // delegates/utils. We need to undo this remapping to ensure CPU kernels work. + std::vector fp16_to_fp32(tensors_size(), -1); + for (int execution_plan_index = 0; + execution_plan_index < execution_plan_.size(); ++execution_plan_index) { + int node_index = execution_plan_[execution_plan_index]; + auto& node_and_reg = nodes_and_registration_[node_index]; + const TfLiteNode& node = node_and_reg.first; + const TfLiteRegistration& reg = node_and_reg.second; + if (reg.builtin_code == kTfLiteBuiltinDequantize && + node.inputs->size == 1 && node.outputs->size == 1) { + const int input_idx = node.inputs->data[0]; + if (tensors_[input_idx].type == kTfLiteFloat16) { + fp16_to_fp32[input_idx] = node.outputs->data[0]; + } + } + } + // Second pass through the execution plan to remap applicable nodes' fp16 + // inputs to their original fp32 versions. Note that if a CPU kernel does + // support fp16, the model will not contain a DEQUANTIZE for its constant + // input. + for (int execution_plan_index = 0; + execution_plan_index < execution_plan_.size(); ++execution_plan_index) { + int node_index = execution_plan_[execution_plan_index]; + auto& node_and_reg = nodes_and_registration_[node_index]; + const TfLiteNode& node = node_and_reg.first; + const TfLiteRegistration& reg = node_and_reg.second; + if (reg.builtin_code == kTfLiteBuiltinDequantize) continue; + for (int i = 0; i < node.inputs->size; ++i) { + const int original_input_idx = node.inputs->data[i]; + if (tensors_[original_input_idx].type == kTfLiteFloat16) { + node.inputs->data[i] = fp16_to_fp32[original_input_idx]; + } + } + } + // Delegate nodes are appended to nodes_and_registration_. Therefore, // cleanup nodes_and_registration_ to only contain nodes from // pre_delegation_execution_plan_. @@ -1486,8 +1583,10 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TfLiteStatus Subgraph::SetCustomAllocationForTensor( int tensor_index, const TfLiteCustomAllocation& allocation) { TfLiteTensor* tensor = &context_.tensors[tensor_index]; - TF_LITE_ENSURE(context(), tensor->allocation_type == kTfLiteArenaRw || - tensor->allocation_type == kTfLiteCustom); + TF_LITE_ENSURE(context(), + (tensor->allocation_type == kTfLiteArenaRw || + tensor->allocation_type == kTfLiteArenaRwPersistent || + tensor->allocation_type == kTfLiteCustom)); TF_LITE_ENSURE_STATUS( ValidateCustomAllocationForTensor(context(), tensor, allocation)); @@ -1510,6 +1609,4 @@ TfLiteStatus Subgraph::SetCustomAllocationForTensor( return kTfLiteOk; } -} // namespace impl - } // namespace tflite diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 3a28b4cb99c..b94d1a0b2bc 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -30,14 +30,8 @@ limitations under the License. #include "tensorflow/lite/memory_planner.h" #include "tensorflow/lite/util.h" -#if TFLITE_EXPERIMENTAL_RUNTIME_EAGER -#include "tensorflow/lite/experimental/tf_runtime/public/subgraph.h" -#endif - namespace tflite { -namespace impl { - // Forward declare since NNAPIDelegate uses Interpreter. class NNAPIDelegate; @@ -342,8 +336,8 @@ class Subgraph { // for the tensor, it can no longer be reset to the TFLite arena memory. // // Parameters should satisfy the following conditions: - // 1. 
tensor->allocation_type == kTfLiteArenaRw - // In general, this is true for all non-constants such as I/O tensors. + // 1. tensor->allocation_type == kTfLiteArenaRw or kTfLiteArenaRwPersistent + // In general, this is true for I/O tensors & variable tensors. // 2. allocation->data has the appropriate permissions for runtime access // (Read-only for inputs, Read-Write for others), and outlives Interpreter. // 3. allocation->bytes >= tensor->bytes. @@ -457,6 +451,15 @@ class Subgraph { TfLiteStatus CheckTensorIndices(const char* label, const int* indices, int length); + // Check that the input indices and the output indices don't overlap. + // This is needed because same tensor must not be used both as input and + // output for an operator. + // NOTE: this changes consistent_ to be false if indices are out of bounds. + TfLiteStatus CheckInputAndOutputForOverlap(const int* input_indices, + int num_inputs, + const int* output_indices, + int num_outputs); + // Compute the number of bytes required to represent a tensor with dimensions // specified by the array dims (of length dims_size). Returns the status code // and bytes. @@ -739,13 +742,5 @@ class Subgraph { resource::ResourceMap* resources_ = nullptr; }; -} // namespace impl - -#if TFLITE_EXPERIMENTAL_RUNTIME_EAGER -using Subgraph = tflrt::Subgraph; -#else -using Subgraph = impl::Subgraph; -#endif - } // namespace tflite #endif // TENSORFLOW_LITE_CORE_SUBGRAPH_H_ diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index e1f91f32c34..d106ae4a738 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -14,6 +14,7 @@ # ============================================================================== load("//tensorflow/lite:build_def.bzl", "tflite_copts", "tflite_linkopts") +load("//tensorflow:tensorflow.bzl", "get_compatible_with_portable") package( default_visibility = ["//visibility:public"], @@ -23,6 +24,7 @@ package( cc_library( name = "status", hdrs = ["status.h"], + compatible_with = get_compatible_with_portable(), copts = tflite_copts(), deps = [ "//tensorflow/lite/c:common", @@ -33,6 +35,7 @@ cc_library( name = "utils", srcs = ["utils.cc"], hdrs = ["utils.h"], + compatible_with = get_compatible_with_portable(), copts = tflite_copts(), deps = [ "//tensorflow/lite:kernel_api", @@ -73,14 +76,19 @@ cc_test( ], deps = [ ":interpreter_utils", + ":utils", "//tensorflow/lite:framework", + "//tensorflow/lite:kernel_api", + "//tensorflow/lite:util", "//tensorflow/lite:version", "//tensorflow/lite/core/api", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/schema:schema_utils", "//tensorflow/lite/testing:util", + "//third_party/eigen3", "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/lite/delegates/delegate_test.cc b/tensorflow/lite/delegates/delegate_test.cc index aed4400ed99..b70ebdcc3aa 100644 --- a/tensorflow/lite/delegates/delegate_test.cc +++ b/tensorflow/lite/delegates/delegate_test.cc @@ -19,13 +19,21 @@ limitations under the License. 
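With the relaxation above, a custom allocation can also back arena-persistent (variable) tensors rather than only plain arena I/O tensors. A caller-side sketch, assuming the usual Interpreter-level wrapper around Subgraph::SetCustomAllocationForTensor; buffer size (and, depending on the build, alignment) must still satisfy ValidateCustomAllocationForTensor, and the buffer has to outlive its use by the interpreter.

#include <cstdint>
#include <vector>

#include "tensorflow/lite/interpreter.h"

// Hypothetical helper: point a tensor at caller-owned memory instead of the
// TFLite arena, then re-run planning so the allocation takes effect.
TfLiteStatus UseCallerBuffer(tflite::Interpreter* interpreter, int tensor_index,
                             std::vector<uint8_t>* buffer) {
  TfLiteCustomAllocation allocation;
  allocation.data = buffer->data();
  allocation.bytes = buffer->size();
  if (interpreter->SetCustomAllocationForTensor(tensor_index, allocation) !=
      kTfLiteOk) {
    return kTfLiteError;
  }
  return interpreter->AllocateTensors();
}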
#include #include +#include "third_party/eigen3/Eigen/Core" +#include "tensorflow/lite/builtin_op_data.h" +#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/delegates/interpreter_utils.h" +#include "tensorflow/lite/delegates/utils.h" #include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/interpreter_builder.h" +#include "tensorflow/lite/kernels/builtin_op_kernels.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/schema/schema_utils.h" #include "tensorflow/lite/testing/util.h" +#include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" namespace tflite { @@ -41,9 +49,12 @@ TfLiteRegistration AddOpRegistration() { reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { // Set output size to input size - const TfLiteTensor* input1 = GetInput(context, node, 0); - const TfLiteTensor* input2 = GetInput(context, node, 1); - TfLiteTensor* output = GetOutput(context, node, 0); + const TfLiteTensor* input1; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input1)); + const TfLiteTensor* input2; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &input2)); + TfLiteTensor* output; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); for (int i = 0; i < input1->dims->size; ++i) { @@ -57,13 +68,16 @@ TfLiteRegistration AddOpRegistration() { reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { // Copy input data to output data. - const TfLiteTensor* a0 = GetInput(context, node, 0); + const TfLiteTensor* a0; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &a0)); TF_LITE_ENSURE(context, a0); TF_LITE_ENSURE(context, a0->data.f); - const TfLiteTensor* a1 = GetInput(context, node, 1); + const TfLiteTensor* a1; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &a1)); TF_LITE_ENSURE(context, a1); TF_LITE_ENSURE(context, a1->data.f); - TfLiteTensor* out = GetOutput(context, node, 0); + TfLiteTensor* out; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &out)); TF_LITE_ENSURE(context, out); TF_LITE_ENSURE(context, out->data.f); int num = a0->dims->data[0]; @@ -266,7 +280,8 @@ class TestDelegate : public ::testing::Test { a0 = GetInput(context, node, 0); a1 = a0; } - TfLiteTensor* out = GetOutput(context, node, 0); + TfLiteTensor* out; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &out)); int num = 1; for (int i = 0; i < a0->dims->size; ++i) { num *= a0->dims->data[i]; @@ -288,8 +303,10 @@ class TestDelegate : public ::testing::Test { reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { // Shapes should already by propagated by the runtime, just need to // check. 
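The mechanical change running through this test (and the kernels generally) is replacing the raw GetInput/GetOutput accessors with the status-returning GetInputSafe/GetOutputSafe and bailing out via TF_LITE_ENSURE_OK. Consolidated into one place, the pattern looks roughly like the skeleton below; the copy op itself is an illustrative assumption.

#include <cstring>

#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/kernel_util.h"

namespace {

TfLiteStatus CopyPrepare(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, tflite::GetInputSafe(context, node, 0, &input));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context, tflite::GetOutputSafe(context, node, 0, &output));
  return context->ResizeTensor(context, output,
                               TfLiteIntArrayCopy(input->dims));
}

TfLiteStatus CopyInvoke(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, tflite::GetInputSafe(context, node, 0, &input));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context, tflite::GetOutputSafe(context, node, 0, &output));
  std::memcpy(output->data.raw, input->data.raw, input->bytes);
  return kTfLiteOk;
}

}  // namespace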
- const TfLiteTensor* input1 = GetInput(context, node, 0); - TfLiteTensor* output = GetOutput(context, node, 0); + const TfLiteTensor* input1; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input1)); + TfLiteTensor* output; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); const int input_dims_size = input1->dims->size; TF_LITE_ENSURE(context, output->dims->size == input_dims_size); for (int i = 0; i < input_dims_size; ++i) { @@ -314,7 +331,8 @@ class TestDelegate : public ::testing::Test { input1 = GetInput(context, node, 0); input2 = input1; } - TfLiteTensor* output = GetOutput(context, node, 0); + TfLiteTensor* output; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); TF_LITE_ENSURE_STATUS(context->ResizeTensor( context, output, TfLiteIntArrayCopy(input1->dims))); @@ -526,6 +544,35 @@ TEST_F(TestDelegate, SecondDelegationInvokeFailure) { } } +// This test ensures that node indices in multi-delegate application are handled +// correctly by the TFLite partitioning algorithm. +TEST_F(TestDelegate, TwoDelegates_ExecutionPlanIndicesDifferent) { + // First delegate supports nodes 0, 1. + // After this delegation, the execution plan size is 2. + delegate_ = std::unique_ptr( + new SimpleDelegate({0, 1}, kTfLiteDelegateFlagsAllowDynamicTensors)); + // Second delegate supports (original) node index 2. + // The execution plan has 2 nodes, so this verifies that the partitioning + // algorithm correctly refers to (original) node indices instead of execution + // plan indices. + delegate2_ = std::unique_ptr( + new SimpleDelegate({2}, kTfLiteDelegateFlagsNone)); + + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + ASSERT_EQ(interpreter_->execution_plan().size(), 2); + + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate2_->get_tf_lite_delegate()), + kTfLiteOk); + ASSERT_EQ(interpreter_->execution_plan().size(), 2); + + // Verify Invoke works. + ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk); +} + TEST_F(TestDelegate, StaticDelegateMakesGraphImmutable) { delegate_ = std::unique_ptr(new SimpleDelegate({0, 1, 2})); ASSERT_EQ( @@ -1139,11 +1186,14 @@ class TestDelegateWithDynamicTensors : public ::testing::Test { reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { // Output 0 is dynamic - TfLiteTensor* output0 = GetOutput(context, node, 0); + TfLiteTensor* output0; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output0)); SetTensorToDynamic(output0); // Output 1 has the same shape as input. - const TfLiteTensor* input = GetInput(context, node, 0); - TfLiteTensor* output1 = GetOutput(context, node, 1); + const TfLiteTensor* input; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); + TfLiteTensor* output1; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 1, &output1)); TF_LITE_ENSURE_STATUS(context->ResizeTensor( context, output1, TfLiteIntArrayCopy(input->dims))); return kTfLiteOk; @@ -1163,11 +1213,14 @@ class TestDelegateWithDynamicTensors : public ::testing::Test { // If tensors are resized, the runtime should propagate shapes // automatically if correct flag is set. Ensure values are correct. // Output 0 should be dynamic. - TfLiteTensor* output0 = GetOutput(context, node, 0); + TfLiteTensor* output0; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output0)); TF_LITE_ENSURE(context, IsDynamicTensor(output0)); // Output 1 has the same shape as input. 
- const TfLiteTensor* input = GetInput(context, node, 0); - TfLiteTensor* output1 = GetOutput(context, node, 1); + const TfLiteTensor* input; + TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); + TfLiteTensor* output1; + TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 1, &output1)); TF_LITE_ENSURE(context, input->dims->size == output1->dims->size); TF_LITE_ENSURE(context, input->dims->data[0] == output1->dims->data[0]); return kTfLiteOk; @@ -1240,6 +1293,294 @@ TEST_F(TestDelegateWithDynamicTensors, ShapePropagation_FlagNotSet) { ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteError); } +// Tests for FP16 graphs +// ===================== + +// Tests delegate functionality related to FP16 graphs. +// Model architecture: +// 1->DEQ->2 4->DEQ->5 7->DEQ->8 10->DEQ->11 +// | | | | +// 0----->ADD->3----->ADD->6----->MUL->9------>ADD-->12 +// Input: 0, Output:12. +// All constants are 2, so the function is: (x + 2 + 2) * 2 + 2 = 2x + 10 +// +// Delegate only supports ADD, so can have upto two delegated partitions. +// TODO(b/156707497): Add more cases here once we have landed CPU kernels +// supporting FP16. +class TestFP16Delegation : public ::testing::TestWithParam { + protected: + void SetUp() override { + interpreter_.reset(new Interpreter); + interpreter_->AddTensors(13); + interpreter_->SetInputs({0}); + interpreter_->SetOutputs({12}); + + float16_const_ = Eigen::half_impl::float_to_half_rtne(2.f); + + // TENSORS. + TfLiteQuantizationParams quant; + // Input. + interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {1}, + quant); + // fp16 constant, dequantize output, Add0 output. + interpreter_->SetTensorParametersReadOnly( + 1, kTfLiteFloat16, "", {1}, quant, + reinterpret_cast(&float16_const_), sizeof(TfLiteFloat16)); + interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {1}, + quant); + interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {1}, + quant); + // fp16 constant, dequantize output, Add1 output. + interpreter_->SetTensorParametersReadOnly( + 4, kTfLiteFloat16, "", {1}, quant, + reinterpret_cast(&float16_const_), sizeof(TfLiteFloat16)); + interpreter_->SetTensorParametersReadWrite(5, kTfLiteFloat32, "", {1}, + quant); + interpreter_->SetTensorParametersReadWrite(6, kTfLiteFloat32, "", {1}, + quant); + // fp16 constant, dequantize output, Mul0 output. + interpreter_->SetTensorParametersReadOnly( + 7, kTfLiteFloat16, "", {1}, quant, + reinterpret_cast(&float16_const_), sizeof(TfLiteFloat16)); + interpreter_->SetTensorParametersReadWrite(8, kTfLiteFloat32, "", {1}, + quant); + interpreter_->SetTensorParametersReadWrite(9, kTfLiteFloat32, "", {1}, + quant); + // fp16 constant, dequantize output, Add2 output. + interpreter_->SetTensorParametersReadOnly( + 10, kTfLiteFloat16, "", {1}, quant, + reinterpret_cast(&float16_const_), sizeof(TfLiteFloat16)); + interpreter_->SetTensorParametersReadWrite(11, kTfLiteFloat32, "", {1}, + quant); + interpreter_->SetTensorParametersReadWrite(12, kTfLiteFloat32, "", {1}, + quant); + + // NODES. 
+ auto* add_reg = ops::builtin::Register_ADD(); + auto* mul_reg = ops::builtin::Register_MUL(); + auto* deq_reg = ops::builtin::Register_DEQUANTIZE(); + add_reg->builtin_code = kTfLiteBuiltinAdd; + deq_reg->builtin_code = kTfLiteBuiltinDequantize; + mul_reg->builtin_code = kTfLiteBuiltinMul; + TfLiteAddParams* builtin_data0 = + reinterpret_cast(malloc(sizeof(TfLiteAddParams))); + TfLiteAddParams* builtin_data1 = + reinterpret_cast(malloc(sizeof(TfLiteAddParams))); + TfLiteMulParams* builtin_data2 = + reinterpret_cast(malloc(sizeof(TfLiteMulParams))); + TfLiteAddParams* builtin_data3 = + reinterpret_cast(malloc(sizeof(TfLiteAddParams))); + builtin_data0->activation = kTfLiteActNone; + builtin_data1->activation = kTfLiteActNone; + builtin_data2->activation = kTfLiteActNone; + builtin_data3->activation = kTfLiteActNone; + interpreter_->AddNodeWithParameters({1}, {2}, nullptr, 0, nullptr, deq_reg); + interpreter_->AddNodeWithParameters({0, 2}, {3}, nullptr, 0, builtin_data0, + add_reg); + interpreter_->AddNodeWithParameters({4}, {5}, nullptr, 0, nullptr, deq_reg); + interpreter_->AddNodeWithParameters({3, 5}, {6}, nullptr, 0, builtin_data1, + add_reg); + interpreter_->AddNodeWithParameters({7}, {8}, nullptr, 0, nullptr, deq_reg); + interpreter_->AddNodeWithParameters({6, 8}, {9}, nullptr, 0, builtin_data2, + mul_reg); + interpreter_->AddNodeWithParameters({10}, {11}, nullptr, 0, nullptr, + deq_reg); + interpreter_->AddNodeWithParameters({9, 11}, {12}, nullptr, 0, + builtin_data3, add_reg); + } + + void VerifyInvoke() { + std::vector input = {3.0f}; + std::vector expected_output = {16.0f}; + + const int input_tensor_idx = interpreter_->inputs()[0]; + const int output_tensor_idx = interpreter_->outputs()[0]; + + memcpy(interpreter_->typed_tensor(input_tensor_idx), input.data(), + sizeof(float)); + ASSERT_EQ(interpreter_->Invoke(), kTfLiteOk); + TfLiteTensor* output_tensor = interpreter_->tensor(output_tensor_idx); + for (int i = 0; i < 1; ++i) { + EXPECT_EQ(output_tensor->data.f[i], expected_output[i]) << i; + } + } + + void TearDown() override { interpreter_.reset(); } + + protected: + class FP16Delegate { + public: + // Uses FP16GraphPartitionHelper to accept ADD nodes with fp16 input. + explicit FP16Delegate(int num_delegated_subsets, + bool fail_node_prepare = false, + bool fail_node_invoke = false) + : num_delegated_subsets_(num_delegated_subsets), + fail_delegate_node_prepare_(fail_node_prepare), + fail_delegate_node_invoke_(fail_node_invoke) { + delegate_.Prepare = [](TfLiteContext* context, + TfLiteDelegate* delegate) -> TfLiteStatus { + auto* fp16_delegate = static_cast(delegate->data_); + // FP16 graph partitioning. 
+ delegates::IsNodeSupportedFn node_supported_fn = + [=](TfLiteContext* context, TfLiteNode* node, + TfLiteRegistration* registration, + std::string* unsupported_details) -> bool { + return registration->builtin_code == kTfLiteBuiltinAdd; + }; + delegates::FP16GraphPartitionHelper partition_helper(context, + node_supported_fn); + TfLiteIntArray* nodes_to_separate = nullptr; + if (partition_helper.Partition(nullptr) != kTfLiteOk) { + nodes_to_separate = TfLiteIntArrayCreate(0); + } else { + std::vector ops_to_replace = + partition_helper.GetNodesOfFirstNLargestPartitions( + fp16_delegate->num_delegated_subsets()); + nodes_to_separate = ConvertVectorToTfLiteIntArray(ops_to_replace); + } + + context->ReplaceNodeSubsetsWithDelegateKernels( + context, fp16_delegate->FakeFusedRegistration(), nodes_to_separate, + delegate); + TfLiteIntArrayFree(nodes_to_separate); + return kTfLiteOk; + }; + delegate_.CopyFromBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* output) -> TfLiteStatus { return kTfLiteOk; }; + delegate_.FreeBufferHandle = nullptr; + delegate_.CopyToBufferHandle = nullptr; + // Store type-punned data SimpleDelegate structure. + delegate_.data_ = static_cast(this); + delegate_.flags = kTfLiteDelegateFlagsNone; + } + + TfLiteRegistration FakeFusedRegistration() { + TfLiteRegistration reg = {nullptr}; + reg.custom_name = "fake_fp16_add_op"; + + // Different flavors of the delegate kernel's Invoke(), dependent on + // testing parameters. + if (fail_delegate_node_invoke_) { + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + return kTfLiteError; + }; + } else { + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + float output = 0; + for (int i = 0; i < node->inputs->size; ++i) { + const TfLiteTensor* input_tensor = GetInput(context, node, i); + if (input_tensor->type == kTfLiteFloat32) { + output += input_tensor->data.f[0]; + } else { + // All constants are 2. + output += 2; + } + } + TfLiteTensor* out = GetOutput(context, node, 0); + out->data.f[0] = output; + return kTfLiteOk; + }; + } + + // Different flavors of the delegate kernel's Prepare(), dependent on + // testing parameters. 
+ if (fail_delegate_node_prepare_) { + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + return kTfLiteError; + }; + } else { + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input = GetInput(context, node, 0); + TfLiteTensor* output = GetOutput(context, node, 0); + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input->dims))); + return kTfLiteOk; + }; + } + + return reg; + } + + TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } + + int num_delegated_subsets() { return num_delegated_subsets_; } + + private: + TfLiteDelegate delegate_; + int num_delegated_subsets_; + bool fail_delegate_node_prepare_ = false; + bool fail_delegate_node_invoke_ = false; + }; + + std::unique_ptr interpreter_; + std::unique_ptr delegate_; + Eigen::half float16_const_; +}; + +TEST_P(TestFP16Delegation, NonDelegatedInterpreterWorks) { + ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk); + VerifyInvoke(); +} + +TEST_P(TestFP16Delegation, DelegationWorks) { + delegate_ = std::unique_ptr( + new FP16Delegate(/**num_delegated_subsets**/ GetParam())); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + // Should have 5 nodes: delegate, mul, add2 & 2 dequantize (one for mul & + // add2). + ASSERT_EQ(interpreter_->execution_plan().size(), 5); + VerifyInvoke(); +} + +TEST_P(TestFP16Delegation, DelegatePrepareFails) { + delegate_ = std::unique_ptr(new FP16Delegate( + /**num_delegated_subsets**/ GetParam(), /**fail_node_prepare**/ true)); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteDelegateError); + // Delegation failed, but runtime should go back to correct previous state. 
+ ASSERT_EQ(interpreter_->execution_plan().size(), 8); + VerifyInvoke(); +} + +TEST_P(TestFP16Delegation, DelegateInvokeWithCPUFallback) { + delegate_ = std::unique_ptr(new FP16Delegate( + /**num_delegated_subsets**/ GetParam(), /**fail_node_prepare**/ false, + /**fail_node_invoke**/ true)); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + + std::vector input = {3.0f}; + std::vector expected_output = {16.0f}; + + const int input_tensor_idx = interpreter_->inputs()[0]; + const int output_tensor_idx = interpreter_->outputs()[0]; + + memcpy(interpreter_->typed_tensor(input_tensor_idx), input.data(), + sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + TfLiteTensor* output_tensor = interpreter_->tensor(output_tensor_idx); + for (int i = 0; i < 1; ++i) { + EXPECT_EQ(output_tensor->data.f[i], expected_output[i]) << i; + } + + ASSERT_EQ(interpreter_->execution_plan().size(), 8); + VerifyInvoke(); +} + +INSTANTIATE_TEST_SUITE_P(TestFP16Delegation, TestFP16Delegation, + ::testing::Values(1, 2)); + } // namespace } // namespace tflite diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 6210007361a..098159d9d26 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -1,5 +1,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_opts_nortti_if_lite_protos") +load("//tensorflow/lite:build_def.bzl", "tflite_copts") load("//tensorflow/lite/delegates/flex:build_def.bzl", "tflite_flex_cc_library") +load("//tensorflow:tensorflow.bzl", "get_compatible_with_cloud") # # This is a TF Lite delegate that is powered by TensorFlow's Eager. @@ -84,6 +86,7 @@ cc_library( hdrs = [ "delegate.h", ], + copts = tflite_copts(), visibility = ["//visibility:public"], deps = [ ":buffer_map", @@ -124,10 +127,13 @@ tf_cc_test( name = "delegate_test", size = "small", srcs = ["delegate_test.cc"], - tags = ["no_gpu"], # GPU + flex is not officially supported. + tags = [ + "no_gpu", # GPU + flex is not officially supported. + ], deps = [ ":delegate", ":test_util", + "//tensorflow/lite:shared_library", "//tensorflow/lite/kernels:test_util", "@com_google_googletest//:gtest", ], @@ -177,8 +183,8 @@ tf_cc_test( srcs = ["kernel_test.cc"], tags = ["no_gpu"], # GPU + flex is not officially supported. deps = [ + ":delegate", ":delegate_data", - ":delegate_only_runtime", ":test_util", "@com_google_googletest//:gtest", ], @@ -241,6 +247,7 @@ cc_library( "allowlisted_flex_ops.h", "allowlisted_flex_ops_internal.h", ], + compatible_with = get_compatible_with_cloud(), deps = select({ "//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -276,3 +283,15 @@ tf_cc_test( ], }), ) + +# Alias to support selective build of image ops. +# TODO(b/163285312): Remove after tensorflow/core refactoring completed. 
+cc_library( + name = "portable_images_lib", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:portable_gif_internal", + "//tensorflow/core:portable_jpeg_internal", + "//tensorflow/core/lib/png:png_io", + ], +) diff --git a/tensorflow/lite/delegates/flex/allowlisted_flex_ops.cc b/tensorflow/lite/delegates/flex/allowlisted_flex_ops.cc index eefbeb72b15..eee1c99ed58 100644 --- a/tensorflow/lite/delegates/flex/allowlisted_flex_ops.cc +++ b/tensorflow/lite/delegates/flex/allowlisted_flex_ops.cc @@ -68,6 +68,10 @@ const std::set& GetFlexAllowlist() { "AvgPoolGrad", "BatchMatMul", "BatchMatMulV2", + "BatchMatrixDiag", + "BatchMatrixDiagPart", + "BatchMatrixInverse", + "BatchMatrixSetDiag", "BatchNormWithGlobalNormalization", "BatchNormWithGlobalNormalizationGrad", "BatchToSpace", @@ -75,7 +79,20 @@ const std::set& GetFlexAllowlist() { "BiasAdd", "BiasAddGrad", "BiasAddV1", + "Bincount", + "Bitcast", + "BitwiseAnd", + "BitwiseOr", + "BitwiseXor", "BoostedTreesBucketize", + "BoostedTreesCreateQuantileStreamResource", + "BoostedTreesFlushQuantileSummaries", + "BoostedTreesMakeQuantileSummaries", + "BoostedTreesQuantileStreamResourceAddSummaries", + "BoostedTreesQuantileStreamResourceDeserialize", + "BoostedTreesQuantileStreamResourceFlush", + "BoostedTreesQuantileStreamResourceGetBucketBoundaries", + "BoostedTreesQuantileStreamResourceHandleOp", "BroadcastArgs", "BroadcastGradientArgs", "BroadcastTo", @@ -85,6 +102,7 @@ const std::set& GetFlexAllowlist() { "Cast", "Ceil", "CheckNumerics", + "CheckNumericsV2", "CombinedNonMaxSuppression", "Complex", "ComplexAbs", @@ -99,6 +117,9 @@ const std::set& GetFlexAllowlist() { "Conv2DBackpropFilter", "Conv2DBackpropInput", "Conv3D", + "Conv3DBackpropFilter", + "Conv3DBackpropFilterV2", + "Conv3DBackpropInput", "Conv3DBackpropInputV2", "Cos", "Cosh", @@ -107,21 +128,32 @@ const std::set& GetFlexAllowlist() { "CropAndResizeGradImage", "Cumprod", "Cumsum", + "CumulativeLogsumexp", "DataFormatDimMap", "DataFormatVecPermute", "DebugGradientIdentity", "DebugGradientRefIdentity", + "DecodeAndCropJpeg", "DecodeBase64", + "DecodeBmp", + "DecodeGif", + "DecodeImage", + "DecodeJpeg", + "DecodePng", "DecodeRaw", "DecodeWav", "DeepCopy", "DeleteSessionTensor", + "DenseBincount", "DepthToSpace", "DepthwiseConv2dNative", "Dequantize", "DestroyTemporaryVariable", "Diag", + "DiagPart", "Dilation2D", + "Dilation2DBackpropFilter", + "Dilation2DBackpropInput", "Div", "DivNoNan", "DynamicPartition", @@ -130,7 +162,11 @@ const std::set& GetFlexAllowlist() { "Elu", "EluGrad", "Empty", + "EmptyTensorList", "EncodeBase64", + "EncodeJpeg", + "EncodeJpegVariableQuality", + "EncodePng", "EncodeWav", "EnsureShape", "Enter", @@ -172,6 +208,7 @@ const std::set& GetFlexAllowlist() { "GetSessionTensor", "Greater", "GreaterEqual", + "HistogramSummary", "IFFT", "IFFT2D", "IFFT3D", @@ -182,6 +219,7 @@ const std::set& GetFlexAllowlist() { "IdentityN", "Imag", "ImageProjectiveTransformV2", + "ImageProjectiveTransformV3", "ImmutableConst", "InTopK", "InTopKV2", @@ -190,13 +228,16 @@ const std::set& GetFlexAllowlist() { "InplaceUpdate", "Inv", "InvGrad", + "Invert", "InvertPermutation", + "IsBoostedTreesQuantileStreamResourceInitialized", "IsFinite", "IsNan", "IsVariableInitialized", "LRN", "LeakyRelu", "LeakyReluGrad", + "LeftShift", "Less", "LessEqual", "LinSpace", @@ -209,6 +250,9 @@ const std::set& GetFlexAllowlist() { "LoopCond", "MatMul", "MatrixDiag", + "MatrixDiagPart", + "MatrixDiagPartV2", + "MatrixDiagPartV3", "MatrixDiagV2", "MatrixDiagV3", "MatrixInverse", @@ 
-218,6 +262,8 @@ const std::set& GetFlexAllowlist() { "Max", "MaxPool", "MaxPool3D", + "MaxPool3DGrad", + "MaxPool3DGradGrad", "MaxPoolGrad", "MaxPoolGradGrad", "MaxPoolGradGradV2", @@ -228,6 +274,7 @@ const std::set& GetFlexAllowlist() { "Maximum", "Mean", "Merge", + "MergeSummary", "MergeV2Checkpoints", "Mfcc", "Min", @@ -244,6 +291,7 @@ const std::set& GetFlexAllowlist() { "NonMaxSuppressionV2", "NonMaxSuppressionV3", "NonMaxSuppressionV4", + "NonMaxSuppressionV5", "NonMaxSuppressionWithOverlaps", "NotEqual", "OneHot", @@ -253,15 +301,18 @@ const std::set& GetFlexAllowlist() { "PadV2", "PaddingFIFOQueue", "PaddingFIFOQueueV2", + "ParallelConcat", "ParallelDynamicStitch", "ParseExample", "ParseExampleV2", "ParseSequenceExample", + "ParseSequenceExampleV2", "ParseSingleExample", "ParseSingleSequenceExample", "Placeholder", "PlaceholderV2", "PlaceholderWithDefault", + "PopulationCount", "Pow", "PreventGradient", "Print", @@ -302,10 +353,14 @@ const std::set& GetFlexAllowlist() { "RFFT", "RFFT2D", "RFFT3D", + "RaggedBincount", + "RaggedGather", "RaggedRange", "RaggedTensorToSparse", "RaggedTensorToTensor", "RandomGamma", + "RandomPoisson", + "RandomPoissonV2", "RandomStandardNormal", "RandomUniform", "RandomUniformInt", @@ -315,6 +370,7 @@ const std::set& GetFlexAllowlist() { "RealDiv", "Reciprocal", "ReciprocalGrad", + "Recv", "ReduceJoin", "RefEnter", "RefExit", @@ -342,22 +398,31 @@ const std::set& GetFlexAllowlist() { "ResourceApplyAdagradDA", "ResourceApplyAdagradV2", "ResourceApplyAdam", + "ResourceApplyAdamWithAmsgrad", "ResourceApplyAddSign", "ResourceApplyCenteredRMSProp", "ResourceApplyFtrl", "ResourceApplyFtrlV2", "ResourceApplyGradientDescent", + "ResourceApplyKerasMomentum", "ResourceApplyMomentum", "ResourceApplyPowerSign", "ResourceApplyProximalAdagrad", "ResourceApplyProximalGradientDescent", "ResourceApplyRMSProp", + "ResourceScatterNdAdd", + "ResourceScatterNdMax", + "ResourceScatterNdMin", + "ResourceScatterNdSub", + "ResourceScatterNdUpdate", "ResourceSparseApplyAdadelta", "ResourceSparseApplyAdagrad", "ResourceSparseApplyAdagradDA", + "ResourceSparseApplyAdagradV2", "ResourceSparseApplyCenteredRMSProp", "ResourceSparseApplyFtrl", "ResourceSparseApplyFtrlV2", + "ResourceSparseApplyKerasMomentum", "ResourceSparseApplyMomentum", "ResourceSparseApplyProximalAdagrad", "ResourceSparseApplyProximalGradientDescent", @@ -369,14 +434,23 @@ const std::set& GetFlexAllowlist() { "Reverse", "ReverseSequence", "ReverseV2", + "RightShift", "Round", "Rsqrt", "RsqrtGrad", + "SampleDistortedBoundingBox", "SampleDistortedBoundingBoxV2", "Save", "SaveSlices", "SaveV2", + "ScalarSummary", "ScatterNd", + "ScatterNdAdd", + "ScatterNdMax", + "ScatterNdMin", + "ScatterNdNonAliasingAdd", + "ScatterNdSub", + "ScatterNdUpdate", "SegmentMax", "SegmentMean", "SegmentMin", @@ -386,6 +460,7 @@ const std::set& GetFlexAllowlist() { "SelectV2", "Selu", "SeluGrad", + "Send", "Shape", "ShapeN", "ShardedFilename", @@ -409,6 +484,7 @@ const std::set& GetFlexAllowlist() { "SparseApplyAdadelta", "SparseApplyAdagrad", "SparseApplyAdagradDA", + "SparseApplyAdagradV2", "SparseApplyCenteredRMSProp", "SparseApplyFtrl", "SparseApplyFtrlV2", @@ -416,6 +492,7 @@ const std::set& GetFlexAllowlist() { "SparseApplyProximalAdagrad", "SparseApplyProximalGradientDescent", "SparseApplyRMSProp", + "SparseBincount", "SparseCross", "SparseCrossHashed", "SparseCrossV2", @@ -446,12 +523,14 @@ const std::set& GetFlexAllowlist() { "StackPush", "StackPushV2", "StackV2", + "StatelessMultinomial", "StatelessRandomGammaV2", 
"StatelessRandomNormal", "StatelessRandomPoisson", "StatelessRandomUniform", "StatelessRandomUniformFullInt", "StatelessRandomUniformInt", + "StatelessSampleDistortedBoundingBox", "StatelessTruncatedNormal", "StaticRegexReplace", "StopGradient", @@ -459,8 +538,10 @@ const std::set& GetFlexAllowlist() { "StridedSliceAssign", "StridedSliceGrad", "StringJoin", + "StringLower", "StringSplit", "StringSplitV2", + "StringStrip", "StringToHashBucket", "StringToHashBucketFast", "StringToHashBucketStrong", @@ -506,6 +587,31 @@ const std::set& GetFlexAllowlist() { "TensorArrayWrite", "TensorArrayWriteV2", "TensorArrayWriteV3", + "TensorListConcat", + "TensorListConcatLists", + "TensorListConcatV2", + "TensorListElementShape", + "TensorListFromTensor", + "TensorListGather", + "TensorListGetItem", + "TensorListLength", + "TensorListPopBack", + "TensorListPushBack", + "TensorListPushBackBatch", + "TensorListReserve", + "TensorListResize", + "TensorListScatter", + "TensorListScatterIntoExistingList", + "TensorListScatterV2", + "TensorListSetItem", + "TensorListSplit", + "TensorListStack", + "TensorScatterAdd", + "TensorScatterMax", + "TensorScatterMin", + "TensorScatterSub", + "TensorScatterUpdate", + "TensorStridedSliceUpdate", "Tile", "TileGrad", "Timestamp", @@ -527,21 +633,30 @@ const std::set& GetFlexAllowlist() { "UnsortedSegmentMin", "UnsortedSegmentProd", "UnsortedSegmentSum", + "UnwrapDatasetVariant", "Variable", "VariableV2", "Where", + "WrapDatasetVariant", "Xdivy", + "Xlog1py", "Xlogy", "ZerosLike", "_Arg", "_ArrayToList", + "_DeviceArg", + "_DeviceRetval", + "_FusedConv2D", "_HostCast", "_HostRecv", "_HostSend", "_ListToArray", + "_ParallelConcatStart", + "_ParallelConcatUpdate", "_Recv", "_Retval", "_Send", + "_SwitchN", // go/keep-sorted end }); return *allowlisted_flex_ops; diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc index c2611290c1b..86ea4b849ea 100644 --- a/tensorflow/lite/delegates/flex/buffer_map.cc +++ b/tensorflow/lite/delegates/flex/buffer_map.cc @@ -149,6 +149,11 @@ tensorflow::Tensor BufferMap::GetTensor(int tensor_index) const { return id_to_tensor_.at(tensor_index); } +const tensorflow::Tensor* BufferMap::GetTensorPtr(int tensor_index) const { + auto& tensor = id_to_tensor_.at(tensor_index); + return &tensor; +} + void BufferMap::SetFromTfLite(int tensor_index, const TfLiteTensor* tensor) { tensorflow::TensorShape shape; int num_dims = tensor->dims->size; diff --git a/tensorflow/lite/delegates/flex/buffer_map.h b/tensorflow/lite/delegates/flex/buffer_map.h index 6c35895c249..6a29c7f80dc 100644 --- a/tensorflow/lite/delegates/flex/buffer_map.h +++ b/tensorflow/lite/delegates/flex/buffer_map.h @@ -47,6 +47,11 @@ class BufferMap { // Precondition: HasTensor() is true. tensorflow::Tensor GetTensor(int tensor_index) const; + // Returns the const pointer to tensorflow::Tensor associated with the given + // 'tensor_index'. + // Precondition: HasTensor() is true. + const tensorflow::Tensor* GetTensorPtr(int tensor_index) const; + // Associates the given tensorflow::Tensor with the given 'tensor_index'. // Note that TensorFlow Tensors share data buffers, so this method is only a // shallow copy. 
diff --git a/tensorflow/lite/delegates/flex/build_def.bzl b/tensorflow/lite/delegates/flex/build_def.bzl index 9b9f1b2c4cb..5826e1f83cd 100644 --- a/tensorflow/lite/delegates/flex/build_def.bzl +++ b/tensorflow/lite/delegates/flex/build_def.bzl @@ -2,6 +2,7 @@ load( "//tensorflow:tensorflow.bzl", + "clean_dep", "if_android", "if_ios", "if_mobile", @@ -46,12 +47,12 @@ def generate_flex_kernel_header( ["$(location %s)" % f for f in models], ) list_ops_output = include_path + "/list_flex_ops" - list_ops_tool = "//tensorflow/lite/tools:list_flex_ops_main" + list_ops_tool = clean_dep("//tensorflow/lite/tools:list_flex_ops_main") if additional_deps: tf_cc_binary( name = "%s_list_flex_ops_main" % name, deps = [ - "//tensorflow/lite/tools:list_flex_ops_main_lib", + clean_dep("//tensorflow/lite/tools:list_flex_ops_main_lib"), ] + additional_deps, ) list_ops_tool = ":%s_list_flex_ops_main" % name @@ -66,12 +67,12 @@ def generate_flex_kernel_header( ) # Generate the kernel registration header file from list of flex ops. - tool = "//tensorflow/python/tools:print_selective_registration_header" + tool = clean_dep("//tensorflow/python/tools:print_selective_registration_header") native.genrule( name = "%s_kernel_registration" % name, srcs = [list_ops_output], outs = [header], - tools = [tool], + exec_tools = [tool], message = "Processing %s..." % list_ops_output, cmd = ("$(location " + tool + ")" + " --default_ops=\"\"" + @@ -95,7 +96,7 @@ def tflite_flex_cc_library( additional_deps: Dependencies for additional TF ops. visibility: visibility of the generated rules. """ - portable_tensorflow_lib = "//tensorflow/core:portable_tensorflow_lib" + portable_tensorflow_lib = clean_dep("//tensorflow/core:portable_tensorflow_lib") if models: CUSTOM_KERNEL_HEADER = generate_flex_kernel_header( name = "%s_tf_op_headers" % name, @@ -108,9 +109,9 @@ def tflite_flex_cc_library( native.cc_library( name = "%s_tensorflow_lib" % name, srcs = if_mobile([ - "//tensorflow/core:portable_op_registrations_and_gradients", - "//tensorflow/core/kernels:android_core_ops", - "//tensorflow/core/kernels:android_extended_ops", + clean_dep("//tensorflow/core:portable_op_registrations_and_gradients"), + clean_dep("//tensorflow/core/kernels:android_core_ops"), + clean_dep("//tensorflow/core/kernels:android_extended_ops"), ]) + [CUSTOM_KERNEL_HEADER.header], copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_lite_protos() + if_ios(["-Os"]), defines = [ @@ -126,7 +127,7 @@ def tflite_flex_cc_library( CUSTOM_KERNEL_HEADER.include_path, ], textual_hdrs = [ - "//tensorflow/core/kernels:android_all_ops_textual_hdrs", + clean_dep("//tensorflow/core/kernels:android_all_ops_textual_hdrs"), ], visibility = visibility, deps = [ @@ -135,10 +136,11 @@ def tflite_flex_cc_library( "//third_party/eigen3", "@com_google_absl//absl/types:optional", "@gemmlowp", - "//tensorflow/core:protos_all_cc", "@icu//:common", - "//tensorflow/core:portable_tensorflow_lib_lite", - "//tensorflow/core/platform:strong_hash", + clean_dep("//tensorflow/core:protos_all_cc"), + clean_dep("//tensorflow/core:portable_tensorflow_lib_lite"), + clean_dep("//tensorflow/core/platform:strong_hash"), + clean_dep("//tensorflow/lite/delegates/flex:portable_images_lib"), ], alwayslink = 1, ) @@ -148,23 +150,23 @@ def tflite_flex_cc_library( native.cc_library( name = name, hdrs = [ - "//tensorflow/lite/delegates/flex:delegate.h", + clean_dep("//tensorflow/lite/delegates/flex:delegate.h"), ], visibility = visibility, deps = [ - 
"//tensorflow/lite/delegates/flex:delegate_data", - "//tensorflow/lite/delegates/flex:delegate_only_runtime", - "//tensorflow/lite/delegates/utils:simple_delegate", + clean_dep("//tensorflow/lite/delegates/flex:delegate_data"), + clean_dep("//tensorflow/lite/delegates/flex:delegate_only_runtime"), + clean_dep("//tensorflow/lite/delegates/utils:simple_delegate"), ] + select({ - "//tensorflow:android": [ + clean_dep("//tensorflow:android"): [ portable_tensorflow_lib, ], - "//tensorflow:ios": [ + clean_dep("//tensorflow:ios"): [ portable_tensorflow_lib, ], "//conditions:default": [ - "//tensorflow/core:tensorflow", - "//tensorflow/lite/c:common", + clean_dep("//tensorflow/core:tensorflow"), + clean_dep("//tensorflow/lite/c:common"), ], }) + additional_deps, alwayslink = 1, @@ -202,21 +204,21 @@ def tflite_flex_jni_library( native.cc_library( name = "%s_flex_native" % name, srcs = [ - "//tensorflow/lite/testing:init_tensorflow.h", - "//tensorflow/lite/testing:init_tensorflow.cc", - "//tensorflow/lite/delegates/flex/java/src/main/native:flex_delegate_jni.cc", + clean_dep("//tensorflow/lite/testing:init_tensorflow.h"), + clean_dep("//tensorflow/lite/testing:init_tensorflow.cc"), + clean_dep("//tensorflow/lite/delegates/flex/java/src/main/native:flex_delegate_jni.cc"), ], copts = tflite_copts(), visibility = visibility, deps = [ ":%s_flex_delegate" % name, - "//tensorflow/lite/java/jni", - "//tensorflow/lite/delegates/utils:simple_delegate", + clean_dep("//tensorflow/lite/java/jni"), + clean_dep("//tensorflow/lite/delegates/utils:simple_delegate"), ] + select({ - "//tensorflow:android": [], - "//tensorflow:ios": [], + clean_dep("//tensorflow:android"): [], + clean_dep("//tensorflow:ios"): [], "//conditions:default": [ - "//tensorflow/core:lib", + clean_dep("//tensorflow/core:lib"), ], }), alwayslink = 1, @@ -264,14 +266,14 @@ def tflite_flex_android_library( android_library( name = name, - srcs = ["//tensorflow/lite/delegates/flex/java/src/main/java/org/tensorflow/lite/flex:flex_delegate"], - manifest = "//tensorflow/lite/java:AndroidManifest.xml", - proguard_specs = ["//tensorflow/lite/java:proguard.flags"], + srcs = [clean_dep("//tensorflow/lite/delegates/flex/java/src/main/java/org/tensorflow/lite/flex:flex_delegate")], + manifest = clean_dep("//tensorflow/lite/java:AndroidManifest.xml"), + proguard_specs = [clean_dep("//tensorflow/lite/java:proguard.flags")], custom_package = custom_package, deps = [ ":%s_native" % name, - "//tensorflow/lite/java:tensorflowlite_java", - "@org_checkerframework_qual", + clean_dep("//tensorflow/lite/java:tensorflowlite_java"), + clean_dep("@org_checkerframework_qual"), ], visibility = visibility, ) diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc index 4664ab34700..f7d07af6595 100644 --- a/tensorflow/lite/delegates/flex/delegate.cc +++ b/tensorflow/lite/delegates/flex/delegate.cc @@ -19,6 +19,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/context_util.h" #include "tensorflow/lite/core/macros.h" #include "tensorflow/lite/delegates/flex/buffer_map.h" @@ -142,14 +143,16 @@ TfLiteStatus FlexDelegate::CopyFromBufferHandle( } // namespace tflite +// LINT.IfChange // Exported C interface function which is used by AcquireFlexDelegate() at -// interpreter_build.cc. 
To export the function name globally, the function name -// must be matched with patterns in tf_version_script.lds +// interpreter_builder.cc. To export the function name globally, the function +// name must be matched with patterns in tf_version_script.lds. In Android, we +// don't use this feature so skip building. +#if !defined(__ANDROID__) extern "C" { -#if defined(_WIN32) -__declspec(dllexport) -#endif - tflite::TfLiteDelegateUniquePtr TF_AcquireFlexDelegate() { +TFL_CAPI_EXPORT tflite::TfLiteDelegateUniquePtr TF_AcquireFlexDelegate() { return tflite::FlexDelegate::Create(); } } // extern "C" +#endif // !defined(__ANDROID__) +// LINT.ThenChange(//tensorflow/lite/interpreter_builder.cc) diff --git a/tensorflow/lite/delegates/flex/delegate_data.cc b/tensorflow/lite/delegates/flex/delegate_data.cc index 2be928073ff..8e3ed964e01 100644 --- a/tensorflow/lite/delegates/flex/delegate_data.cc +++ b/tensorflow/lite/delegates/flex/delegate_data.cc @@ -46,7 +46,6 @@ tensorflow::Status DelegateData::Prepare( eager_context_ = new tensorflow::EagerContext( session_options, tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, - tensorflow::ContextMirroringPolicy::MIRRORING_NONE, /*async=*/false, /*lazy_copy_function_remote_inputs=*/false, device_mgr.release(), /*device_mgr_owned*/ true, rendezvous, nullptr); return tensorflow::Status(); diff --git a/tensorflow/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc index d574d8fabbb..6450848bf0e 100644 --- a/tensorflow/lite/delegates/flex/delegate_test.cc +++ b/tensorflow/lite/delegates/flex/delegate_test.cc @@ -14,9 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/delegates/flex/delegate.h" +#include +#include + #include #include #include "tensorflow/lite/delegates/flex/test_util.h" +#include "tensorflow/lite/shared_library.h" namespace tflite { namespace flex { @@ -301,6 +305,100 @@ TEST_F(DelegateTest, MultiThreaded) { ASSERT_EQ(GetType(8), kTfLiteFloat32); } +#if !defined(__ANDROID__) +TEST_F(DelegateTest, TF_AcquireFlexDelegate) { + auto TF_AcquireFlexDelegate = + reinterpret_cast( + SharedLibrary::GetSymbol("TF_AcquireFlexDelegate")); + ASSERT_TRUE(TF_AcquireFlexDelegate); + auto delegate_ptr = TF_AcquireFlexDelegate(); + ASSERT_TRUE(delegate_ptr != nullptr); +} +#endif // !defined(__ANDROID__) + +TEST_F(DelegateTest, StaticOutput) { + // Define the graph with input, output shapes of [2]. + AddTensors(7, {0, 1, 2, 3}, {6}, kTfLiteFloat32, {2}); + + AddTfOp(testing::kAdd, {0, 2}, {4}); + AddTfOp(testing::kAdd, {1, 3}, {5}); + AddTfOp(testing::kMul, {4, 5}, {6}); + + // Apply the delegate. + ConfigureDelegate(); + + // Define inputs which matech with the original shapes. + SetShape(0, {2}); + SetShape(1, {2}); + SetShape(2, {2}); + SetShape(3, {2}); + SetValues(0, {1.1f, 2.2f}); + SetValues(1, {3.3f, 4.4f}); + SetValues(2, {1.1f, 2.2f}); + SetValues(3, {3.3f, 4.4f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(6), ElementsAre(2)); + ASSERT_THAT(GetValues(6), ElementsAre(14.52f, 38.72f)); + ASSERT_EQ(GetType(6), kTfLiteFloat32); + // Since shapes are consistent, static output tensor is used. + ASSERT_FALSE(IsDynamicTensor(6)); +} + +TEST_F(DelegateTest, StaticOutputRFFT) { + // Define the graph with input, output shapes of [3, 257]. 
+ AddTensors(4, {0, 1}, {3}, kTfLiteFloat32, {3, 257}); + int32_t rfft_length[] = {512}; + SetConstTensor(1, {1}, kTfLiteInt32, + reinterpret_cast(&rfft_length), + sizeof(rfft_length)); + + AddTfOp(testing::kRfft, {0, 1}, {2}); + AddTfOp(testing::kImag, {2}, {3}); + + // Apply the delegate. + ConfigureDelegate(); + + // Define inputs. + SetShape(0, {3, 512}); + SetValues(0, std::vector(3 * 512, 1.0f)); + + ASSERT_TRUE(Invoke()); + + ASSERT_EQ(GetType(3), kTfLiteFloat32); + // Since shapes are consistent, static output tensor is used. + ASSERT_FALSE(IsDynamicTensor(3)); +} + +TEST_F(DelegateTest, DynamicOutputAfterReshape) { + // Define the graph. + AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3}); + + AddTfOp(testing::kUnpack, {0}, {1, 2}); + AddTfOp(testing::kUnpack, {3}, {4, 5}); + AddTfOp(testing::kAdd, {1, 4}, {6}); + AddTfOp(testing::kAdd, {2, 5}, {7}); + AddTfOp(testing::kMul, {6, 7}, {8}); + + // Apply the delegate. + ConfigureDelegate(); + + // Define inputs with reshape. + SetShape(0, {2, 2, 1}); + SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); + SetShape(3, {2, 2, 1}); + SetValues(3, {1.1f, 2.2f, 3.3f, 4.4f}); + + ASSERT_TRUE(Invoke()); + + ASSERT_THAT(GetShape(8), ElementsAre(2, 1)); + ASSERT_THAT(GetValues(8), ElementsAre(14.52f, 38.72f)); + ASSERT_EQ(GetType(8), kTfLiteFloat32); + // Since shapes are inconsistent, dynamic output tensor is used. + ASSERT_TRUE(IsDynamicTensor(8)); +} + } // namespace } // namespace flex } // namespace tflite diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc index b3e978908bd..f21c984fe3e 100644 --- a/tensorflow/lite/delegates/flex/kernel.cc +++ b/tensorflow/lite/delegates/flex/kernel.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/lite/delegates/flex/delegate_data.h" #include "tensorflow/lite/delegates/flex/util.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/minimal_logging.h" #include "tensorflow/lite/string_type.h" // Note: this is part of TF Lite's Flex delegation code which is to be @@ -48,6 +49,16 @@ limitations under the License. // retrieve the associated NodeDef, which is then used to configure the // corresponding TensorFlow/Eager Op. +using tensorflow::shape_inference::DimensionHandle; +using tensorflow::shape_inference::InferenceContext; +using tensorflow::shape_inference::ShapeAndType; +using tensorflow::shape_inference::ShapeHandle; + +const std::string GetDimsDebugString(const TfLiteIntArray* dims) { + return absl::StrCat("[", absl::StrJoin(tflite::TfLiteIntArrayView(dims), ","), + "]"); +} + namespace tflite { namespace flex { @@ -188,6 +199,9 @@ class OpNode { void set_index(int index) { index_ = index; } const tensorflow::NodeDef& nodedef() const { return nodedef_; } + const tensorflow::OpRegistrationData* op_reg_data() const { + return op_reg_data_; + } const OpInputs& inputs() const { return inputs_; } OpInputs* mutable_inputs() { return &inputs_; } @@ -222,10 +236,9 @@ class OpNode { } // Fill NodeDef with defaults if it's a valid op. - const tensorflow::OpRegistrationData* op_reg_data; TF_RETURN_IF_ERROR( - tensorflow::OpRegistry::Global()->LookUp(nodedef_.op(), &op_reg_data)); - AddDefaultsToNodeDef(op_reg_data->op_def, &nodedef_); + tensorflow::OpRegistry::Global()->LookUp(nodedef_.op(), &op_reg_data_)); + AddDefaultsToNodeDef(op_reg_data_->op_def, &nodedef_); return tensorflow::Status::OK(); } @@ -312,6 +325,8 @@ class OpNode { int index_; // The corresponding NodeDef, containing the attributes for the op. 
tensorflow::NodeDef nodedef_; + // The corresponding OpRegistrationData pointer. + const tensorflow::OpRegistrationData* op_reg_data_; // List of inputs, as TF Lite tensor indices. OpInputs inputs_; // List of outputs, as TF Lite tensor indices. @@ -455,10 +470,22 @@ TfLiteStatus DelegateKernel::Prepare(TfLiteContext* context, TfLiteNode* node) { tensor_ref_count[tensor_index] += 2; } + const bool shapes_are_valid = + (ValidateOutputTensorShapeConsistency(context) == kTfLiteOk); + if (shapes_are_valid) { + TFLITE_LOG(tflite::TFLITE_LOG_INFO, + "FlexDelegate: All tensor shapes are consistent."); + } else { + TFLITE_LOG(tflite::TFLITE_LOG_WARNING, + "FlexDelegate: Some tensor shapes are inconsistent."); + } + // All output tensors are allocated by TensorFlow/Eager, so we // mark them as kTfLiteDynamic. for (auto tensor_index : op_data_->subgraph_outputs) { - SetTensorToDynamic(&context->tensors[tensor_index]); + if (!shapes_are_valid) { + SetTensorToDynamic(&context->tensors[tensor_index]); + } ++tensor_ref_count[tensor_index]; } @@ -488,6 +515,85 @@ TfLiteStatus DelegateKernel::Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus DelegateKernel::ValidateOutputTensorShapeConsistency( + TfLiteContext* context) const { + for (const auto& node_data : op_data_->nodes) { + auto op_name = node_data->name().c_str(); + // Create an InferenceContext object. + auto num_inputs = node_data->inputs().Size(); + std::vector input_tensors_vector(num_inputs, + nullptr); + InferenceContext c( + TF_GRAPH_DEF_VERSION, node_data->nodedef(), + node_data->op_reg_data()->op_def, std::vector(num_inputs), + input_tensors_vector, {}, + std::vector>>()); + + // Set input_shapes for ShapeInferenceFn. + for (int i = 0; i < num_inputs; ++i) { + const auto input_tensor_index = node_data->inputs().TfLiteIndex(i); + TfLiteTensor* tfl_tensor = &context->tensors[input_tensor_index]; + // Provide constant input tensors since some op ("RFFT") needs it to + // calculate the output shape. + if (IsConstantTensor(tfl_tensor)) { + input_tensors_vector[i] = + op_data_->buffer_map->GetTensorPtr(input_tensor_index); + } + const auto dims_array = tfl_tensor->dims; + std::vector dims(dims_array->size); + for (int j = 0; j < dims_array->size; ++j) { + dims[j] = c.MakeDim(dims_array->data[j]); + } + c.SetInput(i, c.MakeShape(dims)); + } + c.set_input_tensors(input_tensors_vector); + + tensorflow::Status status = c.construction_status(); + if (!status.ok()) { + TFLITE_LOG(tflite::TFLITE_LOG_WARNING, + "Shape construction failed for op '%s'", op_name); + return kTfLiteError; + } + + // Run ShapeInferenceFn to calculate output shapes. + if (node_data->op_reg_data()->shape_inference_fn == nullptr) { + TFLITE_LOG(tflite::TFLITE_LOG_WARNING, + "No shape inference function exists for op '%s'", op_name); + return kTfLiteError; + } + status = c.Run(node_data->op_reg_data()->shape_inference_fn); + + // Compare calculated output shapes with node_data->outputs + auto num_outputs = node_data->outputs().Size(); + if (num_outputs != c.num_outputs()) { + TFLITE_LOG(tflite::TFLITE_LOG_WARNING, + "Number of output tensors are mismatched for op '%s' %d != %d", + op_name, num_outputs, c.num_outputs()); + return kTfLiteError; + } + for (int i = 0; i < num_outputs; ++i) { + const auto output_tensor_index = node_data->outputs().TfLiteIndex(i); + TfLiteTensor* tfl_tensor = &context->tensors[output_tensor_index]; + // tfl_tensor->dims only has valid information if the given model is + // converted by the MLIR converter. 
Also when ResizeInputTensor() is + // called the dims information becomes invalid. + const std::string tfl_shape_string = GetDimsDebugString(tfl_tensor->dims); + const std::string calculated_shape_string = c.DebugString(c.output(i)); + // Getting a shape string via c.DebugString() is the easiest way to get + // the shape information of the given ShapeHandle for now. + // TODO(b/169017408): Find a better approach without using debug string. + if (tfl_shape_string != calculated_shape_string) { + TFLITE_LOG(tflite::TFLITE_LOG_WARNING, + "op '%s' output%d tensor#%d shape mismatch for %s != %s", + op_name, i, output_tensor_index, tfl_shape_string.c_str(), + calculated_shape_string.c_str()); + return kTfLiteError; + } + } + } + return kTfLiteOk; +} + TfLiteStatus DelegateKernel::Eval(TfLiteContext* context, TfLiteNode* node) { BufferMap* buffer_map = op_data_->buffer_map; @@ -522,12 +628,30 @@ TfLiteStatus DelegateKernel::Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + // Copy TF tensor data to TFL allocated buffer for non dynamic tensors. + // For dynamic tensors, copy shape and put buffer_handle for the later + // CopyFromBufferHandle() call. TfLiteTensor* tensor = &context->tensors[tensor_index]; - TF_LITE_ENSURE_OK( - context, - CopyShapeAndType(context, buffer_map->GetTensor(tensor_index), tensor)); - tensor->buffer_handle = tensor_index; - tensor->data_is_stale = true; + const tensorflow::Tensor& tf_tensor = buffer_map->GetTensor(tensor_index); + if (tensor->allocation_type == kTfLiteDynamic) { + TF_LITE_ENSURE_OK(context, CopyShapeAndType(context, tf_tensor, tensor)); + tensor->buffer_handle = tensor_index; + tensor->data_is_stale = true; + continue; + } + // If the tensor isn't dynamic, we can copy data directly to the buffer of + // the tensor. Before copying the data, check if the target buffer has + // expected size. + if (tf_tensor.NumElements() != NumElements(tensor) || + tf_tensor.TotalBytes() != tensor->bytes) { + TF_LITE_KERNEL_LOG( + context, "Tensor: %s(%d) buffer size mismatch %zu(%lld) != %ld(%ld)", + tensor->name, tensor_index, tf_tensor.TotalBytes(), + tf_tensor.NumElements(), tensor->bytes, NumElements(tensor)); + return kTfLiteError; + } + tensorflow::StringPiece t_data = tf_tensor.tensor_data(); + memcpy(tensor->data.raw, t_data.data(), t_data.size()); } return kTfLiteOk; diff --git a/tensorflow/lite/delegates/flex/kernel.h b/tensorflow/lite/delegates/flex/kernel.h index 9a7b93e31f2..b2ab485bdaa 100644 --- a/tensorflow/lite/delegates/flex/kernel.h +++ b/tensorflow/lite/delegates/flex/kernel.h @@ -35,6 +35,11 @@ class DelegateKernel : public SimpleDelegateKernelInterface { TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) override; private: + // Validate that the computed output tensor shape for the Flex node matches + // the existing output shape assigned to the output tensor. + TfLiteStatus ValidateOutputTensorShapeConsistency( + TfLiteContext* context) const; + std::unique_ptr op_data_; }; diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc index f7234075c95..adc65c3ced9 100644 --- a/tensorflow/lite/delegates/flex/kernel_test.cc +++ b/tensorflow/lite/delegates/flex/kernel_test.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include "tensorflow/lite/delegates/flex/delegate_data.h" #include "tensorflow/lite/delegates/flex/test_util.h" +extern const std::string GetDimsDebugString(const TfLiteIntArray* dims); + namespace tflite { namespace flex { namespace testing { @@ -351,6 +353,62 @@ TEST_F(MultipleSubgraphsTest, DoNotForwardInputTensors) { }))); } +tensorflow::OpDef MakeOpDef(int num_inputs, int num_outputs) { + tensorflow::OpRegistrationData op_reg_data; + tensorflow::OpDefBuilder b("dummy"); + for (int i = 0; i < num_inputs; ++i) { + b.Input(tensorflow::strings::StrCat("i", i, ": float")); + } + for (int i = 0; i < num_outputs; ++i) { + b.Output(tensorflow::strings::StrCat("o", i, ": float")); + } + CHECK(b.Attr("foo:string").Finalize(&op_reg_data).ok()); + return op_reg_data.op_def; +} + +tensorflow::PartialTensorShape S( + std::initializer_list dims) { + return tensorflow::PartialTensorShape(dims); +} + +TEST(ValidateOutputTensorShapeConsistencyTest, ShapeHandleDebugString) { + // Setup test to contain an input tensor list of size 3. + tensorflow::OpDef op_def = MakeOpDef(4, 1); + tensorflow::NodeDef def; + tensorflow::shape_inference::InferenceContext c( + 0, def, op_def, {S({1}), S({2, 3}), S({4, 5, 6}), {}}, {}, {}, {}); + c.SetInput(3, c.UnknownShape()); + + std::vector shapes; + EXPECT_EQ("[1]", c.DebugString(c.input(0))); + EXPECT_EQ("[2,3]", c.DebugString(c.input(1))); + EXPECT_EQ("[4,5,6]", c.DebugString(c.input(2))); + // c.DebugString() returns "?" for the unknown shape which is different with + // "-1" of TFLite. But this is intended behavior since we should use dynamic + // tensor for unknown shape so the shape comparison must fail. + EXPECT_EQ("?", c.DebugString(c.input(3))); +} + +TEST(ValidateOutputTensorShapeConsistencyTest, GetDimsDebugString) { + TfLiteIntArray* dims1 = TfLiteIntArrayCreate(1); + dims1->data[0] = 1; + EXPECT_EQ("[1]", GetDimsDebugString(dims1)); + free(dims1); + + TfLiteIntArray* dims2 = TfLiteIntArrayCreate(2); + dims2->data[0] = 2; + dims2->data[1] = 3; + EXPECT_EQ("[2,3]", GetDimsDebugString(dims2)); + free(dims2); + + TfLiteIntArray* dims3 = TfLiteIntArrayCreate(3); + dims3->data[0] = 4; + dims3->data[1] = 5; + dims3->data[2] = 6; + EXPECT_EQ("[4,5,6]", GetDimsDebugString(dims3)); + free(dims3); +} + } // namespace testing } // namespace flex } // namespace tflite diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc index 8c0e40b58dd..02685aa0502 100644 --- a/tensorflow/lite/delegates/flex/test_util.cc +++ b/tensorflow/lite/delegates/flex/test_util.cc @@ -67,6 +67,10 @@ TfLiteType FlexModelTest::GetType(int tensor_index) { return interpreter_->tensor(tensor_index)->type; } +bool FlexModelTest::IsDynamicTensor(int tensor_index) { + return interpreter_->tensor(tensor_index)->allocation_type == kTfLiteDynamic; +} + void FlexModelTest::AddTensors(int num_tensors, const std::vector& inputs, const std::vector& outputs, TfLiteType type, const std::vector& dims) { @@ -88,6 +92,18 @@ void FlexModelTest::AddTensors(int num_tensors, const std::vector& inputs, CHECK_EQ(interpreter_->SetOutputs(outputs), kTfLiteOk); } +void FlexModelTest::SetConstTensor(int tensor_index, + const std::vector& values, + TfLiteType type, const char* buffer, + size_t bytes) { + TfLiteQuantizationParams quant; + CHECK_EQ(interpreter_->SetTensorParametersReadOnly(tensor_index, type, + /*name=*/"", + /*dims=*/values, quant, + buffer, bytes), + kTfLiteOk); +} + void FlexModelTest::AddTfLiteMulOp(const std::vector& inputs, const std::vector& outputs) { 
++next_op_index_; @@ -154,6 +170,10 @@ void FlexModelTest::AddTfOp(TfOpType op, const std::vector& inputs, } else if (op == kMul) { string attributes = type_attribute; AddTfOp("FlexMul", "Mul", attributes, inputs, outputs); + } else if (op == kRfft) { + AddTfOp("FlexRFFT", "RFFT", "", inputs, outputs); + } else if (op == kImag) { + AddTfOp("FlexImag", "Imag", "", inputs, outputs); } else if (op == kNonExistent) { AddTfOp("NonExistentOp", "NonExistentOp", "", inputs, outputs); } else if (op == kIncompatibleNodeDef) { diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h index 1913a406e83..c00adbfe9b3 100644 --- a/tensorflow/lite/delegates/flex/test_util.h +++ b/tensorflow/lite/delegates/flex/test_util.h @@ -28,6 +28,8 @@ enum TfOpType { kIdentity, kAdd, kMul, + kRfft, + kImag, // Represents an op that does not exist in TensorFlow. kNonExistent, // Represents an valid TensorFlow op where the NodeDef is incompatible. @@ -80,6 +82,9 @@ class FlexModelTest : public ::testing::Test { // Returns the tensor's type at the given index. TfLiteType GetType(int tensor_index); + // Returns if the tensor at the given index is dynamic. + bool IsDynamicTensor(int tensor_index); + const TestErrorReporter& error_reporter() const { return error_reporter_; } // Adds `num_tensor` tensors to the model. `inputs` contains the indices of @@ -89,6 +94,11 @@ class FlexModelTest : public ::testing::Test { const std::vector& outputs, TfLiteType type, const std::vector& dims); + // Set a constant tensor of the given shape, type and buffer at the given + // index. + void SetConstTensor(int tensor_index, const std::vector& values, + TfLiteType type, const char* buffer, size_t bytes); + // Adds a TFLite Mul op. `inputs` contains the indices of the input tensors // and `outputs` contains the indices of the output tensors. 
void AddTfLiteMulOp(const std::vector& inputs, diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index d69bed4c03a..8778653b586 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -54,7 +54,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", + "//tensorflow/lite/delegates/gpu/common/transformations:model_transformations", "//tensorflow/lite/delegates/gpu/gl:api", "//tensorflow/lite/delegates/gpu/gl:command_queue", "//tensorflow/lite/delegates/gpu/gl:compiler", @@ -96,7 +96,6 @@ objc_library( "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", - "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", "//tensorflow/lite/delegates/gpu/metal:api", "//tensorflow/lite/delegates/gpu/metal:buffer_convert", "//tensorflow/lite/delegates/gpu/metal:compiled_model", diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 9ae3836d6c4..63171348b74 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -55,6 +55,7 @@ cc_library( ":cl_device", ":gpu_object", ":opencl_wrapper", + ":serialization_cc_fbs", ":tensor_type", ":util", "//tensorflow/lite/delegates/gpu/common:access_type", @@ -76,7 +77,10 @@ cc_test( ], deps = [ ":arguments", + ":buffer", + ":device_info", ":gpu_object", + ":tensor", ":tensor_type", "//tensorflow/lite/delegates/gpu/common:data_type", "@com_google_absl//absl/strings", @@ -283,7 +287,7 @@ cc_library( ":cl_command_queue", ":cl_context", ":cl_device", - ":cl_kernel", + ":device_info", ":precision", ":program_cache", ":tensor", @@ -343,7 +347,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model_builder", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", + "//tensorflow/lite/delegates/gpu/common/transformations:model_transformations", "@com_google_absl//absl/types:span", ], ) @@ -355,6 +359,7 @@ cc_library( deps = [ ":cl_context", ":opencl_wrapper", + ":serialization_cc_fbs", "//tensorflow/lite/delegates/gpu/common:access_type", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:status", @@ -363,18 +368,30 @@ cc_library( cc_library( name = "inference_context", - srcs = ["inference_context.cc"], - hdrs = ["inference_context.h"], + srcs = [ + "inference_context.cc", + "serialization.cc", + ], + hdrs = [ + "inference_context.h", + "serialization.h", + ], deps = [ + ":arguments", ":buffer", ":cl_command_queue", + ":cl_context", ":cl_device", ":environment", + ":gpu_object", + ":linear_storage", ":model_hints", ":opencl_wrapper", ":precision", + ":serialization_cc_fbs", ":storage_type_util", ":tensor_type", + ":texture2d", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/cl/selectors:operation_selector", "//tensorflow/lite/delegates/gpu/cl/selectors:special_selector", @@ -392,6 +409,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common/transformations:merge_padding_with", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + 
"@com_google_absl//absl/types:span", ], ) @@ -463,6 +481,14 @@ cc_library( ], ) +flatbuffer_cc_library( + name = "serialization_cc_fbs", + srcs = ["serialization.fbs"], + flatc_args = [ + "--scoped-enums", + ], +) + cc_library( name = "storage_type_util", srcs = ["storage_type_util.cc"], diff --git a/tensorflow/lite/delegates/gpu/cl/api.cc b/tensorflow/lite/delegates/gpu/cl/api.cc index 01d32aa9206..e2135d05b53 100644 --- a/tensorflow/lite/delegates/gpu/cl/api.cc +++ b/tensorflow/lite/delegates/gpu/cl/api.cc @@ -570,6 +570,56 @@ TensorObjectDef TensorToDef(const Tensor& tensor) { return def; } +CalculationsPrecision GetPrecision(const Environment& env, + const InferenceOptions& options) { + CalculationsPrecision precision; + switch (GetPosition(options, InferencePriority::MAX_PRECISION)) { + case 1: + precision = CalculationsPrecision::F32; + break; + case 2: + precision = CalculationsPrecision::F32_F16; + break; + case 3: + precision = CalculationsPrecision::F16; + break; + default: + precision = CalculationsPrecision::F16; + break; + } + // Increase precision if lower precision is not supported. + if (!env.IsSupported(precision)) { + precision = CalculationsPrecision::F32_F16; + if (!env.IsSupported(precision)) { + precision = CalculationsPrecision::F32; + } + } + return precision; +} + +TensorStorageType GetStorageTypeFromOptions(const Environment& env, + const InferenceOptions& options) { + // Fallback to BUFFER that should be supported by default. + std::vector preferred_storage_types; + if (GetRelativeImportance(options, InferencePriority::MIN_LATENCY, + InferencePriority::MIN_MEMORY_USAGE) == + PriorityImportance::HIGHER) { + preferred_storage_types = {GetFastestStorageType(env.device().GetInfo()), + TensorStorageType::BUFFER}; + } else { + preferred_storage_types = { + GetStorageTypeWithMinimalMemoryConsumption(env.device().GetInfo()), + TensorStorageType::BUFFER}; + } + + for (TensorStorageType storage_type : preferred_storage_types) { + if (env.IsSupported(storage_type)) { + return storage_type; + } + } + return TensorStorageType::UNKNOWN; +} + class InferenceBuilderImpl : public InferenceBuilder { public: explicit InferenceBuilderImpl(Environment* environment) @@ -580,11 +630,14 @@ class InferenceBuilderImpl : public InferenceBuilder { const GraphFloat32& graph) { context_ = absl::make_unique(); InferenceContext::CreateInferenceInfo create_info; - create_info.precision = GetPrecision(options); - create_info.storage_type = GetStorageType(options); + create_info.precision = GetPrecision(*environment_, options); + create_info.storage_type = + GetStorageTypeFromOptions(*environment_, options); if (options.usage == InferenceUsage::FAST_SINGLE_ANSWER) { create_info.hints.Add(ModelHints::kReduceKernelsCount); create_info.hints.Add(ModelHints::kFastTuning); + } else if (options.usage == InferenceUsage::SUSTAINED_SPEED) { + create_info.hints.Add(ModelHints::kAllowSpecialKernels); } RETURN_IF_ERROR(context_->InitFromGraph(create_info, graph, environment_)); @@ -601,8 +654,32 @@ class InferenceBuilderImpl : public InferenceBuilder { absl::make_unique(environment_, context_.get()); #endif - inputs_ = LinkTensors(graph, graph.inputs()); - outputs_ = LinkTensors(graph, graph.outputs()); + inputs_ = LinkTensors(context_->GetInputIds(), AccessType::READ); + outputs_ = LinkTensors(context_->GetOutputIds(), AccessType::WRITE); + return absl::OkStatus(); + } + + absl::Status Initialize(const InferenceEnvironmentOptions& env_options, + const std::vector& serialized_model) { + context_ = 
absl::make_unique(); + RETURN_IF_ERROR( + context_->RestoreDeserialized(serialized_model, environment_)); + +#ifdef CL_DELEGATE_ALLOW_GL + if (env_options.IsGlAware() && + IsGlSharingSupported(environment_->device())) { + gl_interop_fabric_ = absl::make_unique( + env_options.egl_display, environment_); + } + tie_factory_ = absl::make_unique( + environment_, context_.get(), gl_interop_fabric_.get()); +#else + tie_factory_ = + absl::make_unique(environment_, context_.get()); +#endif + + inputs_ = LinkTensors(context_->GetInputIds(), AccessType::READ); + outputs_ = LinkTensors(context_->GetOutputIds(), AccessType::WRITE); return absl::OkStatus(); } @@ -669,64 +746,14 @@ class InferenceBuilderImpl : public InferenceBuilder { } private: - TensorStorageType GetStorageType(const InferenceOptions& options) const { - // Fallback to BUFFER that should be supported by default. - std::vector preferred_storage_types; - if (GetRelativeImportance(options, InferencePriority::MIN_LATENCY, - InferencePriority::MIN_MEMORY_USAGE) == - PriorityImportance::HIGHER) { - preferred_storage_types = {GetFastestStorageType(environment_->device()), - TensorStorageType::BUFFER}; - } else { - preferred_storage_types = { - GetStorageTypeWithMinimalMemoryConsumption(environment_->device()), - TensorStorageType::BUFFER}; - } - - for (TensorStorageType storage_type : preferred_storage_types) { - if (environment_->IsSupported(storage_type)) { - return storage_type; - } - } - return TensorStorageType::UNKNOWN; - } - - CalculationsPrecision GetPrecision(const InferenceOptions& options) const { - CalculationsPrecision precision; - switch (GetPosition(options, InferencePriority::MAX_PRECISION)) { - case 1: - precision = CalculationsPrecision::F32; - break; - case 2: - precision = CalculationsPrecision::F32_F16; - break; - case 3: - precision = CalculationsPrecision::F16; - break; - default: - precision = CalculationsPrecision::F16; - break; - } - // Increase precision if lower precision is not supported. - if (!environment_->IsSupported(precision)) { - precision = CalculationsPrecision::F32_F16; - if (!environment_->IsSupported(precision)) { - precision = CalculationsPrecision::F32; - } - } - return precision; - } - // Links internal tensors with external user-facing objects. - std::vector LinkTensors(const GraphFloat32& graph, - const std::vector& values) { + std::vector LinkTensors(const std::vector& ids, + AccessType access) { std::vector links; - links.reserve(values.size()); - for (const auto& value : values) { - TensorObjectDef def = TensorToDef(*context_->GetTensor(value->id)); - AccessType access = - graph.IsGraphInput(value->id) ? AccessType::READ : AccessType::WRITE; - links.push_back({value->id, access, def, def}); + links.reserve(ids.size()); + for (const auto& id : ids) { + TensorObjectDef def = TensorToDef(*context_->GetTensor(id)); + links.push_back({id, access, def, def}); } return links; } @@ -839,6 +866,39 @@ class InferenceEnvironmentImpl : public InferenceEnvironment { return environment_.Init(); } + absl::Status BuildSerializedModel( + const InferenceOptions& options, GraphFloat32 model, + std::vector* serialized_model) final { + if (!IsValid(options)) { + return absl::InvalidArgumentError("InferenceOptions are invalid."); + } + InferenceOptions resolved_options = options; + ResolveAutoPriority(&resolved_options); + if (environment_.program_cache() && + !options_.serialized_binary_cache.empty()) { + // Ignore returned error. Cache is discarded. 
+ environment_.program_cache() + ->AddSerializedCache(environment_.context(), environment_.device(), + options_.serialized_binary_cache) + .IgnoreError(); + } + + RETURN_IF_ERROR(RunGraphTransforms(&model)); + InferenceContext context; + InferenceContext::CreateInferenceInfo create_info; + create_info.precision = GetPrecision(environment_, options); + create_info.storage_type = GetStorageTypeFromOptions(environment_, options); + if (options.usage == InferenceUsage::FAST_SINGLE_ANSWER) { + create_info.hints.Add(ModelHints::kReduceKernelsCount); + create_info.hints.Add(ModelHints::kFastTuning); + } else if (options.usage == InferenceUsage::SUSTAINED_SPEED) { + create_info.hints.Add(ModelHints::kAllowSpecialKernels); + } + RETURN_IF_ERROR(context.InitFromGraph(create_info, model, &environment_, + serialized_model)); + return absl::OkStatus(); + } + absl::Status NewInferenceBuilder( const InferenceOptions& options, GraphFloat32 model, std::unique_ptr* builder) final { @@ -864,6 +924,24 @@ class InferenceEnvironmentImpl : public InferenceEnvironment { return absl::OkStatus(); } + absl::Status NewInferenceBuilder( + const std::vector& serialized_model, + std::unique_ptr* builder) final { + if (environment_.program_cache() && + !options_.serialized_binary_cache.empty()) { + // Ignore returned error. Cache is discarded. + environment_.program_cache() + ->AddSerializedCache(environment_.context(), environment_.device(), + options_.serialized_binary_cache) + .IgnoreError(); + } + + auto builder_impl = absl::make_unique(&environment_); + RETURN_IF_ERROR(builder_impl->Initialize(options_, serialized_model)); + *builder = std::move(builder_impl); + return absl::OkStatus(); + } + std::vector GetSerializedBinaryCache() const final { std::vector data; // Is there was a problem, data would be empty. diff --git a/tensorflow/lite/delegates/gpu/cl/api.h b/tensorflow/lite/delegates/gpu/cl/api.h index 826d4f2bc78..65671117522 100644 --- a/tensorflow/lite/delegates/gpu/cl/api.h +++ b/tensorflow/lite/delegates/gpu/cl/api.h @@ -75,6 +75,20 @@ class InferenceEnvironment { public: virtual ~InferenceEnvironment() {} + // Converts GraphFloat32 into intermediate, device-specific representation. + // This serialized_model specific for device and InferenceOptions. + // serialized_model cannot be used with another device or InferenceOptions. + // Loading serialized_model is much faster than loading GraphFloat32. + // serialized_model must be used with appropriate NewInferenceBuilder + // method (see below). 
+ virtual absl::Status BuildSerializedModel( + const InferenceOptions& options, GraphFloat32 model, + std::vector* serialized_model) = 0; + + virtual absl::Status NewInferenceBuilder( + const std::vector& serialized_model, + std::unique_ptr* builder) = 0; + virtual absl::Status NewInferenceBuilder( const InferenceOptions& options, GraphFloat32 model, std::unique_ptr* builder) = 0; diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index 5623de2419c..7c5e635816e 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -256,13 +256,6 @@ void Arguments::AddObjectRef(const std::string& name, AccessType access_type, object_refs_[name] = {std::move(descriptor_ptr)}; } -void Arguments::AddObject(const std::string& name, AccessType access_type, - GPUObjectPtr&& object, - GPUObjectDescriptorPtr&& descriptor_ptr) { - descriptor_ptr->SetAccess(access_type); - objects_[name] = {std::move(object), std::move(descriptor_ptr)}; -} - void Arguments::AddObject(const std::string& name, GPUObjectDescriptorPtr&& descriptor_ptr) { descriptor_ptr->SetAccess(AccessType::READ); @@ -666,56 +659,64 @@ absl::Status Arguments::Bind(cl_kernel kernel, int offset) { std::string Arguments::AddActiveArgument(const std::string& arg_name, bool use_f32_for_halfs) { - if (auto it = int_values_.find(arg_name); it != int_values_.end()) { - int int_index; - if (it->second.active) { - int_index = it->second.offset; - } else { - it->second.active = true; - it->second.offset = shared_int4s_data_.size(); - int_index = it->second.offset; - shared_int4s_data_.push_back(it->second.value); - } - std::string index = std::to_string(int_index / 4); - std::string postfixes[4] = {"x", "y", "z", "w"}; - return "shared_int4_" + index + "." + postfixes[int_index % 4]; - } - if (auto it = float_values_.find(arg_name); it != float_values_.end()) { - int float_index; - if (it->second.active) { - float_index = it->second.offset; - } else { - it->second.active = true; - it->second.offset = shared_float4s_data_.size(); - float_index = it->second.offset; - shared_float4s_data_.push_back(it->second.value); - } - std::string index = std::to_string(float_index / 4); - std::string postfixes[4] = {"x", "y", "z", "w"}; - return "shared_float4_" + index + "." + postfixes[float_index % 4]; - } - if (auto it = half_values_.find(arg_name); it != half_values_.end()) { - int half_index; - if (it->second.active) { - half_index = it->second.offset; - } else { - it->second.active = true; - if (use_f32_for_halfs) { - it->second.store_as_f32 = true; - it->second.offset = shared_float4s_data_.size(); - shared_float4s_data_.push_back(it->second.value); + { + auto it = int_values_.find(arg_name); + if (it != int_values_.end()) { + int int_index; + if (it->second.active) { + int_index = it->second.offset; } else { - it->second.offset = shared_half4s_data_.size(); - shared_half4s_data_.push_back(it->second.value); + it->second.active = true; + it->second.offset = shared_int4s_data_.size(); + int_index = it->second.offset; + shared_int4s_data_.push_back(it->second.value); } - half_index = it->second.offset; + std::string index = std::to_string(int_index / 4); + std::string postfixes[4] = {"x", "y", "z", "w"}; + return "shared_int4_" + index + "." + postfixes[int_index % 4]; } - std::string index = std::to_string(half_index / 4); - std::string postfixes[4] = {"x", "y", "z", "w"}; - if (it->second.store_as_f32) { - return "(half)(shared_float4_" + index + "." 
+ postfixes[half_index % 4] + - ")"; - } else { + } + { + auto it = float_values_.find(arg_name); + if (it != float_values_.end()) { + int float_index; + if (it->second.active) { + float_index = it->second.offset; + } else { + it->second.active = true; + it->second.offset = shared_float4s_data_.size(); + float_index = it->second.offset; + shared_float4s_data_.push_back(it->second.value); + } + std::string index = std::to_string(float_index / 4); + std::string postfixes[4] = {"x", "y", "z", "w"}; + return "shared_float4_" + index + "." + postfixes[float_index % 4]; + } + } + { + auto it = half_values_.find(arg_name); + if (it != half_values_.end()) { + int half_index; + if (it->second.active) { + half_index = it->second.offset; + } else { + it->second.active = true; + if (use_f32_for_halfs) { + it->second.store_as_f32 = true; + it->second.offset = shared_float4s_data_.size(); + shared_float4s_data_.push_back(it->second.value); + } else { + it->second.offset = shared_half4s_data_.size(); + shared_half4s_data_.push_back(it->second.value); + } + half_index = it->second.offset; + } + std::string index = std::to_string(half_index / 4); + std::string postfixes[4] = {"x", "y", "z", "w"}; + if (it->second.store_as_f32) { + return "(half)(shared_float4_" + index + "." + + postfixes[half_index % 4] + ")"; + } return "shared_half4_" + index + "." + postfixes[half_index % 4]; } } @@ -755,24 +756,38 @@ void Arguments::ResolveObjectNames(const std::string& object_name, } } +GPUObjectDescriptor* Arguments::GetObjectDescriptor( + const std::string& object_name) const { + { + auto it = object_refs_.find(object_name); + if (it != object_refs_.end()) { + return it->second.descriptor.get(); + } + } + { + auto it = objects_.find(object_name); + if (it != objects_.end()) { + return it->second.descriptor.get(); + } + } + return nullptr; +} + absl::Status Arguments::ResolveSelector( const std::map& linkables, const std::string& object_name, const std::string& selector, const std::vector& args, const std::vector& template_args, std::string* result) { - const GPUObjectDescriptor* desc_ptr; - if (auto it = object_refs_.find(object_name); it != object_refs_.end()) { - desc_ptr = it->second.descriptor.get(); - } else if (auto it = objects_.find(object_name); it != objects_.end()) { - desc_ptr = it->second.descriptor.get(); - } else { + const GPUObjectDescriptor* desc_ptr = GetObjectDescriptor(object_name); + if (!desc_ptr) { return absl::NotFoundError( absl::StrCat("No object with name - ", object_name)); } auto names = desc_ptr->GetGPUResources().GetNames(); const auto* tensor_desc = dynamic_cast(desc_ptr); if (tensor_desc && selector == "Write") { - if (auto it = linkables.find(object_name); it != linkables.end()) { + auto it = linkables.find(object_name); + if (it != linkables.end()) { if (desc_ptr->GetAccess() != AccessType::WRITE && desc_ptr->GetAccess() != AccessType::READ_WRITE) { return absl::FailedPreconditionError(absl::StrCat( @@ -850,11 +865,16 @@ absl::Status Arguments::AllocateObjects(CLContext* context) { for (auto& t : objects_) { RETURN_IF_ERROR( t.second.descriptor->CreateGPUObject(context, &t.second.obj_ptr)); - t.second.descriptor->Release(); } return absl::OkStatus(); } +void Arguments::ReleaseCPURepresentation() { + for (auto& t : objects_) { + t.second.descriptor->Release(); + } +} + absl::Status Arguments::AddObjectArgs() { for (auto& t : objects_) { AddGPUResources(t.first, t.second.descriptor->GetGPUResources()); diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h 
b/tensorflow/lite/delegates/gpu/cl/arguments.h index 643e1b7655d..a5435c4fc2f 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.h +++ b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" +#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" #include "tensorflow/lite/delegates/gpu/common/access_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -33,49 +34,37 @@ namespace tflite { namespace gpu { namespace cl { -class Arguments { +class ArgumentsBinder { + public: + virtual absl::Status SetInt(const std::string& name, int value) = 0; + virtual absl::Status SetFloat(const std::string& name, float value) = 0; + virtual absl::Status SetHalf(const std::string& name, half value) = 0; + virtual ~ArgumentsBinder() = default; +}; + +class Arguments : public ArgumentsBinder { public: Arguments() = default; void AddFloat(const std::string& name, float value = 0.0f); void AddHalf(const std::string& name, half value = half(0.0f)); void AddInt(const std::string& name, int value = 0); - void AddBuffer(const std::string& name, const GPUBufferDescriptor& desc); - void AddImage2D(const std::string& name, const GPUImage2DDescriptor& desc); - void AddImage2DArray(const std::string& name, - const GPUImage2DArrayDescriptor& desc); - void AddImage3D(const std::string& name, const GPUImage3DDescriptor& desc); - void AddImageBuffer(const std::string& name, - const GPUImageBufferDescriptor& desc); - void AddCustomMemory(const std::string& name, - const GPUCustomMemoryDescriptor& desc); - void AddObjectRef(const std::string& name, AccessType access_type, GPUObjectDescriptorPtr&& descriptor_ptr); - void AddObject(const std::string& name, AccessType access_type, - GPUObjectPtr&& object, - GPUObjectDescriptorPtr&& descriptor_ptr); void AddObject(const std::string& name, GPUObjectDescriptorPtr&& descriptor_ptr); - absl::Status SetInt(const std::string& name, int value); - absl::Status SetFloat(const std::string& name, float value); - absl::Status SetHalf(const std::string& name, half value); - absl::Status SetImage2D(const std::string& name, cl_mem memory); - absl::Status SetBuffer(const std::string& name, cl_mem memory); - absl::Status SetImage2DArray(const std::string& name, cl_mem memory); - absl::Status SetImage3D(const std::string& name, cl_mem memory); - absl::Status SetImageBuffer(const std::string& name, cl_mem memory); - absl::Status SetCustomMemory(const std::string& name, cl_mem memory); + absl::Status SetInt(const std::string& name, int value) override; + absl::Status SetFloat(const std::string& name, float value) override; + absl::Status SetHalf(const std::string& name, half value) override; absl::Status SetObjectRef(const std::string& name, const GPUObject* object); - std::string GetListOfArgs(); - absl::Status Bind(cl_kernel kernel, int offset = 0); void RenameArgs(const std::string& postfix, std::string* code) const; absl::Status Merge(Arguments&& args, const std::string& postfix); absl::Status AllocateObjects(CLContext* context); + void ReleaseCPURepresentation(); absl::Status TransformToCLCode( const DeviceInfo& device_info, const std::map& linkables, std::string* code); @@ -86,7 +75,33 @@ class Arguments { Arguments(const Arguments&) = delete; Arguments& operator=(const Arguments&) = delete; + 
~Arguments() override = default; + private: + friend flatbuffers::Offset Encode( + const Arguments& args, flatbuffers::FlatBufferBuilder* builder); + friend absl::Status Decode(CLContext* context, const data::Arguments* fb_args, + Arguments* args); + + void AddBuffer(const std::string& name, const GPUBufferDescriptor& desc); + void AddImage2D(const std::string& name, const GPUImage2DDescriptor& desc); + void AddImage2DArray(const std::string& name, + const GPUImage2DArrayDescriptor& desc); + void AddImage3D(const std::string& name, const GPUImage3DDescriptor& desc); + void AddImageBuffer(const std::string& name, + const GPUImageBufferDescriptor& desc); + void AddCustomMemory(const std::string& name, + const GPUCustomMemoryDescriptor& desc); + + absl::Status SetImage2D(const std::string& name, cl_mem memory); + absl::Status SetBuffer(const std::string& name, cl_mem memory); + absl::Status SetImage2DArray(const std::string& name, cl_mem memory); + absl::Status SetImage3D(const std::string& name, cl_mem memory); + absl::Status SetImageBuffer(const std::string& name, cl_mem memory); + absl::Status SetCustomMemory(const std::string& name, cl_mem memory); + + std::string GetListOfArgs(); + std::string AddActiveArgument(const std::string& arg_name, bool use_f32_for_halfs); void AddGPUResources(const std::string& name, const GPUResources& resources); @@ -110,6 +125,9 @@ class Arguments { const std::vector& member_names, std::string* code); + GPUObjectDescriptor* GetObjectDescriptor( + const std::string& object_name) const; + static constexpr char kArgsPrefix[] = "args."; struct IntValue { diff --git a/tensorflow/lite/delegates/gpu/cl/arguments_test.cc b/tensorflow/lite/delegates/gpu/cl/arguments_test.cc index 29a15e16a57..722ca5b1827 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments_test.cc @@ -14,85 +14,58 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/delegates/gpu/cl/arguments.h" +#include #include #include #include +#include "absl/strings/match.h" +#include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" #include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" namespace tflite { namespace gpu { namespace cl { -namespace { -struct TestDescriptor : public GPUObjectDescriptor { - absl::Status PerformSelector(const std::string& selector, - const std::vector& args, - const std::vector& template_args, - std::string* result) const override { - if (selector == "Length") { - *result = "length"; - return absl::OkStatus(); - } else if (selector == "Read") { - if (args.size() != 1) { - return absl::NotFoundError( - absl::StrCat("TestDescriptor Read require one argument, but ", - args.size(), " was passed")); - } - *result = absl::StrCat("buffer[", args[0], "]"); - return absl::OkStatus(); - } else { - return absl::NotFoundError(absl::StrCat( - "TestDescriptor don't have selector with name - ", selector)); - } - } - - GPUResources GetGPUResources(AccessType access_type) const override { - GPUResources resources; - resources.ints.push_back("length"); - GPUBufferDescriptor desc; - desc.data_type = DataType::FLOAT32; - desc.element_size = 4; - resources.buffers.push_back({"buffer", desc}); - return resources; - } -}; -} // namespace - TEST(ArgumentsTest, TestSelectorResolve) { - TestDescriptor descriptor; - Arguments args; - args.AddObjectRef("object", AccessType::WRITE, - absl::make_unique(descriptor)); - std::string sample_code = R"( - if (a < 3) { - value = args.object.Read(id); - } -)"; - const std::string expected_result = R"( - if (a < 3) { - value = object_buffer[id]; - } -)"; - ASSERT_OK(args.TransformToCLCode({}, &sample_code)); - EXPECT_EQ(sample_code, expected_result); + BufferDescriptor desc; + desc.element_type = DataType::FLOAT32; + desc.element_size = 4; + desc.memory_type = MemoryType::GLOBAL; - std::string cl_arguments = args.GetListOfArgs(); - EXPECT_TRUE(cl_arguments.find("__global float4* object_buffer") != - std::string::npos); + Arguments args; + args.AddObjectRef("weights", AccessType::READ, + absl::make_unique(std::move(desc))); + std::string sample_code = R"( +__kernel void main_function($0) { + if (a < 3) { + value = args.weights.Read(id); + } +})"; + + DeviceInfo device_info; + ASSERT_OK(args.TransformToCLCode(device_info, {}, &sample_code)); + EXPECT_TRUE(absl::StrContains(sample_code, "value = weights_buffer[id];")); + EXPECT_TRUE( + absl::StrContains(sample_code, "__global float4* weights_buffer")); } TEST(ArgumentsTest, TestNoSelector) { - TestDescriptor descriptor; + BufferDescriptor desc; + desc.element_type = DataType::FLOAT32; + desc.element_size = 4; + desc.memory_type = MemoryType::GLOBAL; + Arguments args; - args.AddObjectRef("object", AccessType::WRITE, - absl::make_unique(descriptor)); + args.AddObjectRef("weights", AccessType::READ, + absl::make_unique(std::move(desc))); std::string sample_code = R"( if (a < 3) { - value = args.object.Write(id); + value = args.weights.UnknownSelector(id); } )"; - EXPECT_FALSE(args.TransformToCLCode({}, &sample_code).ok()); + DeviceInfo device_info; + EXPECT_FALSE(args.TransformToCLCode(device_info, {}, &sample_code).ok()); } TEST(ArgumentsTest, TestRenameArgs) { diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc 
b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc index a1795b18b27..10937cfc56b 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc +++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc @@ -56,14 +56,15 @@ void CLCommandQueue::Release() { } } -absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid, - int3 work_group_size, - CLEvent* event) { +absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel, + const int3& work_groups_count, + const int3& work_group_size, + CLEvent* event) { std::vector local(3); std::vector global(3); for (int i = 0; i < 3; ++i) { local[i] = work_group_size[i]; - global[i] = AlignByN(grid[i], work_group_size[i]); + global[i] = work_groups_count[i] * work_group_size[i]; } cl_event resulting_event; const int error_code = clEnqueueNDRangeKernel( @@ -80,9 +81,10 @@ absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid, return absl::OkStatus(); } -absl::Status CLCommandQueue::DispatchImplicit(const CLKernel& kernel, int3 grid, - int3 work_group_size) { - return DispatchImplicit(kernel, grid, work_group_size, nullptr); +absl::Status CLCommandQueue::Dispatch(const CLKernel& kernel, + const int3& work_groups_count, + const int3& work_group_size) { + return Dispatch(kernel, work_groups_count, work_group_size, nullptr); } absl::Status CLCommandQueue::EnqueueEvent(CLEvent* event) { @@ -191,12 +193,13 @@ void ProfilingCommandQueue::SetEventsLabel(const std::string& name) { void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); } -absl::Status ProfilingCommandQueue::DispatchImplicit(const CLKernel& kernel, - int3 grid, - int3 work_group_size) { +absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel, + const int3& work_groups_count, + const int3& work_group_size) { events_.push_back(CLEvent()); - RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit( - kernel, grid, work_group_size, &events_[events_.size() - 1])); + RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count, + work_group_size, + &events_[events_.size() - 1])); events_.back().SetName(current_label_); return absl::OkStatus(); } @@ -213,14 +216,15 @@ ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const { } absl::Status ProfilingCommandQueue::GetBestWorkGroupIndex( - const CLKernel& kernel, const DeviceInfo& device_info, const int3& grid, + const CLKernel& kernel, const DeviceInfo& device_info, + const std::vector& work_groups_count, const std::vector& work_group_sizes, int* index) { // Some Adreno 3xx can have wrong numbers for some events const bool possible_bug_with_events = device_info.IsAdreno3xx(); events_.resize(work_group_sizes.size()); for (int i = 0; i < work_group_sizes.size(); ++i) { - RETURN_IF_ERROR(CLCommandQueue::DispatchImplicit( - kernel, grid, work_group_sizes[i], &events_[i])); + RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count[i], + work_group_sizes[i], &events_[i])); // reducing the speed of memory leak on Mali for some kernels if (device_info.IsMali() && i % 8 == 7) { @@ -330,24 +334,34 @@ absl::Duration ProfilingInfo::GetTotalTime() const { std::string ProfilingInfo::GetDetailedReport() const { std::string result; - std::map timing; + struct OpStatistic { + int count; + double total_time; + }; + std::map statistics; result += "Per kernel timing(" + std::to_string(dispatches.size()) + " kernels):\n"; for (const auto& dispatch : dispatches) { result += " " + dispatch.label + " - " + std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) + - "ms\n"; + " 
ms\n"; auto name = dispatch.label.substr(0, dispatch.label.find(" ")); - if (timing.find(name) != timing.end()) { - timing[name] += absl::ToDoubleMilliseconds(dispatch.duration); + if (statistics.find(name) != statistics.end()) { + statistics[name].count++; + statistics[name].total_time += + absl::ToDoubleMilliseconds(dispatch.duration); } else { - timing[name] = absl::ToDoubleMilliseconds(dispatch.duration); + statistics[name].count = 1; + statistics[name].total_time = + absl::ToDoubleMilliseconds(dispatch.duration); } } result += "--------------------\n"; result += "Accumulated time per operation type:\n"; - for (auto& t : timing) { - result += " " + t.first + " - " + std::to_string(t.second) + "ms\n"; + for (auto& t : statistics) { + auto stat = t.second; + result += " " + t.first + "(x" + std::to_string(stat.count) + ") - " + + std::to_string(stat.total_time) + " ms\n"; } result += "--------------------\n"; result += "Ideal total time: " + diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h index 178e3b21a1e..519b87640e7 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h @@ -74,14 +74,15 @@ class CLCommandQueue { cl_command_queue queue() const { return queue_; } - virtual absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid, - int3 work_group_size); + virtual absl::Status Dispatch(const CLKernel& kernel, + const int3& work_groups_count, + const int3& work_group_size); + + absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count, + const int3& work_group_size, CLEvent* event); absl::Status EnqueueEvent(CLEvent* event); - absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid, - int3 work_group_size, CLEvent* event); - absl::Status EnqueueWriteImage(cl_mem memory, int3 region, const void* data); absl::Status EnqueueReadImage(cl_mem memory, int3 region, void* data); @@ -110,13 +111,13 @@ class ProfilingCommandQueue : public CLCommandQueue { ProfilingCommandQueue(const ProfilingCommandQueue&) = delete; ProfilingCommandQueue& operator=(const ProfilingCommandQueue&) = delete; - absl::Status DispatchImplicit(const CLKernel& kernel, int3 grid, - int3 work_group_size) override; + absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count, + const int3& work_group_size) override; // will write index for fastest work_group among work_group_sizes absl::Status GetBestWorkGroupIndex(const CLKernel& kernel, const DeviceInfo& device_info, - const int3& grid, + const std::vector& work_groups_count, const std::vector& work_group_sizes, int* index); diff --git a/tensorflow/lite/delegates/gpu/cl/cl_device.h b/tensorflow/lite/delegates/gpu/cl/cl_device.h index e7cd274661d..79335a61aff 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_device.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_device.h @@ -73,6 +73,7 @@ class CLDevice { bool SupportsOneLayerTextureArray() const; void DisableOneLayerTextureArray(); + const DeviceInfo& GetInfo() const { return info_; } // We update device info during context creation, so as supported texture // formats can be requested from context only. 
mutable DeviceInfo info_; diff --git a/tensorflow/lite/delegates/gpu/cl/compiled_program_cache_generated.h b/tensorflow/lite/delegates/gpu/cl/compiled_program_cache_generated.h new file mode 100644 index 00000000000..8a12bf2a9db --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/compiled_program_cache_generated.h @@ -0,0 +1,207 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// automatically generated by the FlatBuffers compiler, do not modify + + +#ifndef FLATBUFFERS_GENERATED_COMPILEDPROGRAMCACHE_TFLITE_GPU_CL_DATA_H_ +#define FLATBUFFERS_GENERATED_COMPILEDPROGRAMCACHE_TFLITE_GPU_CL_DATA_H_ + +#include "flatbuffers/flatbuffers.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace data { + +struct Program; + +struct CompiledCache; + +struct Program FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FINGERPRINT = 4, + VT_BINARY = 6 + }; + uint64_t fingerprint() const { + return GetField(VT_FINGERPRINT, 0); + } + const flatbuffers::Vector *binary() const { + return GetPointer *>(VT_BINARY); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FINGERPRINT) && + VerifyOffset(verifier, VT_BINARY) && + verifier.VerifyVector(binary()) && + verifier.EndTable(); + } +}; + +struct ProgramBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_fingerprint(uint64_t fingerprint) { + fbb_.AddElement(Program::VT_FINGERPRINT, fingerprint, 0); + } + void add_binary(flatbuffers::Offset> binary) { + fbb_.AddOffset(Program::VT_BINARY, binary); + } + explicit ProgramBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + ProgramBuilder &operator=(const ProgramBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateProgram( + flatbuffers::FlatBufferBuilder &_fbb, + uint64_t fingerprint = 0, + flatbuffers::Offset> binary = 0) { + ProgramBuilder builder_(_fbb); + builder_.add_fingerprint(fingerprint); + builder_.add_binary(binary); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateProgramDirect( + flatbuffers::FlatBufferBuilder &_fbb, + uint64_t fingerprint = 0, + const std::vector *binary = nullptr) { + auto binary__ = binary ? 
_fbb.CreateVector(*binary) : 0; + return tflite::gpu::cl::data::CreateProgram( + _fbb, + fingerprint, + binary__); +} + +struct CompiledCache FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_DRIVER_VERSION = 4, + VT_PROGRAMS = 6 + }; + const flatbuffers::String *driver_version() const { + return GetPointer(VT_DRIVER_VERSION); + } + const flatbuffers::Vector> *programs() const { + return GetPointer> *>(VT_PROGRAMS); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_DRIVER_VERSION) && + verifier.VerifyString(driver_version()) && + VerifyOffset(verifier, VT_PROGRAMS) && + verifier.VerifyVector(programs()) && + verifier.VerifyVectorOfTables(programs()) && + verifier.EndTable(); + } +}; + +struct CompiledCacheBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_driver_version(flatbuffers::Offset driver_version) { + fbb_.AddOffset(CompiledCache::VT_DRIVER_VERSION, driver_version); + } + void add_programs(flatbuffers::Offset>> programs) { + fbb_.AddOffset(CompiledCache::VT_PROGRAMS, programs); + } + explicit CompiledCacheBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + CompiledCacheBuilder &operator=(const CompiledCacheBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateCompiledCache( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset driver_version = 0, + flatbuffers::Offset>> programs = 0) { + CompiledCacheBuilder builder_(_fbb); + builder_.add_programs(programs); + builder_.add_driver_version(driver_version); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateCompiledCacheDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const char *driver_version = nullptr, + const std::vector> *programs = nullptr) { + auto driver_version__ = driver_version ? _fbb.CreateString(driver_version) : 0; + auto programs__ = programs ? 
_fbb.CreateVector>(*programs) : 0; + return tflite::gpu::cl::data::CreateCompiledCache( + _fbb, + driver_version__, + programs__); +} + +inline const tflite::gpu::cl::data::CompiledCache *GetCompiledCache(const void *buf) { + return flatbuffers::GetRoot(buf); +} + +inline const tflite::gpu::cl::data::CompiledCache *GetSizePrefixedCompiledCache(const void *buf) { + return flatbuffers::GetSizePrefixedRoot(buf); +} + +inline const char *CompiledCacheIdentifier() { + return "AFCM"; +} + +inline bool CompiledCacheBufferHasIdentifier(const void *buf) { + return flatbuffers::BufferHasIdentifier( + buf, CompiledCacheIdentifier()); +} + +inline bool VerifyCompiledCacheBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifyBuffer(CompiledCacheIdentifier()); +} + +inline bool VerifySizePrefixedCompiledCacheBuffer( + flatbuffers::Verifier &verifier) { + return verifier.VerifySizePrefixedBuffer(CompiledCacheIdentifier()); +} + +inline const char *CompiledCacheExtension() { + return "jetbin"; +} + +inline void FinishCompiledCacheBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.Finish(root, CompiledCacheIdentifier()); +} + +inline void FinishSizePrefixedCompiledCacheBuffer( + flatbuffers::FlatBufferBuilder &fbb, + flatbuffers::Offset root) { + fbb.FinishSizePrefixed(root, CompiledCacheIdentifier()); +} + +} // namespace data +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // FLATBUFFERS_GENERATED_COMPILEDPROGRAMCACHE_TFLITE_GPU_CL_DATA_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/device_info.cc b/tensorflow/lite/delegates/gpu/cl/device_info.cc index 5d035e34617..43d050e8371 100644 --- a/tensorflow/lite/delegates/gpu/cl/device_info.cc +++ b/tensorflow/lite/delegates/gpu/cl/device_info.cc @@ -40,7 +40,8 @@ MaliGPU GetMaliGPUVersion(const std::string& device_name) { {"T830", MaliGPU::T830}, {"T860", MaliGPU::T860}, {"T880", MaliGPU::T880}, {"G31", MaliGPU::G31}, {"G51", MaliGPU::G51}, {"G71", MaliGPU::G71}, {"G52", MaliGPU::G52}, {"G72", MaliGPU::G72}, {"G76", MaliGPU::G76}, - {"G57", MaliGPU::G57}, {"G77", MaliGPU::G77}, + {"G57", MaliGPU::G57}, {"G77", MaliGPU::G77}, {"G68", MaliGPU::G68}, + {"G78", MaliGPU::G78}, }; for (const auto& v : kMapping) { if (device_name.find(v.first) != std::string::npos) { @@ -212,7 +213,8 @@ bool MaliInfo::IsBifrost() const { } bool MaliInfo::IsValhall() const { - return gpu_version == MaliGPU::G57 || gpu_version == MaliGPU::G77; + return gpu_version == MaliGPU::G57 || gpu_version == MaliGPU::G77 || + gpu_version == MaliGPU::G68 || gpu_version == MaliGPU::G78; } bool DeviceInfo::SupportsTextureArray() const { diff --git a/tensorflow/lite/delegates/gpu/cl/device_info.h b/tensorflow/lite/delegates/gpu/cl/device_info.h index abb3feb07b1..f28f4719232 100644 --- a/tensorflow/lite/delegates/gpu/cl/device_info.h +++ b/tensorflow/lite/delegates/gpu/cl/device_info.h @@ -95,6 +95,8 @@ enum class MaliGPU { G76, G57, G77, + G68, + G78, UNKNOWN }; diff --git a/tensorflow/lite/delegates/gpu/cl/environment.cc b/tensorflow/lite/delegates/gpu/cl/environment.cc index 785e88299a7..5b06b307133 100644 --- a/tensorflow/lite/delegates/gpu/cl/environment.cc +++ b/tensorflow/lite/delegates/gpu/cl/environment.cc @@ -18,7 +18,6 @@ limitations under the License. 
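[Editor's note] A short sketch of how the generated compiled_program_cache API added above can be used to write and then verify/read a program-binary cache. The builder contents here are placeholders; in the real code the binaries would come from the compiled OpenCL programs:

namespace data = tflite::gpu::cl::data;

// Writing the cache.
flatbuffers::FlatBufferBuilder builder;
std::vector<uint8_t> binary = {0xDE, 0xAD};  // placeholder program binary bytes
std::vector<flatbuffers::Offset<data::Program>> programs = {
    data::CreateProgramDirect(builder, /*fingerprint=*/0x1234, &binary)};
auto cache = data::CreateCompiledCacheDirect(
    builder, /*driver_version=*/"placeholder-driver-version", &programs);
data::FinishCompiledCacheBuffer(builder, cache);  // tags the buffer with "AFCM"

// Reading it back.
flatbuffers::Verifier verifier(builder.GetBufferPointer(), builder.GetSize());
if (data::VerifyCompiledCacheBuffer(verifier)) {
  const data::CompiledCache* cache_fb =
      data::GetCompiledCache(builder.GetBufferPointer());
  // cache_fb->driver_version(), cache_fb->programs(), ...
}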
#include #include -#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" @@ -26,59 +25,6 @@ namespace tflite { namespace gpu { namespace cl { namespace { - -std::string GetKernelOneLayerTextureArray() { - return R"( - -__kernel void main_function(__write_only image2d_array_t dst) { - int X = (int)(get_global_id(0)); - int Y = (int)(get_global_id(1)); - - write_imagef(dst, (int4)(X, Y, 0, 0), (float4)(2.0, 2.0, 2.0, 2.0)); -} -)"; -} - -// Some Adreno < 600 have bug with one layer texture array. b/131099086 -// If we have one layer texture array and will write smt from kernel to this -// texture, we will get zeroes instead of actual values. -// The same kernel will work, if we use texture array with more than one layer. -// With help of this code we can detect this bug. -absl::Status CheckKernelSupportOfOneLayerTextureArray(Environment* env, - bool* result) { - // No bug on Adreno 6xx - if (env->device().info_.adreno_info.gpu_version >= 600) { - *result = true; - return absl::OkStatus(); - } - CLKernel kernel; - RETURN_IF_ERROR(env->program_cache()->GetOrCreateCLKernel( - GetKernelOneLayerTextureArray(), "main_function", env->context(), - env->device(), &kernel)); - - Tensor tensor; - const BHWC shape(1, 4, 4, 4); - RETURN_IF_ERROR(CreateTensor( - env->context(), shape, - {DataType::FLOAT32, TensorStorageType::TEXTURE_ARRAY, Layout::HWC}, - &tensor)); - RETURN_IF_ERROR(kernel.SetMemory(0, tensor.GetMemoryPtr())); - RETURN_IF_ERROR(env->queue()->DispatchImplicit(kernel, {4, 4, 1}, {4, 4, 1})); - TensorFloat32 tensor_gpu; - tensor_gpu.shape = shape; - tensor_gpu.data.resize(shape.DimensionsProduct()); - RETURN_IF_ERROR(tensor.ReadData(env->queue(), &tensor_gpu)); - - *result = true; - for (int i = 0; i < 64; ++i) { - if (tensor_gpu.data[i] != 2.0) { - *result = false; - break; - } - } - return absl::OkStatus(); -} - absl::Status CreateEnvironment(Environment* result, bool shared, cl_context_properties egl_context, cl_context_properties egl_display) { @@ -99,16 +45,7 @@ absl::Status CreateEnvironment(Environment* result, bool shared, *result = Environment(std::move(gpu), std::move(context), std::move(queue), std::move(profiling_queue)); - if (result->device().IsAdreno() && result->device().SupportsTextureArray()) { - bool supports_one_layer; - RETURN_IF_ERROR( - CheckKernelSupportOfOneLayerTextureArray(result, &supports_one_layer)); - if (!supports_one_layer) { - result->GetDevicePtr()->DisableOneLayerTextureArray(); - } - } - - return absl::OkStatus(); + return result->Init(); } } // namespace @@ -141,10 +78,12 @@ Environment& Environment::operator=(Environment&& environment) { absl::Status Environment::Init() { if (device().IsAdreno() && device().SupportsTextureArray()) { - bool supports_one_layer; - RETURN_IF_ERROR( - CheckKernelSupportOfOneLayerTextureArray(this, &supports_one_layer)); - if (!supports_one_layer) { + // Some Adreno < 600 have bug with one layer texture array. b/131099086 + // If we have one layer texture array and will write smt from kernel to this + // texture, we will get zeroes instead of actual values. + // The same kernel will work, if we use texture array with more than one + // layer. 
+ if (device().info_.adreno_info.gpu_version < 600) { GetDevicePtr()->DisableOneLayerTextureArray(); } } @@ -232,54 +171,54 @@ bool Environment::IsSupported(TensorStorageType storage_type) const { return false; } -TensorStorageType GetFastestStorageType(const CLDevice& gpu) { - if (gpu.IsAdreno()) { - if (gpu.IsAdreno6xxOrHigher()) { +TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info) { + if (gpu_info.IsAdreno()) { + if (gpu_info.IsAdreno6xxOrHigher()) { return TensorStorageType::TEXTURE_ARRAY; } else { return TensorStorageType::TEXTURE_2D; } - } else if (gpu.IsPowerVR()) { + } else if (gpu_info.IsPowerVR()) { return TensorStorageType::TEXTURE_2D; - } else if (gpu.IsMali()) { - const MaliInfo mali_info = gpu.info_.mali_info; + } else if (gpu_info.IsMali()) { + const MaliInfo mali_info = gpu_info.mali_info; if (mali_info.IsMaliT8xx() || mali_info.IsBifrostGen3() || mali_info.IsValhall()) { return TensorStorageType::TEXTURE_2D; } else { return TensorStorageType::BUFFER; } - } else if (gpu.IsNvidia()) { - return gpu.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } else if (gpu.IsAMD()) { - return gpu.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } else if (gpu.IsIntel()) { + } else if (gpu_info.IsNvidia()) { + return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER + : TensorStorageType::BUFFER; + } else if (gpu_info.IsAMD()) { + return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER + : TensorStorageType::BUFFER; + } else if (gpu_info.IsIntel()) { return TensorStorageType::BUFFER; } return TensorStorageType::BUFFER; } TensorStorageType GetStorageTypeWithMinimalMemoryConsumption( - const CLDevice& gpu) { - if (gpu.IsAdreno()) { - if (gpu.IsAdreno3xx() || gpu.IsAdreno4xx()) { + const DeviceInfo& gpu_info) { + if (gpu_info.IsAdreno()) { + if (gpu_info.IsAdreno3xx() || gpu_info.IsAdreno4xx()) { return TensorStorageType::BUFFER; } else { return TensorStorageType::IMAGE_BUFFER; } - } else if (gpu.IsPowerVR()) { + } else if (gpu_info.IsPowerVR()) { return TensorStorageType::BUFFER; - } else if (gpu.IsMali()) { + } else if (gpu_info.IsMali()) { return TensorStorageType::BUFFER; - } else if (gpu.IsNvidia()) { - return gpu.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } else if (gpu.IsAMD()) { - return gpu.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER - : TensorStorageType::BUFFER; - } else if (gpu.IsIntel()) { + } else if (gpu_info.IsNvidia()) { + return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER + : TensorStorageType::BUFFER; + } else if (gpu_info.IsAMD()) { + return gpu_info.SupportsImageBuffer() ? TensorStorageType::IMAGE_BUFFER + : TensorStorageType::BUFFER; + } else if (gpu_info.IsIntel()) { return TensorStorageType::BUFFER; } return TensorStorageType::BUFFER; diff --git a/tensorflow/lite/delegates/gpu/cl/environment.h b/tensorflow/lite/delegates/gpu/cl/environment.h index 640f2d8cac3..1f5b4befdce 100644 --- a/tensorflow/lite/delegates/gpu/cl/environment.h +++ b/tensorflow/lite/delegates/gpu/cl/environment.h @@ -19,9 +19,9 @@ limitations under the License. 
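[Editor's note] Since GetFastestStorageType and GetStorageTypeWithMinimalMemoryConsumption now take a DeviceInfo rather than a CLDevice, call sites can go through the new CLDevice::GetInfo() accessor. Illustrative call site, assuming an initialized Environment named env:

const DeviceInfo& gpu_info = env.device().GetInfo();
// Fastest layout for this GPU family (texture-based where beneficial).
TensorStorageType storage_type = GetFastestStorageType(gpu_info);
// Or trade speed for a smaller memory footprint.
TensorStorageType low_mem_type =
    GetStorageTypeWithMinimalMemoryConsumption(gpu_info);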
#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/program_cache.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -75,9 +75,9 @@ class Environment { ProgramCache program_cache_; }; -TensorStorageType GetFastestStorageType(const CLDevice& gpu); +TensorStorageType GetFastestStorageType(const DeviceInfo& gpu_info); TensorStorageType GetStorageTypeWithMinimalMemoryConsumption( - const CLDevice& gpu); + const DeviceInfo& gpu_info); absl::Status CreateEnvironment(Environment* result); diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.cc b/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.cc index fc8fcde439b..e0933ed56e1 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.cc +++ b/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.cc @@ -27,7 +27,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model_builder.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h" +#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h" namespace tflite { namespace gpu { @@ -97,8 +97,8 @@ class Delegate { // Apply general transformations on the graph. NullTransformationReporter reporter; ModelTransformer transformer(&graph, &reporter); - if (!ApplyGeneralTransformations(&transformer)) { - return absl::InternalError("Graph general transformations failed"); + if (!ApplyModelTransformations(&transformer)) { + return absl::InternalError("Graph transformations failed"); } InferenceEnvironmentOptions env_options; diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_object.h b/tensorflow/lite/delegates/gpu/cl/gpu_object.h index 297a5f70858..abd77a4489b 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_object.h +++ b/tensorflow/lite/delegates/gpu/cl/gpu_object.h @@ -23,6 +23,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" +#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" #include "tensorflow/lite/delegates/gpu/common/access_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -164,6 +165,10 @@ class GPUObjectDescriptor { AccessType GetAccess() const { return access_type_; } protected: + friend flatbuffers::Offset Encode( + const GPUObjectDescriptor& desc, flatbuffers::FlatBufferBuilder* builder); + friend void Decode(const data::GPUObjectDescriptor* fb_obj, + GPUObjectDescriptor* obj); mutable std::map state_vars_; AccessType access_type_; }; diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc index 9cb8ddee818..ca0c0319f54 100644 --- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc +++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc @@ -153,7 +153,7 @@ CLNode& CLNode::operator=(CLNode&& node) { absl::Status InferenceContext::InitFromGraph( const CreateInferenceInfo& create_info, const GraphFloat32& graph, - Environment* env) { + Environment* env, std::vector* serialized_model) { CreationContext creation_context; creation_context.device = env->GetDevicePtr(); creation_context.context = &env->context(); @@ -188,15 +188,63 @@ absl::Status InferenceContext::InitFromGraph( if (create_info.hints.Check(ModelHints::kFastTuning)) { tuning_parameters.tuning_type = TuningType::FAST; } + if (tuning_parameters.info->IsMali()) { + const MaliInfo& info = tuning_parameters.info->mali_info; + if (info.IsMaliT6xx()) { + // Mali T628 hangs forever in clFinish when used profiling queue + // TuningType::FAST does not use profiling queue. 
+ tuning_parameters.tuning_type = TuningType::FAST; + } + } RETURN_IF_ERROR(Tune(tuning_parameters)); + + if (serialized_model) { + flatbuffers::FlatBufferBuilder builder; + auto encoded_fb = Encode(*this, &builder); + data::FinishInferenceContextBuffer(builder, encoded_fb); + serialized_model->resize(builder.GetSize()); + std::memcpy(serialized_model->data(), builder.GetBufferPointer(), + builder.GetSize()); + } + for (auto& node : nodes_) { + node.operation->args_.ReleaseCPURepresentation(); + } + return absl::OkStatus(); +} + +absl::Status InferenceContext::RestoreDeserialized( + const std::vector& serialized_model, Environment* env) { + flatbuffers::Verifier verifier(serialized_model.data(), + serialized_model.size()); + if (!data::VerifyInferenceContextBuffer(verifier)) { + return absl::DataLossError("Deserialization failed."); + } + auto decoded_fb = data::GetInferenceContext(serialized_model.data()); + RETURN_IF_ERROR(Decode(&env->context(), decoded_fb, this)); + + CreationContext creation_context; + creation_context.device = env->GetDevicePtr(); + creation_context.context = &env->context(); + creation_context.queue = env->queue(); + creation_context.cache = env->program_cache(); + + RETURN_IF_ERROR(AllocateMemory(creation_context.context)); + BindMemoryToOperations(); + for (auto& node : nodes_) { + RETURN_IF_ERROR(node.operation->CompileDeserialized(creation_context)); + } + RETURN_IF_ERROR(UpdateParams()); + for (auto& node : nodes_) { + node.operation->args_.ReleaseCPURepresentation(); + } return absl::OkStatus(); } absl::Status InferenceContext::InitFromGraphWithTransforms( const CreateInferenceInfo& create_info, GraphFloat32* graph, - Environment* env) { + Environment* env, std::vector* serialized_model) { RETURN_IF_ERROR(RunGraphTransforms(graph)); - RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env)); + RETURN_IF_ERROR(InitFromGraph(create_info, *graph, env, serialized_model)); return absl::OkStatus(); } @@ -206,6 +254,11 @@ void InferenceContext::CopyInAndOutIds(const GraphFloat32& graph) { input_ids_.push_back(input->id); } + const auto variable_inputs = graph.variable_inputs(); + for (const auto& variable_input : variable_inputs) { + variable_ids_and_refs_[variable_input->id] = variable_input->tensor.ref; + } + const auto outputs = graph.outputs(); for (const auto& output : outputs) { output_ids_.push_back(output->id); @@ -261,10 +314,12 @@ absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info, if (consumed_nodes.find(node.id) != consumed_nodes.end()) { continue; } + std::string op_name = node.operation.type + " " + std::to_string(node.id); GPUOperationsSubgraph gpu_subgraph; if (hints.Check(ModelHints::kAllowSpecialKernels) && GPUSubgraphFromGraph(device_info, precision_, graph, node.id, - tensor_descriptors, &consumed_nodes, &gpu_subgraph) + tensor_descriptors, &consumed_nodes, &gpu_subgraph, + &op_name) .ok()) { // Mapping of subgraph (set of nodes) to GPU operations. Should happen // before straigtforward mapping. 
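[Editor's note] A minimal sketch of the serialization round-trip introduced here, assuming a prepared CreateInferenceInfo, GraphFloat32, and Environment (variable names are illustrative). InitFromGraph optionally emits the serialized model; RestoreDeserialized verifies the buffer (returning DataLossError on a parse failure), rebuilds the nodes, recompiles them via CompileDeserialized, and rebinds memory, so the original graph is not needed again:

// Build once and capture the serialized form.
InferenceContext context;
std::vector<uint8_t> serialized_model;
RETURN_IF_ERROR(
    context.InitFromGraph(create_info, graph, &env, &serialized_model));
// serialized_model can now be cached, e.g. keyed by the driver version.

// Later, on the same device/driver, restore without the GraphFloat32.
InferenceContext restored;
RETURN_IF_ERROR(restored.RestoreDeserialized(serialized_model, &env));

// Run and read back the first output (accessors added in inference_context.h);
// input tensors are assumed to have been filled before this point.
RETURN_IF_ERROR(restored.AddToQueue(env.queue()));
RETURN_IF_ERROR(env.queue()->WaitForCompletion());
TensorFloat32 output;
RETURN_IF_ERROR(
    restored.GetOutputTensor(restored.GetOutputIds()[0], env.queue(), &output));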
@@ -333,7 +388,7 @@ absl::Status InferenceContext::ConvertOperations(const DeviceInfo& device_info, cl_node.outputs[j] = mapping_to_global_ids[-(id + 1)]; } } - cl_node.name = node.operation.type + " " + std::to_string(node.id); + cl_node.name = op_name; nodes_.push_back(std::move(cl_node)); } } @@ -387,41 +442,71 @@ absl::Status InferenceContext::Merge() { return absl::OkStatus(); } -void InferenceContext::GetUsages( - const std::function& functor, - std::map* usages) { +void InferenceContext::GetUsages(const std::function& functor, + std::map* usages) { for (ValueId in_id : input_ids_) { - const auto& desc = tensor_reserver_.Get(in_id).descriptor; - if (functor(desc)) { + if (functor(in_id)) { AddUsage(in_id, 0, usages); } } for (int op_index = 0; op_index < nodes_.size(); ++op_index) { auto tensors = GetCLNodeTensors(nodes_[op_index]); for (auto& tensor : tensors) { - if (functor(tensor.second)) { + if (functor(tensor.first)) { AddUsage(tensor.first, op_index, usages); } } } for (ValueId out_id : output_ids_) { - const auto& desc = tensor_reserver_.Get(out_id).descriptor; - if (functor(desc)) { + if (functor(out_id)) { AddUsage(out_id, nodes_.size(), usages); } } } +InferenceContext::TensorMemoryType InferenceContext::GetTensorMemoryType( + ValueId id) { + if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) { + return TensorMemoryType::VARIABLE; + } else if (IsBufferBased(tensor_reserver_.Get(id).descriptor.storage_type)) { + return TensorMemoryType::BUFFER; + } else { + return TensorMemoryType::STRONG_SHAPE; + } +} + absl::Status InferenceContext::AllocateMemory(CLContext* context) { + RETURN_IF_ERROR(AllocateMemoryForVariableTensors(context)); RETURN_IF_ERROR(AllocateMemoryForBuffers(context)); RETURN_IF_ERROR(AllocateMemoryForStrongShapes(context)); return absl::OkStatus(); } +absl::Status InferenceContext::AllocateMemoryForVariableTensors( + CLContext* context) { + std::map ref_value_to_tensor_index; + + for (auto value_and_ref_value : variable_ids_and_refs_) { + if (ref_value_to_tensor_index.find(value_and_ref_value.second) == + ref_value_to_tensor_index.end()) { + const auto& t = tensor_reserver_.Get(value_and_ref_value.first); + const auto& shape = t.shape; + const auto& descriptor = t.descriptor; + + RETURN_IF_ERROR( + CreateTensor(*context, shape, descriptor, + &variable_tensors_[value_and_ref_value.second])); + } + } + return absl::OkStatus(); +} + absl::Status InferenceContext::AllocateMemoryForBuffers(CLContext* context) { std::map buffer_usages; GetUsages( - [](const TensorDescriptor& t) { return IsBufferBased(t.storage_type); }, + [this](ValueId id) { + return GetTensorMemoryType(id) == TensorMemoryType::BUFFER; + }, &buffer_usages); std::vector> buffer_usage_records; @@ -455,7 +540,7 @@ absl::Status InferenceContext::AllocateMemoryForBuffers(CLContext* context) { for (auto& node : nodes_) { auto tensors = GetCLNodeTensors(node); for (auto& t : tensors) { - if (!IsBufferBased(t.second.storage_type)) continue; + if (GetTensorMemoryType(t.first) != TensorMemoryType::BUFFER) continue; const int tensor_index = graph_ids_to_shared_buffer_tensors_[t.first]; if (created_tensors[tensor_index]) continue; const auto& shape = tensor_reserver_.Get(t.first).shape; @@ -473,7 +558,9 @@ absl::Status InferenceContext::AllocateMemoryForStrongShapes( CLContext* context) { std::map usages; GetUsages( - [](const TensorDescriptor& t) { return !IsBufferBased(t.storage_type); }, + [this](ValueId id) { + return GetTensorMemoryType(id) == TensorMemoryType::STRONG_SHAPE; + }, 
&usages); std::vector> usage_records; @@ -492,7 +579,9 @@ absl::Status InferenceContext::AllocateMemoryForStrongShapes( for (auto& node : nodes_) { auto tensors = GetCLNodeTensors(node); for (auto& t : tensors) { - if (IsBufferBased(t.second.storage_type)) continue; + if (GetTensorMemoryType(t.first) != TensorMemoryType::STRONG_SHAPE) { + continue; + } const auto& shape = tensor_reserver_.Get(t.first).shape; const auto id = assignment.object_ids[remap_from_graph_ids[t.first]]; graph_ids_to_strong_shape_tensors_[t.first] = id; @@ -581,13 +670,18 @@ uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors() for (const auto& b : shared_buffers_) { total_memory += b.GetMemorySizeInBytes(); } + for (const auto& t : variable_tensors_) { + total_memory += t.second.GetMemorySizeInBytes(); + } return total_memory; } Tensor* InferenceContext::GetTensor(ValueId id) { - if (graph_ids_to_shared_buffer_tensors_.find(id) != - graph_ids_to_shared_buffer_tensors_.end()) { + if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) { + return &variable_tensors_[variable_ids_and_refs_[id]]; + } else if (graph_ids_to_shared_buffer_tensors_.find(id) != + graph_ids_to_shared_buffer_tensors_.end()) { return &shared_buffer_tensors_[graph_ids_to_shared_buffer_tensors_[id]]; } else { return &strong_shape_tensors_[graph_ids_to_strong_shape_tensors_[id]]; diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.h b/tensorflow/lite/delegates/gpu/cl/inference_context.h index 8486f2ddcd3..ec8055ebcde 100644 --- a/tensorflow/lite/delegates/gpu/cl/inference_context.h +++ b/tensorflow/lite/delegates/gpu/cl/inference_context.h @@ -26,10 +26,12 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/buffer.h" #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/cl/environment.h" +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" #include "tensorflow/lite/delegates/gpu/cl/model_hints.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -62,15 +64,17 @@ class InferenceContext { TensorStorageType storage_type; ModelHints hints; }; + absl::Status InitFromGraph(const CreateInferenceInfo& create_info, - const GraphFloat32& graph, Environment* env); + const GraphFloat32& graph, Environment* env, + std::vector* serialized_model = nullptr); // Applies OpenCL-specific transformations to the graph before the // initialization. These transformations are either impossible or useless in // other backends. 
absl::Status InitFromGraphWithTransforms( const CreateInferenceInfo& create_info, GraphFloat32* graph, - Environment* env); + Environment* env, std::vector* serialized_model = nullptr); absl::Status AddToQueue(CLCommandQueue* queue); absl::Status Profile(ProfilingCommandQueue* queue, ProfilingInfo* result); @@ -87,7 +91,22 @@ class InferenceContext { absl::Status GetOutputTensor(ValueId id, CLCommandQueue* queue, TensorFloat32* result); + const std::vector& GetInputIds() const { return input_ids_; } + const std::vector& GetOutputIds() const { return output_ids_; } + + absl::Status RestoreDeserialized(const std::vector& serialized_model, + Environment* env); + private: + enum TensorMemoryType { STRONG_SHAPE = 0, BUFFER = 1, VARIABLE = 2 }; + + friend flatbuffers::Offset Encode( + const InferenceContext& inference, + flatbuffers::FlatBufferBuilder* builder); + friend absl::Status Decode(CLContext* context, + const data::InferenceContext* fb_inference, + InferenceContext* inference); + void CopyInAndOutIds(const GraphFloat32& graph); absl::Status ConvertOperations(const DeviceInfo& device_info, const GraphFloat32& graph, ModelHints hints); @@ -98,14 +117,18 @@ class InferenceContext { absl::Status Merge(); absl::Status AllocateMemory(CLContext* context); + absl::Status AllocateMemoryForVariableTensors(CLContext* context); + absl::Status AllocateMemoryForBuffers(CLContext* context); absl::Status AllocateMemoryForStrongShapes(CLContext* context); // utility function - void GetUsages(const std::function& functor, + void GetUsages(const std::function& functor, std::map* usages); + TensorMemoryType GetTensorMemoryType(ValueId id); + void BindMemoryToOperations(); absl::Status Compile(const CreationContext& creation_context); absl::Status Tune(const TuningParameters& tuning_parameters); @@ -154,12 +177,39 @@ class InferenceContext { void SetNext(ValueId id) { next_ = id; } DummyTensor Get(ValueId id) { return reservations_[id]; } + std::vector> GetTensorDescs() const { + std::vector> result; + for (auto& v : reservations_) { + TensorDescriptor desc = v.second.descriptor; + desc.shape.b = v.second.shape.b; + desc.shape.h = v.second.shape.h; + desc.shape.w = v.second.shape.w; + desc.shape.d = 1; + desc.shape.c = v.second.shape.c; + result.push_back({v.first, desc}); + } + return result; + } + + void Add(const std::vector>& tensors) { + for (auto& v : tensors) { + DummyTensor dummy; + dummy.descriptor = v.second; + dummy.shape.b = v.second.shape.b; + dummy.shape.h = v.second.shape.h; + dummy.shape.w = v.second.shape.w; + dummy.shape.c = v.second.shape.c; + Add(v.first, dummy); + } + } + private: absl::flat_hash_map reservations_; ValueId next_; }; TensorReserver tensor_reserver_; + std::map variable_tensors_; std::vector shared_buffers_; std::vector shared_buffer_tensors_; // use references to memory from shared_buffers_ @@ -169,6 +219,7 @@ class InferenceContext { std::map graph_ids_to_strong_shape_tensors_; std::vector input_ids_; + std::map variable_ids_and_refs_; std::vector output_ids_; }; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 02f5f9c4a4a..d7e7c7dd498 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -104,32 +104,6 @@ cc_library( ], ) -cc_library( - name = "conv_3d", - srcs = ["conv_3d.cc"], - hdrs = ["conv_3d.h"], - deps = [ - ":gpu_operation", - ":util", - ":work_group_picking", - "//tensorflow/lite/delegates/gpu/cl:buffer", - 
"//tensorflow/lite/delegates/gpu/cl:cl_device", - "//tensorflow/lite/delegates/gpu/cl:linear_storage", - "//tensorflow/lite/delegates/gpu/cl:precision", - "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/cl:tensor_type", - "//tensorflow/lite/delegates/gpu/cl:texture2d", - "//tensorflow/lite/delegates/gpu/cl:util", - "//tensorflow/lite/delegates/gpu/common:data_type", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common:types", - "@com_google_absl//absl/strings", - ], -) - cc_library( name = "conv_buffer_1x1", srcs = ["conv_buffer_1x1.cc"], @@ -233,6 +207,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl:precision", "//tensorflow/lite/delegates/gpu/cl:tensor", "//tensorflow/lite/delegates/gpu/cl:tensor_type", + "//tensorflow/lite/delegates/gpu/cl:texture2d", "//tensorflow/lite/delegates/gpu/cl:util", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:operations", @@ -241,6 +216,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common:winograd_util", + "@com_google_absl//absl/strings", ], ) @@ -263,50 +239,6 @@ cc_test( ], ) -cc_library( - name = "conv_texture", - srcs = ["conv_texture.cc"], - hdrs = ["conv_texture.h"], - deps = [ - ":gpu_operation", - ":util", - ":work_group_picking", - "//tensorflow/lite/delegates/gpu/cl:cl_command_queue", - "//tensorflow/lite/delegates/gpu/cl:cl_context", - "//tensorflow/lite/delegates/gpu/cl:linear_storage", - "//tensorflow/lite/delegates/gpu/cl:precision", - "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/cl:tensor_type", - "//tensorflow/lite/delegates/gpu/cl:texture2d", - "//tensorflow/lite/delegates/gpu/cl:util", - "//tensorflow/lite/delegates/gpu/common:data_type", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common:types", - "//tensorflow/lite/delegates/gpu/common:winograd_util", - "@com_google_absl//absl/strings", - ], -) - -cc_test( - name = "conv_texture_test", - srcs = ["conv_texture_test.cc"], - linkstatic = True, - tags = tf_gpu_tests_tags() + [ - "linux", - "local", - ], - deps = [ - ":cl_test", - ":conv_texture", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:status", - "@com_google_googletest//:gtest_main", - ], -) - cc_library( name = "conv_weights_converter", srcs = ["conv_weights_converter.cc"], @@ -384,30 +316,6 @@ cc_test( ], ) -cc_library( - name = "convolution_transposed_3d", - srcs = ["convolution_transposed_3d.cc"], - hdrs = ["convolution_transposed_3d.h"], - deps = [ - ":gpu_operation", - ":util", - ":work_group_picking", - "//tensorflow/lite/delegates/gpu/cl:buffer", - "//tensorflow/lite/delegates/gpu/cl:linear_storage", - "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/cl:tensor_type", - "//tensorflow/lite/delegates/gpu/cl:texture2d", - "//tensorflow/lite/delegates/gpu/cl:util", - "//tensorflow/lite/delegates/gpu/common:data_type", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - 
"//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common:types", - "@com_google_absl//absl/strings", - ], -) - cc_library( name = "convolution_transposed_3x3", srcs = ["convolution_transposed_3x3.cc"], @@ -681,17 +589,24 @@ cc_library( hdrs = ["fully_connected.h"], deps = [ ":gpu_operation", + ":tuning_parameters", ":util", + "//tensorflow/lite/delegates/gpu/cl:arguments", "//tensorflow/lite/delegates/gpu/cl:buffer", + "//tensorflow/lite/delegates/gpu/cl:cl_kernel", + "//tensorflow/lite/delegates/gpu/cl:device_info", "//tensorflow/lite/delegates/gpu/cl:linear_storage", + "//tensorflow/lite/delegates/gpu/cl:precision", "//tensorflow/lite/delegates/gpu/cl:tensor", - "//tensorflow/lite/delegates/gpu/cl:util", + "//tensorflow/lite/delegates/gpu/cl:tensor_type", + "//tensorflow/lite/delegates/gpu/cl:texture2d", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", + "//tensorflow/lite/delegates/gpu/common:util", + "@com_google_absl//absl/memory", ], ) @@ -706,8 +621,14 @@ cc_test( deps = [ ":cl_test", ":fully_connected", + ":gpu_operation", + "//tensorflow/lite/delegates/gpu/cl:environment", + "//tensorflow/lite/delegates/gpu/cl:precision", + "//tensorflow/lite/delegates/gpu/cl:tensor_type", + "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_googletest//:gtest_main", ], ) @@ -722,13 +643,19 @@ cc_library( ":work_group_picking", "//tensorflow/lite/delegates/gpu/cl:arguments", "//tensorflow/lite/delegates/gpu/cl:buffer", + "//tensorflow/lite/delegates/gpu/cl:cl_command_queue", "//tensorflow/lite/delegates/gpu/cl:cl_context", "//tensorflow/lite/delegates/gpu/cl:cl_device", + "//tensorflow/lite/delegates/gpu/cl:cl_kernel", + "//tensorflow/lite/delegates/gpu/cl:cl_program", + "//tensorflow/lite/delegates/gpu/cl:device_info", "//tensorflow/lite/delegates/gpu/cl:precision", "//tensorflow/lite/delegates/gpu/cl:program_cache", + "//tensorflow/lite/delegates/gpu/cl:serialization_cc_fbs", "//tensorflow/lite/delegates/gpu/cl:tensor", "//tensorflow/lite/delegates/gpu/cl:tensor_type", "//tensorflow/lite/delegates/gpu/common:access_type", + "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:types", "@com_google_absl//absl/strings", @@ -767,6 +694,25 @@ cc_test( ], ) +cc_test( + name = "lstm_full_test", + srcs = ["lstm_full_test.cc"], + linkstatic = True, + tags = tf_gpu_tests_tags() + [ + "linux", + "local", + ], + deps = [ + "//tensorflow/lite:framework", + "//tensorflow/lite/delegates/gpu:delegate", + "//tensorflow/lite/kernels:test_main", + "//tensorflow/lite/kernels:test_util", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + cc_library( name = "mean_stddev_normalization", srcs = ["mean_stddev_normalization.cc"], @@ -1005,6 +951,37 @@ cc_test( ], ) +cc_library( + name = "reduce", + srcs = ["reduce.cc"], + hdrs = ["reduce.h"], + deps = [ + ":gpu_operation", + ":util", + "//tensorflow/lite/delegates/gpu/cl:precision", + 
"//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:types", + ], +) + +cc_test( + name = "reduce_test", + srcs = ["reduce_test.cc"], + linkstatic = True, + tags = tf_gpu_tests_tags() + [ + "linux", + "local", + ], + deps = [ + ":cl_test", + ":reduce", + "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:status", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "relu", srcs = ["relu.cc"], @@ -1259,7 +1236,7 @@ cc_library( hdrs = ["tuning_parameters.h"], deps = [ "//tensorflow/lite/delegates/gpu/cl:cl_command_queue", - "//tensorflow/lite/delegates/gpu/cl:cl_device", + "//tensorflow/lite/delegates/gpu/cl:device_info", ], ) @@ -1302,11 +1279,8 @@ cc_library( deps = [ "//tensorflow/lite/delegates/gpu/cl:device_info", "//tensorflow/lite/delegates/gpu/cl:precision", - "//tensorflow/lite/delegates/gpu/cl:tensor_type", - "//tensorflow/lite/delegates/gpu/common:access_type", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:tensor", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common:util", @@ -1364,9 +1338,8 @@ cc_library( hdrs = ["work_group_picking.h"], deps = [ ":tuning_parameters", - "//tensorflow/lite/delegates/gpu/cl:cl_command_queue", "//tensorflow/lite/delegates/gpu/cl:cl_kernel", - "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/cl:device_info", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common:util", "//tensorflow/lite/delegates/gpu/common:workgroup_selection", @@ -1381,7 +1354,6 @@ test_suite( "conv_buffer_1x1_test", "conv_constants_test", "conv_powervr_test", - "conv_texture_test", "convolution_transposed_3x3_thin_test", "convolution_transposed_4x4_test", "convolution_transposed_test", @@ -1397,6 +1369,7 @@ test_suite( "padding_test", "pooling_test", "prelu_test", + "reduce_test", "relu_test", "reshape_test", "reshapex4_test", diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc index 0112241117e..efe97f9931b 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/cl_test.cc @@ -55,6 +55,7 @@ absl::Status ExecuteGPUOperation(const std::vector& src_cpu, RETURN_IF_ERROR(operation->Compile(creation_context)); RETURN_IF_ERROR(operation->UpdateParams()); + operation->args_.ReleaseCPURepresentation(); RETURN_IF_ERROR(operation->AddToQueue(creation_context.queue)); RETURN_IF_ERROR(creation_context.queue->WaitForCompletion()); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc deleted file mode 100644 index 06664f67768..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.cc +++ /dev/null @@ -1,863 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h" - -#include -#include -#include - -#include "absl/strings/substitute.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" -#include "tensorflow/lite/delegates/gpu/cl/precision.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" - -namespace tflite { -namespace gpu { -namespace cl { -namespace { -std::string GenerateUploadByThreads(const std::string& local_ptr_name, - const std::string& global_ptr_name, - const std::string& global_offset_name, - const std::string& lid_name, - int total_work_items, - int elements_to_upload) { - std::string c; - std::string offset = - global_offset_name.empty() ? "" : global_offset_name + " + "; - const int groups = elements_to_upload / total_work_items; - const int reminder = elements_to_upload % total_work_items; - for (int i = 0; i < groups; ++i) { - c += " " + local_ptr_name + "[" + lid_name + " + " + - std::to_string(total_work_items * i) + "] = " + global_ptr_name + "[" + - offset + lid_name + " + " + std::to_string(total_work_items * i) + - "];\n"; - } - if (reminder != 0) { - c += " if (" + lid_name + " < " + std::to_string(reminder) + ") {\n"; - c += " " + local_ptr_name + "[" + lid_name + " + " + - std::to_string(total_work_items * groups) + "] = " + global_ptr_name + - "[" + offset + lid_name + " + " + - std::to_string(total_work_items * groups) + "];\n"; - c += " }\n"; - } - return c; -} - -std::string GenerateAsyncUpload(const std::string& local_ptr_name, - const std::string& global_ptr_name, - const std::string& global_offset_name, - int elements_to_upload) { - std::string c; - std::string offset = - global_offset_name.empty() ? 
"" : " + " + global_offset_name; - c += " async_work_group_copy(" + local_ptr_name + ", " + global_ptr_name + - offset + ", " + std::to_string(elements_to_upload) + ", 0);\n"; - return c; -} - -std::string GenerateGlobalCoordinates(const int4& block_size, - const int3& work_group_launch_order) { - std::string c; - int3 launch_remap; - launch_remap[work_group_launch_order.x] = 0; - launch_remap[work_group_launch_order.y] = 1; - launch_remap[work_group_launch_order.z] = 2; - if (work_group_launch_order[0] == 0) { - c += " int DST_X = get_global_id(0) * " + std::to_string(block_size.x) + - ";\n"; - } else { - c += " int DST_X = (get_group_id(" + std::to_string(launch_remap[0]) + - ") * get_local_size(0) + get_local_id(0)) * " + - std::to_string(block_size.x) + ";\n"; - } - if (work_group_launch_order[1] == 1) { - c += " int DST_Y = get_global_id(1) * " + std::to_string(block_size.y) + - ";\n"; - } else { - c += " int DST_Y = (get_group_id(" + std::to_string(launch_remap[1]) + - ") * get_local_size(1) + get_local_id(1)) * " + - std::to_string(block_size.y) + ";\n"; - } - if (work_group_launch_order[2] == 2) { - c += " int linear_id_z = get_global_id(2);\n"; - } else { - c += " int linear_id_z = get_group_id(" + std::to_string(launch_remap[2]) + - ") * get_local_size(2) + get_local_id(2);\n"; - } - c += " int DST_S = (linear_id_z % args.grid_size_s) * " + - std::to_string(block_size.w) + ";\n"; - c += " int DST_Z = (linear_id_z / args.grid_size_s) * " + - std::to_string(block_size.z) + ";\n"; - return c; -} - -std::string GenerateConv(CalculationsPrecision precision, - const int4& block_size, int offset, - bool weights_are_buffer) { - std::string c; - const std::string channels[] = {"x", "y", "z", "w"}; - for (int s = 0; s < block_size.w; ++s) { - switch (precision) { - case CalculationsPrecision::F32: - case CalculationsPrecision::F16: - for (int ch = 0; ch < 4; ++ch) { - const std::string weight_id = std::to_string(s * 4 + ch + offset); - std::string weight_name; - if (weights_are_buffer) { - weight_name = "weights_cache[" + weight_id + "]"; - } else { - weight_name = "f" + weight_id; - } - for (int z = 0; z < block_size.z; ++z) { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - std::string id = - std::to_string(z) + std::to_string(y) + std::to_string(x); - c += " r" + std::to_string(s) + id + " += " + weight_name + - " * src" + id + "." 
+ channels[ch] + ";\n"; - } - } - } - } - break; - case CalculationsPrecision::F32_F16: - for (int z = 0; z < block_size.z; ++z) { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - std::string id = - std::to_string(z) + std::to_string(y) + std::to_string(x); - std::vector weight_names(4); - for (int i = 0; i < 4; ++i) { - std::string weight_id = std::to_string(s * 4 + i + offset); - if (weights_are_buffer) { - weight_names[i] = "weights_cache[" + weight_id + "]"; - } else { - weight_names[i] = "f" + weight_id; - } - } - c += absl::Substitute( - " $0 += convert_float4($1.x * $2 + $1.y * $3 + $1.z * " - "$4 + $1.w * $5);\n", - "r" + std::to_string(s) + id, "src" + id, weight_names[0], - weight_names[1], weight_names[2], weight_names[3]); - } - } - } - break; - } - } - return c; -} -} // namespace - -Conv3D::Conv3D(const OperationDef& definition, - const Convolution3DAttributes& attr, - const DeviceInfo& device_info) - : GPUOperation(definition), - stride_(attr.strides.w, attr.strides.h, attr.strides.d), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, - -attr.padding.prepended.d), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h, - attr.weights.shape.d), - dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d), - conv_params_(GuessBestParams(device_info, definition, attr)) { - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - code_ = GenerateConv3D(definition_, stride_correction, conv_params_); - if (definition_.precision == CalculationsPrecision::F16 && - device_info.IsPowerVR()) { - compiler_options_.push_back(CompilerOptions::POWERVR_FP16); - } -} - -Conv3D::Conv3D(Conv3D&& operation) - : GPUOperation(std::move(operation)), - stride_(operation.stride_), - padding_(operation.padding_), - kernel_size_(operation.kernel_size_), - dilation_(operation.dilation_), - conv_params_(operation.conv_params_) {} - -Conv3D& Conv3D::operator=(Conv3D&& operation) { - if (this != &operation) { - std::swap(stride_, operation.stride_); - std::swap(padding_, operation.padding_); - std::swap(kernel_size_, operation.kernel_size_); - std::swap(dilation_, operation.dilation_); - std::swap(conv_params_, operation.conv_params_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -absl::Status Conv3D::BindArguments() { - if (!conv_params_.x_kernel_is_1) { - RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); - RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x)); - RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); - } - if (!conv_params_.y_kernel_is_1) { - RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); - RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); - RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y)); - RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y)); - } - if (!conv_params_.z_kernel_is_1) { - RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z)); - RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z)); - RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z)); - RETURN_IF_ERROR(args_.SetInt("dilation_z", dilation_.z)); - } - return args_.SetInt("grid_size_s", DivideRoundUp(dst_[0]->Slices(), - conv_params_.block_size.w)); -} - -int3 Conv3D::GetGridSize() const { - const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), - conv_params_.block_size.x); - const int grid_y = - DivideRoundUp(dst_[0]->Height(), 
conv_params_.block_size.y); - const int grid_z = - DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w) * - DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z); - int3 wg; - wg.x = DivideRoundUp(grid_x, work_group_size_.x); - wg.y = DivideRoundUp(grid_y, work_group_size_.y); - wg.z = DivideRoundUp(grid_z, work_group_size_.z); - return int3(wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x, - wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, - wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z); -} - -void Conv3D::GetPossibleKernelWorkGroups(TuningType tuning_type, - const DeviceInfo& device_info, - const KernelInfo& kernel_info, - std::vector* work_groups) const { - if (conv_params_.weights_upload_type == - WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP || - conv_params_.weights_upload_type == - WeightsUploadType::LOCAL_MEM_BY_THREADS) { - work_groups->push_back(work_group_size_); - return; - } - if (conv_params_.work_group_launch_order[0] == 0 && - conv_params_.work_group_launch_order[1] == 1 && - conv_params_.work_group_launch_order[2] == 2) { - GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, - work_groups); - } else { - work_groups->push_back(work_group_size_); - } -} - -std::string Conv3D::GenerateConv3D(const OperationDef& op_def, - bool stride_correction, - const Conv3D::ConvParams& conv_params) { - auto src_desc = op_def.src_tensors[0]; - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - if (op_def.IsBatchSupported()) { - src_desc.SetStateVar("BatchedWidth", "true"); - } - AddSrcTensor("src_tensor", src_desc); - - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - AddDstTensor("dst_tensor", dst_desc); - - if (!conv_params_.x_kernel_is_1) { - args_.AddInt("stride_x"); - args_.AddInt("padding_x"); - args_.AddInt("kernel_size_x"); - args_.AddInt("dilation_x"); - } - if (!conv_params_.y_kernel_is_1) { - args_.AddInt("stride_y"); - args_.AddInt("padding_y"); - args_.AddInt("kernel_size_y"); - args_.AddInt("dilation_y"); - } - if (!conv_params_.z_kernel_is_1) { - args_.AddInt("stride_z"); - args_.AddInt("padding_z"); - args_.AddInt("kernel_size_z"); - args_.AddInt("dilation_z"); - } - args_.AddInt("grid_size_s"); - - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - const bool buffer_type = src_tensor_type == TensorStorageType::BUFFER || - src_tensor_type == TensorStorageType::IMAGE_BUFFER; - - const bool manual_clamp_x = buffer_type && !conv_params.x_kernel_is_1; - const bool manual_clamp_y = buffer_type && !conv_params.y_kernel_is_1; - const bool manual_clamp_z = - src_tensor_type != TensorStorageType::TEXTURE_3D && - !conv_params.z_kernel_is_1; - - const bool can_read_out_of_x = !buffer_type; - const bool can_read_out_of_y = !buffer_type; - const bool can_read_out_of_z = - src_tensor_type == TensorStorageType::TEXTURE_3D || - src_tensor_type == TensorStorageType::TEXTURE_2D || - src_tensor_type == TensorStorageType::SINGLE_TEXTURE_2D; - - const bool is1x1x1 = conv_params.x_kernel_is_1 && conv_params.y_kernel_is_1 && - conv_params.z_kernel_is_1; - - const bool need_local_mem = - conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS || - conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; - - const int4 block_size = conv_params.block_size; - std::string c = GetCommonDefines(op_def.precision); - if (need_local_mem) { // we use fixed 
workgroup size when use local mem - c += "__attribute__((reqd_work_group_size(" + - std::to_string(work_group_size_.x) + ", " + - std::to_string(work_group_size_.y) + ", " + - std::to_string(work_group_size_.z) + ")))\n"; - } - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += GenerateGlobalCoordinates(block_size, - conv_params.work_group_launch_order); - if (!need_local_mem) { - c += " if (DST_X >= args.dst_tensor.Width() || DST_Y >= " - "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Depth()) " - "return;\n"; - } - if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - c += " int lid = get_local_id(1) * " + std::to_string(work_group_size_.x) + - " + get_local_id(0);\n"; - } - for (int s = 0; s < block_size.w; ++s) { - for (int z = 0; z < block_size.z; ++z) { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - c += " ACCUM_FLT4 r" + std::to_string(s) + std::to_string(z) + - std::to_string(y) + std::to_string(x) + - " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - } - } - } - } - if (!conv_params.x_kernel_is_1) { - for (int x = 0; x < block_size.x; ++x) { - const std::string xc = "(DST_X + " + std::to_string(x) + ")"; - if (stride_correction) { - c += " int xc" + std::to_string(x) + " = " + - GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x", - "args.padding_x") + - ";\n"; - } else { - c += " int xc" + std::to_string(x) + " = " + xc + - " * args.stride_x + args.padding_x;\n"; - } - } - } else if (!can_read_out_of_x) { - for (int x = 0; x < block_size.x; ++x) { - const std::string xc = "(DST_X + " + std::to_string(x) + ")"; - c += " int xc" + std::to_string(x) + " = clamp(" + xc + - ", 0, args.src_tensor.Width() - 1);\n"; - } - } - if (!conv_params.y_kernel_is_1) { - for (int y = 0; y < block_size.y; ++y) { - const std::string yc = "(DST_Y + " + std::to_string(y) + ")"; - c += " int yc" + std::to_string(y) + " = " + yc + - " * args.stride_y + args.padding_y;\n"; - } - } else if (!can_read_out_of_y) { - for (int y = 0; y < block_size.y; ++y) { - const std::string yc = "(DST_Y + " + std::to_string(y) + ")"; - c += " int yc" + std::to_string(y) + " = clamp(" + yc + - ", 0, args.src_tensor.Height() - 1);\n"; - } - } - if (!conv_params.z_kernel_is_1) { - for (int z = 0; z < block_size.z; ++z) { - const std::string zc = "(DST_Z + " + std::to_string(z) + ")"; - c += " int zc" + std::to_string(z) + " = " + zc + - " * args.stride_z + args.padding_z;\n"; - } - } else if (!can_read_out_of_z) { - for (int z = 0; z < block_size.z; ++z) { - const std::string zc = "(DST_Z + " + std::to_string(z) + ")"; - c += " int zc" + std::to_string(z) + " = clamp(" + zc + - ", 0, args.src_tensor.Depth() - 1);\n"; - } - } - if (need_local_mem) { - c += " __local FLT4 weights_cache[" + - std::to_string(block_size.w * 4 * conv_params.src_depth_loop_size) + - "];\n"; - } - if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::GLOBAL_MEM) { - c += " __global FLT4* weights_cache;\n"; - } - std::string kernel_size; - kernel_size += conv_params.x_kernel_is_1 ? "" : " * args.kernel_size_x"; - kernel_size += conv_params.y_kernel_is_1 ? "" : " * args.kernel_size_y"; - kernel_size += conv_params.z_kernel_is_1 ? 
"" : " * args.kernel_size_z"; - if (conv_params.AreWeightsBuffer()) { - c += " __global FLT4* filters_loc = args.weights.GetPtr() + DST_S * 4 * " - "args.src_tensor.Slices()" + - kernel_size + ";\n"; - } - if (buffer_type) { - c += " const int src_layer_offset = args.src_tensor.SliceStride();\n"; - } - if (!is1x1x1) { - c += " int filter_offset = 0;\n"; - } - if (!conv_params.z_kernel_is_1) { - c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; - for (int z = 0; z < block_size.z; ++z) { - const std::string zck = "zck" + std::to_string(z); - c += " int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" + - std::to_string(z) + ";\n"; - if (manual_clamp_z) { - c += " bool mz" + std::to_string(z) + " = " + zck + " >= 0 && " + zck + - " < args.src_tensor.Depth();\n"; - c += " " + zck + " = clamp(" + zck + - ", 0, args.src_tensor.Depth() - 1);\n"; - } - } - } - if (!conv_params.y_kernel_is_1) { - c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; - for (int y = 0; y < block_size.y; ++y) { - const std::string yck = "yck" + std::to_string(y); - c += " int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) + - ";\n"; - if (manual_clamp_y) { - c += " bool my" + std::to_string(y) + " = " + yck + " >= 0 && " + yck + - " < args.src_tensor.Height();\n"; - c += " " + yck + " = clamp(" + yck + - ", 0, args.src_tensor.Height() - 1);\n"; - } - } - } - if (!conv_params.x_kernel_is_1) { - c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; - for (int x = 0; x < block_size.x; ++x) { - const std::string xck = "xck" + std::to_string(x); - c += " int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" + - std::to_string(x) + ";\n"; - if (manual_clamp_x) { - c += " bool mx" + std::to_string(x) + " = " + xck + " >= 0 && " + xck + - " < args.src_tensor.Width();\n"; - c += " " + xck + " = clamp(" + xck + - ", 0, args.src_tensor.Width() - 1);\n"; - } - } - } - - auto get_src_x_coord = [&](int id) { - std::string xs = std::to_string(id); - std::string xc = "xck" + xs; - if (conv_params.x_kernel_is_1) { - if (can_read_out_of_x) { - xc = "DST_X + " + xs; - } else { - xc = "xc" + xs; - } - } - return xc; - }; - auto get_src_y_coord = [&](int id) { - std::string ys = std::to_string(id); - std::string yc = "yck" + ys; - if (conv_params.y_kernel_is_1) { - if (can_read_out_of_y) { - yc = "DST_Y + " + ys; - } else { - yc = "yc" + ys; - } - } - return yc; - }; - auto get_src_z_coord = [&](int id) { - std::string zs = std::to_string(id); - std::string zc = "zck" + zs; - if (conv_params.z_kernel_is_1) { - if (can_read_out_of_z) { - zc = "DST_Z + " + zs; - } else { - zc = "zc" + zs; - } - } - return zc; - }; - - if (buffer_type) { - for (int z = 0; z < block_size.z; ++z) { - const std::string zs = std::to_string(z); - const std::string zc = get_src_z_coord(z); - for (int y = 0; y < block_size.y; ++y) { - const std::string ys = std::to_string(y); - const std::string yc = get_src_y_coord(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xs = std::to_string(x); - const std::string xc = get_src_x_coord(x); - const std::string id = zs + ys + xs; - c += " args.src_tensor.GetAddress(src_a_" + id + ", " + xc + ", " + - yc + ", " + zc + ", 0);\n"; - if (!is1x1x1 && src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - std::string condition; - if (manual_clamp_x) { - if (!condition.empty()) { - condition += " && "; - } - condition += "mx" + xs; - } - if (manual_clamp_y) { - if (!condition.empty()) { - condition += " && "; - } - condition += "my" + ys; - } - if 
(manual_clamp_z) { - if (!condition.empty()) { - condition += " && "; - } - condition += "mz" + zs; - } - c += " src_a_" + id + " = select(-1, src_a_" + id + ", " + - condition + ");\n"; - c += " int dz_" + id + " = select(0, src_layer_offset, " + - condition + ");\n"; - } - } - } - } - } - - auto declare_src = [&]() { - for (int z = 0; z < block_size.z; ++z) { - const std::string zs = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) { - const std::string ys = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xs = std::to_string(x); - const std::string id = zs + ys + xs; - c += " FLT4 src" + id + ";\n"; - } - } - } - }; - - auto read_src = [&]() { - for (int z = 0; z < block_size.z; ++z) { - const std::string zs = std::to_string(z); - const std::string zc = get_src_z_coord(z); - for (int y = 0; y < block_size.y; ++y) { - const std::string ys = std::to_string(y); - const std::string yc = get_src_y_coord(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xs = std::to_string(x); - const std::string xc = get_src_x_coord(x); - std::string multiplier; - multiplier += manual_clamp_x ? " * (FLT)(mx" + xs + ")" : ""; - multiplier += manual_clamp_y ? " * (FLT)(my" + ys + ")" : ""; - multiplier += manual_clamp_z ? " * (FLT)(mz" + zs + ")" : ""; - const std::string id = zs + ys + xs; - if (buffer_type) { - if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - multiplier = ""; - } - c += " src" + id + " = args.src_tensor.Read(src_a_" + id + ")" + - multiplier + ";\n"; - if (!is1x1x1 && - src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - c += " src_a_" + id + " += dz_" + id + ";\n"; - } else { - c += " src_a_" + id + " += src_layer_offset;\n"; - } - } else { - c += " src" + id + " = args.src_tensor.Read(" + xc + ", " + yc + - ", " + zc + ", s)" + multiplier + ";\n"; - } - } - } - } - }; - c += " int s = 0;\n"; - declare_src(); - c += " do {\n"; - const int total_work_items = - work_group_size_.x * work_group_size_.y * work_group_size_.z; - if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { - c += - GenerateAsyncUpload("weights_cache", "filters_loc", - /*global_offset_name*/ "", - block_size.w * 4 * conv_params.src_depth_loop_size); - } else if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += GenerateUploadByThreads( - "weights_cache", "filters_loc", - /*global_offset_name*/ "", "lid", total_work_items, - block_size.w * 4 * conv_params.src_depth_loop_size); - } else if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::GLOBAL_MEM) { - c += " weights_cache = filters_loc;\n"; - } else { // TEXTURES_MEM - for (int dst_s = 0; dst_s < block_size.w; ++dst_s) { - const std::string f_y = is1x1x1 ? 
"s" : "filter_offset"; - c += absl::Substitute( - R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1); - FLT4 f$3 = args.weights1.Read(DST_S + $0, $1); - FLT4 f$4 = args.weights2.Read(DST_S + $0, $1); - FLT4 f$5 = args.weights3.Read(DST_S + $0, $1); -)", - dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2, - dst_s * 4 + 3); - } - if (!is1x1x1) { - c += " filter_offset++;\n"; - } - } - read_src(); - c += " s += 1;\n"; - if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - } - c += GenerateConv(op_def.precision, block_size, 0, - conv_params.AreWeightsBuffer()); - for (int i = 1; i < conv_params.src_depth_loop_size; ++i) { - read_src(); - c += GenerateConv(op_def.precision, block_size, i * block_size.w * 4, - conv_params.AreWeightsBuffer()); - c += " s += 1;\n"; - } - if (conv_params.AreWeightsBuffer()) { - c += " filters_loc += " + - std::to_string(block_size.w * 4 * conv_params.src_depth_loop_size) + - ";\n"; - } - c += " } while (s < args.src_tensor.Slices());\n"; - if (!conv_params.z_kernel_is_1) { - c += " }\n"; - } - if (!conv_params.y_kernel_is_1) { - c += " }\n"; - } - if (!conv_params.x_kernel_is_1) { - c += " }\n"; - } - if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { - c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S", - block_size.w); - } else if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += - GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", - "DST_S", "lid", total_work_items, block_size.w); - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - } else if (conv_params.weights_upload_type == - Conv3D::WeightsUploadType::GLOBAL_MEM) { - c += " weights_cache = args.biases.GetPtr() + DST_S;\n"; - } - if (need_local_mem) { - c += " if (DST_X >= args.dst_tensor.Width() || DST_Y >= " - "args.dst_tensor.Height() || DST_Z >= args.dst_tensor.Depth()) " - "return;\n"; - } - for (int s = 0; s < block_size.w; ++s) { - const std::string dsts = - "DST_S" + (s == 0 ? "" : " + " + std::to_string(s)); - c += " if (" + dsts + " >= args.dst_tensor.Slices()) return;\n"; - for (int z = 0; z < block_size.z; ++z) { - const std::string dstz = - "DST_Z" + (z == 0 ? "" : " + " + std::to_string(z)); - for (int y = 0; y < block_size.y; ++y) { - const std::string dsty = - "DST_Y" + (y == 0 ? "" : " + " + std::to_string(y)); - for (int x = 0; x < block_size.x; ++x) { - const std::string dstx = - "DST_X" + (x == 0 ? 
"" : " + " + std::to_string(x)); - const std::string r_id = std::to_string(s) + std::to_string(z) + - std::to_string(y) + std::to_string(x); - c += " if (" + dstx + " < args.dst_tensor.Width() && " + dsty + - " < args.dst_tensor.Height() && " + dstz + - " < args.dst_tensor.Depth()) {\n"; - if (conv_params.AreWeightsBuffer()) { - c += " FLT4 res = TO_FLT4(r" + r_id + ") + weights_cache[" + - std::to_string(s) + "];\n"; - } else { - c += " FLT4 res = TO_FLT4(r" + r_id + ") + args.biases.Read(" + - dsts + ");\n"; - } - c += " args.dst_tensor.Write(res, " + dstx + ", " + dsty + ", " + - dstz + ", " + dsts + ");\n"; - c += " }\n"; - } - } - } - } - c += "}\n"; - return c; -} - -Conv3D::ConvParams Conv3D::GuessBestParams(const DeviceInfo& device_info, - const OperationDef& definition, - int src_slices, int dst_slices, - bool x_kernel_is_1, - bool y_kernel_is_1, - bool z_kernel_is_1) { - ConvParams conv_params; - conv_params.x_kernel_is_1 = x_kernel_is_1; - conv_params.y_kernel_is_1 = y_kernel_is_1; - conv_params.z_kernel_is_1 = z_kernel_is_1; - if (device_info.IsNvidia()) { - conv_params.block_size = int4(1, 1, 1, 4); - work_group_size_ = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; - if (dst_slices % 4 == 0 || dst_slices >= 8) { - conv_params.block_size.w = 4; - } else if (dst_slices % 2 == 0 || dst_slices >= 4) { - conv_params.block_size.w = 2; - } else { - conv_params.block_size.w = dst_slices; - } - if (src_slices % 2 == 0) { - conv_params.src_depth_loop_size = 2; - } - if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) { - conv_params.src_depth_loop_size = 4; - } - } else if (device_info.IsPowerVR()) { - conv_params.block_size = int4(1, 1, 1, 4); - work_group_size_ = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = - WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; - if (dst_slices % 8 == 0 || dst_slices >= 32) { - conv_params.block_size.w = 8; - } else if (dst_slices % 4 == 0 || dst_slices >= 8) { - conv_params.block_size.w = 4; - } else if (dst_slices % 2 == 0 || dst_slices >= 4) { - conv_params.block_size.w = 2; - } else { - conv_params.block_size.w = dst_slices; - } - if (definition.precision == CalculationsPrecision::F16) { - conv_params.block_size.w = std::min(4, conv_params.block_size.w); - if (src_slices % 2 == 0) { - conv_params.src_depth_loop_size = 2; - } - if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) { - conv_params.src_depth_loop_size = 4; - } - if (conv_params.block_size.w == 1) { - if (src_slices % 2 == 0) { - conv_params.src_depth_loop_size = 2; - } - if (src_slices % 4 == 0) { - conv_params.src_depth_loop_size = 4; - } - if (src_slices <= 8) { - conv_params.src_depth_loop_size = src_slices; - } - } - conv_params.block_size.x = 2; - work_group_size_ = int3(4, 8, 1); - } - } else if (device_info.IsAdreno()) { - conv_params.block_size = int4(2, 2, 1, 2); - work_group_size_ = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM; - } else if (device_info.IsMali()) { - conv_params.block_size = int4(1, 1, 1, 4); - work_group_size_ = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; - if 
(dst_slices % 4 == 0 || dst_slices >= 8) { - conv_params.block_size.w = 4; - } else if (dst_slices % 2 == 0 || dst_slices >= 4) { - conv_params.block_size.w = 2; - } else { - conv_params.block_size.w = dst_slices; - } - if (src_slices % 2 == 0) { - conv_params.src_depth_loop_size = 2; - } - if (src_slices % 4 == 0 && conv_params.block_size.w <= 2) { - conv_params.src_depth_loop_size = 4; - } - } else { - conv_params.block_size = int4(2, 2, 1, 2); - work_group_size_ = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); - conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM; - } - - return conv_params; -} - -Conv3D::ConvParams Conv3D::GuessBestParams( - const DeviceInfo& device_info, const OperationDef& definition, - const Convolution3DAttributes& attr) { - const int dst_slices = DivideRoundUp(attr.weights.shape.o, 4); - const int src_slices = DivideRoundUp(attr.weights.shape.i, 4); - const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 && - attr.dilations.w == 1 && - attr.padding.prepended.w == 0 && - attr.padding.appended.w == 0; - const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 && - attr.dilations.h == 1 && - attr.padding.prepended.h == 0 && - attr.padding.appended.h == 0; - const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 && - attr.dilations.d == 1 && - attr.padding.prepended.d == 0 && - attr.padding.appended.d == 0; - return GuessBestParams(device_info, definition, src_slices, dst_slices, - x_kernel_is_1, y_kernel_is_1, z_kernel_is_1); -} - -Conv3D CreateConv3D(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution3DAttributes& attr) { - Conv3D result(definition, attr, device_info); - result.UploadData(attr.weights, attr.bias); - return result; -} - -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h deleted file mode 100644 index d4a86b0ca5e..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_3d.h +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
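The vendor-specific branches of GuessBestParams above all derive their tile sizes from the same slice arithmetic. A minimal standalone sketch of that arithmetic follows, assuming nothing beyond what the deleted code shows; DivideRoundUp and AlignByN mirror the TFLite GPU utilities of the same names, and PickDstSliceBlock is a hypothetical name for the repeated block_size.w heuristic, not a function in the patch.

// Illustration only: standalone copies of the slice math used by GuessBestParams.
int DivideRoundUp(int n, int d) { return (n + d - 1) / d; }                // ceil(n / d)
int AlignByN(int n, int align) { return DivideRoundUp(n, align) * align; }  // round up to a multiple

// The Nvidia/Mali branches pick how many output slices one work-item produces:
// prefer 4 when the slice count divides evenly (or is large enough to tolerate
// padding), otherwise fall back to 2, otherwise process exactly dst_slices.
int PickDstSliceBlock(int dst_slices) {
  if (dst_slices % 4 == 0 || dst_slices >= 8) return 4;
  if (dst_slices % 2 == 0 || dst_slices >= 4) return 2;
  return dst_slices;
}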
-==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_3D_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_3D_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/cl/buffer.h" -#include "tensorflow/lite/delegates/gpu/cl/cl_device.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" -#include "tensorflow/lite/delegates/gpu/cl/util.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" - -namespace tflite { -namespace gpu { -namespace cl { - -class Conv3D : public GPUOperation { - public: - Conv3D() = default; - void GetPossibleKernelWorkGroups( - TuningType tuning_type, const DeviceInfo& device_info, - const KernelInfo& kernel_info, - std::vector* work_groups) const override; - absl::Status BindArguments() override; - int3 GetGridSize() const override; - - // Move only - Conv3D(Conv3D&& operation); - Conv3D& operator=(Conv3D&& operation); - Conv3D(const Conv3D&) = delete; - Conv3D& operator=(const Conv3D&) = delete; - - private: - enum class WeightsUploadType { - LOCAL_MEM_ASYNC_SUBGROUP, // we use it for PowerVR with workgroup size = 32 - LOCAL_MEM_BY_THREADS, - GLOBAL_MEM, - TEXTURES_MEM, - }; - - struct ConvParams { - int4 block_size; // WHDS - int3 work_group_launch_order; - int src_depth_loop_size; - WeightsUploadType weights_upload_type; - bool AreWeightsBuffer() const { - return weights_upload_type != WeightsUploadType::TEXTURES_MEM; - } - bool x_kernel_is_1; - bool y_kernel_is_1; - bool z_kernel_is_1; - }; - - Conv3D(const OperationDef& definition, const Convolution3DAttributes& attr, - const DeviceInfo& device_info); - - template - void UploadData(const tflite::gpu::Tensor& weights, - const tflite::gpu::Tensor& biases); - template - void UploadWeights(const tflite::gpu::Tensor& weights); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst); - - friend Conv3D CreateConv3D(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution3DAttributes& attr); - - friend std::string GenerateConv3D(const OperationDef& op_def, - bool stride_correction, - const ConvParams& conv_params, - Arguments* args); - - ConvParams GuessBestParams(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution3DAttributes& attr); - - ConvParams GuessBestParams(const DeviceInfo& device_info, - const OperationDef& definition, int src_slices, - int dst_slices, bool x_kernel_is_1, - bool y_kernel_is_1, bool z_kernel_is_1); - - std::string GenerateConv3D(const OperationDef& op_def, bool stride_correction, - const Conv3D::ConvParams& conv_params); - - int3 stride_; - int3 padding_; - int3 kernel_size_; - int3 dilation_; - ConvParams conv_params_; -}; - -template -void Conv3D::UploadData(const tflite::gpu::Tensor& weights, - const tflite::gpu::Tensor& biases) { - UploadWeights(weights); - TensorLinearDescriptor desc; - desc.storage_type = conv_params_.AreWeightsBuffer() - ? 
LinearStorageType::BUFFER - : LinearStorageType::TEXTURE_2D; - desc.element_type = definition_.GetDataType(); - desc.UploadLinearData(biases); - args_.AddObject("biases", - absl::make_unique(std::move(desc))); -} - -template -void Conv3D::UploadWeights(const tflite::gpu::Tensor& weights) { - const int block_size = conv_params_.block_size.w; - const int dst_slices = - AlignByN(DivideRoundUp(weights.shape.o, 4), block_size); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = kernel_size_.x; - const int kernel_y = kernel_size_.y; - const int kernel_z = kernel_size_.z; - const int texture_width = dst_slices; - const int texture_height = src_slices * kernel_x * kernel_y * kernel_z; - - const int elements_count = - kernel_x * kernel_y * kernel_z * src_slices * dst_slices * 4; - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - - const int float4_size = f32_weights ? 16 : 8; - - std::vector data(float4_size * elements_count); - - if (f32_weights) { - float4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count)); - } else { - half4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count)); - } - - if (conv_params_.AreWeightsBuffer()) { - BufferDescriptor desc; - desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - args_.AddObject("weights", - absl::make_unique(std::move(desc))); - } else { - int sub_size = float4_size * elements_count / 4; - Texture2DDescriptor desc0; - desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc0.size = int2(texture_width, texture_height); - desc0.data.resize(sub_size); - memcpy(desc0.data.data(), data.data(), sub_size); - args_.AddObject("weights0", - absl::make_unique(std::move(desc0))); - - Texture2DDescriptor desc1; - desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc1.size = int2(texture_width, texture_height); - desc1.data.resize(sub_size); - memcpy(desc1.data.data(), data.data() + sub_size, sub_size); - args_.AddObject("weights1", - absl::make_unique(std::move(desc1))); - - Texture2DDescriptor desc2; - desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc2.size = int2(texture_width, texture_height); - desc2.data.resize(sub_size); - memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size); - args_.AddObject("weights2", - absl::make_unique(std::move(desc2))); - - Texture2DDescriptor desc3; - desc3.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc3.size = int2(texture_width, texture_height); - desc3.data.resize(sub_size); - memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size); - args_.AddObject("weights3", - absl::make_unique(std::move(desc3))); - } -} - -template -void Conv3D::RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst) { - const int block_size = conv_params_.block_size.w; - const int dst_slices = - AlignByN(DivideRoundUp(weights.shape.o, 4), block_size); - const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = kernel_size_.x; - const int kernel_y = kernel_size_.y; - const int kernel_z = kernel_size_.z; - const int texture_width = dst_slices; - const int texture_height = src_slices * kernel_x * kernel_y * kernel_z; - - int counter = 0; - for (int d = 0; d < dst_slices / block_size; ++d) { - for (int z = 0; z < kernel_z; ++z) { - for (int y = 0; y < kernel_y; ++y) { - for (int x = 0; x < kernel_x; ++x) { - for (int s = 0; s < src_slices; ++s) { - for (int sub_d = 0; sub_d < block_size; ++sub_d) { - T filters[4]; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - const int s_ch = s * 4 + j; - const int d_ch = (d * block_size + sub_d) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) { - const int f_index = - weights.shape.LinearIndex({d_ch, y, x, z, s_ch}); - filters[j][i] = weights.data[f_index]; - } else { - filters[j][i] = 0.0f; - } - } - } - if (conv_params_.AreWeightsBuffer()) { - dst[counter++] = filters[0]; - dst[counter++] = filters[1]; - dst[counter++] = filters[2]; - dst[counter++] = filters[3]; - } else { - int x_coord = d * block_size + sub_d; - int y_coord = - ((z * kernel_y + y) * kernel_x + x) * src_slices + s; - int offset = y_coord * dst_slices + x_coord; - dst[offset + texture_width * texture_height * 0] = filters[0]; - dst[offset + texture_width * texture_height * 1] = filters[1]; - dst[offset + texture_width * texture_height * 2] = filters[2]; - dst[offset + texture_width * texture_height * 3] = filters[3]; - } - } - } - } - } - } - } -} - -Conv3D CreateConv3D(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution3DAttributes& attr); - -} // namespace cl -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_3D_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc index dc54286c0fc..c3663634177 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.cc @@ -45,84 +45,29 @@ int GetOptimalMaxConstantSize(const DeviceInfo& info) { return GetAdrenoOptimalMaxConstantSize(info.adreno_info.gpu_version); } } -} // namespace -ConvConstants::ConvConstants(const OperationDef& definition, - const Convolution2DAttributes& attr, - const DeviceInfo& device_info) - : GPUOperation(definition), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h), - stride_(attr.strides.w, attr.strides.h), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h), - dilation_(attr.dilations.w, attr.dilations.h), - src_channels_(attr.weights.shape.i), - dst_channels_(attr.weights.shape.o) { - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - code_ = - GenerateConvolutionConstantCode(definition_, kernel_size_, src_channels_, - dst_channels_, stride_correction); - if (definition_.precision == CalculationsPrecision::F16 && - 
device_info.IsAdreno3xx()) { - compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); - } - if (definition_.precision != CalculationsPrecision::F32 && - device_info.IsPowerVR()) { - // BUG, some PowerVRs (GE8320) produce incorrect result without it - compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE); - } -} - -ConvConstants::ConvConstants(ConvConstants&& kernel) - : GPUOperation(std::move(kernel)), - kernel_size_(kernel.kernel_size_), - stride_(kernel.stride_), - padding_(kernel.padding_), - dilation_(kernel.dilation_), - src_channels_(kernel.src_channels_), - dst_channels_(kernel.dst_channels_) {} - -ConvConstants& ConvConstants::operator=(ConvConstants&& kernel) { - if (this != &kernel) { - std::swap(kernel_size_, kernel.kernel_size_); - std::swap(stride_, kernel.stride_); - std::swap(padding_, kernel.padding_); - std::swap(dilation_, kernel.dilation_); - std::swap(src_channels_, kernel.src_channels_); - std::swap(dst_channels_, kernel.dst_channels_); - GPUOperation::operator=(std::move(kernel)); - } - return *this; -} - -std::string ConvConstants::GenerateConvolutionConstantCode( - const OperationDef& op_def, const int2& kernel_size, int src_channels, - int dst_channels, bool stride_correction) { +std::string GenerateConvolutionConstantCode(const OperationDef& op_def, + const OHWI& weights_shape, + bool stride_correction, + GPUOperation* op) { auto src_desc = op_def.src_tensors[0]; src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); if (op_def.IsBatchSupported()) { src_desc.SetStateVar("BatchedWidth", "true"); } - AddSrcTensor("src_tensor", src_desc); + op->AddSrcTensor("src_tensor", src_desc); auto dst_desc = op_def.dst_tensors[0]; if (op_def.IsBatchSupported()) { dst_desc.SetStateVar("BatchedWidth", "true"); } - AddDstTensor("dst_tensor", dst_desc); - - args_.AddInt("stride_x"); - args_.AddInt("stride_y"); - args_.AddInt("padding_x"); - args_.AddInt("padding_y"); - args_.AddInt("dilation_x"); - args_.AddInt("dilation_y"); + op->AddDstTensor("dst_tensor", dst_desc); std::string c = GetCommonDefines(op_def.precision); - const int out_z = DivideRoundUp(dst_channels, 4); + const int out_z = DivideRoundUp(weights_shape.o, 4); const std::string kOutZ = std::to_string(out_z); - const int src_depth = DivideRoundUp(src_channels, 4); + const int src_depth = DivideRoundUp(weights_shape.i, 4); const auto src_tensor_type = op_def.src_tensors[0].storage_type; const bool manual_clamp = src_tensor_type == TensorStorageType::BUFFER || @@ -176,11 +121,16 @@ std::string ConvConstants::GenerateConvolutionConstantCode( "return;\n"; if (stride_correction) { c += " int start_x = " + - GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x", - "args.padding_x") + + GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int start_x = X * args.stride_x + args.padding_x;\n"; + if (op_def.IsBatchSupported()) { + c += " int start_x = X * args.stride_x + args.padding_x * " + "args.src_tensor.Batch();\n"; + } else { + c += " int start_x = X * args.stride_x + args.padding_x;\n"; + } } c += " int start_y = Y * args.stride_y + args.padding_y;\n"; c += " ACCUM_FLT4 r[" + kOutZ + "];\n"; @@ -189,22 +139,25 @@ std::string ConvConstants::GenerateConvolutionConstantCode( c += " }\n"; int filters_counter = 0; for (int s = 0; s < src_depth; ++s) { - const int ch_count = std::min(4, src_channels - s * 4); + const int ch_count = std::min(4, weights_shape.i - s * 4); const std::string s_conv = "CONV" + 
std::to_string(ch_count); const std::string s_count = ch_count == 1 ? "" : std::to_string(ch_count); const std::string s_type = absl::StrCat("FLT", s_count); const std::string s_postfix = postfixes[ch_count - 1]; - for (int ky = 0; ky < kernel_size.y; ++ky) { + const std::string dilation_x = + op_def.IsBatchSupported() ? "args.dilation_x * args.src_tensor.Batch()" + : "args.dilation_x"; + for (int ky = 0; ky < weights_shape.h; ++ky) { std::string s_y = absl::StrCat("(start_y + ", ky, " * args.dilation_y)"); if (manual_clamp) { c += " {\n"; c += " bool y_out = " + s_y + " < 0 || " + s_y + " >= args.src_tensor.Height();\n"; } - for (int kx = 0; kx < kernel_size.x; ++kx) { + for (int kx = 0; kx < weights_shape.w; ++kx) { c += " {\n"; std::string s_x = - absl::StrCat("(start_x + ", kx, " * args.dilation_x)"); + absl::StrCat("(start_x + ", kx, " * " + dilation_x + ")"); if (manual_clamp) { c += " bool x_out = " + s_x + "< 0 || " + s_x + ">= args.src_tensor.Width();\n"; @@ -240,20 +193,7 @@ std::string ConvConstants::GenerateConvolutionConstantCode( return c; } -absl::Status ConvConstants::BindArguments() { - RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); - RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); - RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); - RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); - return args_.SetInt("dilation_y", dilation_.y); -} - -int3 ConvConstants::GetGridSize() const { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); - return int3(grid_x, grid_y, 1); -} +} // namespace bool IsConvConstantsSupported(const DeviceInfo& device_info, const OperationDef& definition, @@ -277,20 +217,41 @@ bool IsConvConstantsSupported(const DeviceInfo& device_info, return filters_buffer_size <= kConstantMaxSize && flt4_registers <= 8; } -ConvConstants CreateConvConstants(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr) { - ConvConstants result(definition, attr, device_info); - result.UploadWeights(attr.weights); +GPUOperation CreateConvConstants(const DeviceInfo& device_info, + const OperationDef& definition, + const Convolution2DAttributes& attr) { + GPUOperation op(definition); + UploadWeightsForConvConstants(attr.weights, definition.precision, &op); + op.args_.AddInt("stride_x", attr.strides.w); + op.args_.AddInt("stride_y", attr.strides.h); + op.args_.AddInt("padding_x", -attr.padding.prepended.w); + op.args_.AddInt("padding_y", -attr.padding.prepended.h); + op.args_.AddInt("dilation_x", attr.dilations.w); + op.args_.AddInt("dilation_y", attr.dilations.h); + op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1; + + const bool stride_correction = + definition.IsBatchSupported() && attr.strides.w != 1; + op.code_ = GenerateConvolutionConstantCode(definition, attr.weights.shape, + stride_correction, &op); + if (definition.precision == CalculationsPrecision::F16 && + device_info.IsAdreno3xx()) { + op.compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); + } + if (definition.precision != CalculationsPrecision::F32 && + device_info.IsPowerVR()) { + // BUG, some PowerVRs (GE8320) produce incorrect result without it + op.compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE); + } TensorLinearDescriptor desc; desc.storage_type = LinearStorageType::BUFFER; desc.element_type = definition.GetDataType(); desc.memory_type = MemoryType::CONSTANT; 
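When BatchedWidth is set, the generated code above folds the batch dimension into X, which is why padding_x and dilation_x get scaled by the batch size. A small host-side sketch of that index math, under the assumption of a fused width*batch axis; SrcXForBatchedWidth is an illustrative helper, not part of the patch.

int SrcXForBatchedWidth(int dst_x, int stride_x, int padding_x, int dilation_x,
                        int kx, int batch) {
  // One logical pixel step spans `batch` elements on the fused axis, so the
  // padding and dilation offsets are pre-multiplied by the batch size,
  // exactly as the generated OpenCL does.
  const int start_x = dst_x * stride_x + padding_x * batch;
  return start_x + kx * dilation_x * batch;
}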
desc.UploadLinearData(attr.bias); - result.args_.AddObject( + op.args_.AddObject( "biases", absl::make_unique(std::move(desc))); - return result; + return op; } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h index 5be433588ce..c341ecb5753 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h @@ -32,78 +32,8 @@ namespace tflite { namespace gpu { namespace cl { -class ConvConstants : public GPUOperation { - public: - ConvConstants() = default; - absl::Status BindArguments() override; - int3 GetGridSize() const override; - - // Move only - ConvConstants(ConvConstants&& kernel); - ConvConstants& operator=(ConvConstants&& kernel); - ConvConstants(const ConvConstants&) = delete; - ConvConstants& operator=(const ConvConstants&) = delete; - - private: - friend ConvConstants CreateConvConstants(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr); - ConvConstants(const OperationDef& definition, - const Convolution2DAttributes& attr, - const DeviceInfo& device_info); - - template - void UploadWeights(const tflite::gpu::Tensor& weights); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst); - - std::string GenerateConvolutionConstantCode(const OperationDef& op_def, - const int2& kernel_size, - int src_channels, - int dst_channels, - bool stride_correction); - - int2 kernel_size_; - int2 stride_; - int2 padding_; - int2 dilation_; - int src_channels_; - int dst_channels_; -}; - -template -void ConvConstants::UploadWeights(const tflite::gpu::Tensor& weights) { - const int dst_depth = DivideRoundUp(weights.shape.o, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - const int float_size = f32_weights ? 4 : 2; - const int float_count = src_channels_ * dst_depth * 4 * kernel_x * kernel_y; - - BufferDescriptor desc; - desc.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.memory_type = MemoryType::CONSTANT; - desc.size = float_size * float_count; - desc.data.resize(desc.size); - - if (f32_weights) { - float4* ptr = reinterpret_cast(desc.data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4)); - } else { - half4* ptr = reinterpret_cast(desc.data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, float_count / 4)); - } - - args_.AddObject("weigths", - absl::make_unique(std::move(desc))); -} - template -void ConvConstants::RearrangeWeightsData( +void RearrangeWeightsForConvConstants( const tflite::gpu::Tensor& weights, absl::Span dst) { const int dst_depth = DivideRoundUp(weights.shape.o, 4); const int src_depth = DivideRoundUp(weights.shape.i, 4); @@ -115,7 +45,7 @@ void ConvConstants::RearrangeWeightsData( for (int y = 0; y < kernel_y; ++y) { for (int x = 0; x < kernel_x; ++x) { for (int d = 0; d < dst_depth; ++d) { - const int channels_count = std::min(4, src_channels_ - s * 4); + const int channels_count = std::min(4, weights.shape.i - s * 4); T filters[4]; for (int i = 0; i < 4; ++i) { for (int j = 0; j < channels_count; ++j) { @@ -145,13 +75,46 @@ void ConvConstants::RearrangeWeightsData( } } +template +void UploadWeightsForConvConstants(const tflite::gpu::Tensor& weights, + CalculationsPrecision precision, + GPUOperation* op) { + const int dst_depth = DivideRoundUp(weights.shape.o, 4); + const int kernel_x = weights.shape.w; + const int kernel_y = weights.shape.h; + + const bool f32_weights = precision == CalculationsPrecision::F32; + const int float_size = f32_weights ? 4 : 2; + const int float_count = weights.shape.i * dst_depth * 4 * kernel_x * kernel_y; + + BufferDescriptor desc; + desc.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + desc.memory_type = MemoryType::CONSTANT; + desc.size = float_size * float_count; + desc.data.resize(desc.size); + + if (f32_weights) { + float4* ptr = reinterpret_cast(desc.data.data()); + RearrangeWeightsForConvConstants(weights, + absl::MakeSpan(ptr, float_count / 4)); + } else { + half4* ptr = reinterpret_cast(desc.data.data()); + RearrangeWeightsForConvConstants(weights, + absl::MakeSpan(ptr, float_count / 4)); + } + + op->args_.AddObject("weigths", + absl::make_unique(std::move(desc))); +} + bool IsConvConstantsSupported(const DeviceInfo& device_info, const OperationDef& definition, const Convolution2DAttributes& attr); -ConvConstants CreateConvConstants(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr); +GPUOperation CreateConvConstants(const DeviceInfo& device_info, + const OperationDef& definition, + const Convolution2DAttributes& attr); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants_test.cc index 4aa60b8d334..17821e14e0a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_constants_test.cc @@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, ConvConstantsSimpleWeights) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ConvConstants operation = + GPUOperation operation = CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 2, 1), &dst_tensor)); @@ -90,7 +90,7 @@ TEST_F(OpenCLOperationTest, ConvConstants) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ConvConstants operation = + GPUOperation operation = CreateConvConstants(creation_context_.GetDeviceInfo(), op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 2, 2), &dst_tensor)); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc index bd4f6d70994..8952504bda0 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.cc @@ -19,11 +19,13 @@ limitations under the License. 
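For context on the conv_constants refactor above: the weights stay eligible for __constant memory only while their packed size fits the device limit that IsConvConstantsSupported checks. A rough sketch of that footprint calculation, assuming the same packing as UploadWeightsForConvConstants; FitsInConstantMemory and its constant_max_size parameter are illustrative placeholders, not part of the patch.

bool FitsInConstantMemory(int src_channels, int dst_channels, int kernel_w,
                          int kernel_h, bool f32_weights, int constant_max_size) {
  // Weights are packed as 4-wide groups per destination slice and kernel tap,
  // in fp32 (4 bytes) or fp16 (2 bytes), mirroring float_count above.
  const int dst_depth = (dst_channels + 3) / 4;
  const int float_size = f32_weights ? 4 : 2;
  const int float_count = src_channels * dst_depth * 4 * kernel_w * kernel_h;
  return float_size * float_count <= constant_max_size;
}

The real IsConvConstantsSupported additionally bounds the number of FLT4 registers the generated kernel would need.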
#include #include +#include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { @@ -70,57 +72,77 @@ std::string GenerateAsyncUpload(const std::string& local_ptr_name, return c; } -std::string GenerateBlockCoords(const int3& block_size, +std::string GenerateBlockCoords(const int4& block_size, const int3& work_group_launch_order, - bool linear_hw) { + bool linear_spatial, bool need_depth) { std::string c; int3 launch_remap; launch_remap[work_group_launch_order.x] = 0; launch_remap[work_group_launch_order.y] = 1; launch_remap[work_group_launch_order.z] = 2; - if (linear_hw) { + if (linear_spatial) { if (work_group_launch_order[0] == 0) { - c += " int linear_hw = get_global_id(0);\n"; + c += " int linear_spatial = get_global_id(0);\n"; } else { - c += " int linear_hw = get_group_id(" + std::to_string(launch_remap[0]) + + c += " int linear_spatial = get_group_id(" + + std::to_string(launch_remap[0]) + ") * get_local_size(0) + get_local_id(0);\n"; } - c += " int Y = (linear_hw / args.task_size_x) * " + - std::to_string(block_size.y) + ";\n"; - c += " int X = (linear_hw % args.task_size_x) * " + - std::to_string(block_size.x) + ";\n"; - if (work_group_launch_order[1] == 1) { - c += " int Z = get_global_id(1) * " + std::to_string(block_size.z) + - ";\n"; - } else { - c += " int Z = (get_group_id(" + std::to_string(launch_remap[1]) + - ") * get_local_size(1) + get_local_id(1)) * " + + if (need_depth) { + c += " int DST_X = (linear_spatial % args.task_size_x) * " + + std::to_string(block_size.x) + ";\n"; + c += " linear_spatial = linear_spatial / args.task_size_x;\n"; + c += " int DST_Y = (linear_spatial % args.task_size_y) * " + + std::to_string(block_size.y) + ";\n"; + c += " int DST_Z = (linear_spatial / args.task_size_y) * " + std::to_string(block_size.z) + ";\n"; - } - } else { - if (work_group_launch_order[0] == 0) { - c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + - ";\n"; } else { - c += " int X = (get_group_id(" + std::to_string(launch_remap[0]) + - ") * get_local_size(0) + get_local_id(0)) * " + + c += " int DST_Y = (linear_spatial / args.task_size_x) * " + + std::to_string(block_size.y) + ";\n"; + c += " int DST_X = (linear_spatial % args.task_size_x) * " + std::to_string(block_size.x) + ";\n"; } if (work_group_launch_order[1] == 1) { - c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + + c += " int DST_S = get_global_id(1) * " + std::to_string(block_size.w) + ";\n"; } else { - c += " int Y = (get_group_id(" + std::to_string(launch_remap[1]) + + c += " int DST_S = (get_group_id(" + std::to_string(launch_remap[1]) + ") * get_local_size(1) + get_local_id(1)) * " + + std::to_string(block_size.w) + ";\n"; + } + } else { + if (work_group_launch_order[0] == 0) { + c += " int DST_X = get_global_id(0) * " + std::to_string(block_size.x) + + ";\n"; + } else { + c += " int DST_X = (get_group_id(" + std::to_string(launch_remap[0]) + + ") * get_local_size(0) + get_local_id(0)) * " + + std::to_string(block_size.x) + ";\n"; + } + std::string global_id_1; + if (work_group_launch_order[1] == 1) { + global_id_1 = "get_global_id(1)"; + } else { + global_id_1 = 
"(get_group_id(" + std::to_string(launch_remap[1]) + + ") * get_local_size(1) + get_local_id(1))"; + } + if (need_depth) { + c += " int linear_id_1 = " + global_id_1 + ";\n"; + c += " int DST_Z = (linear_id_1 / args.task_size_y) * " + + std::to_string(block_size.z) + ";\n"; + c += " int DST_Y = (linear_id_1 % args.task_size_y) * " + + std::to_string(block_size.y) + ";\n"; + } else { + c += " int DST_Y = " + global_id_1 + " * " + std::to_string(block_size.y) + ";\n"; } if (work_group_launch_order[2] == 2) { - c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + + c += " int DST_S = get_global_id(2) * " + std::to_string(block_size.w) + ";\n"; } else { - c += " int Z = (get_group_id(" + std::to_string(launch_remap[2]) + + c += " int DST_S = (get_group_id(" + std::to_string(launch_remap[2]) + ") * get_local_size(2) + get_local_id(2)) * " + - std::to_string(block_size.z) + ";\n"; + std::to_string(block_size.w) + ";\n"; } } @@ -132,10 +154,10 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition, const Convolution2DAttributes& attr, const DeviceInfo& device_info, const BHWC* dst_shape) : GPUOperation(definition), - stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w, - -attr.padding.prepended.h), - kernel_dilation_(attr.weights.shape.w, attr.weights.shape.h, - attr.dilations.w, attr.dilations.h), + stride_(attr.strides.w, attr.strides.h, 1, 1), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), + kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 1, 1), + dilation_(attr.dilations.w, attr.dilations.h, 1, 1), conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {} ConvPowerVR::ConvPowerVR(const OperationDef& definition, @@ -143,10 +165,10 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition, const BHWC& weights_shape, const DeviceInfo& device_info, const BHWC* dst_shape) : GPUOperation(definition), - stride_padding_(attr.strides.w, attr.strides.h, -attr.padding.prepended.w, - -attr.padding.prepended.h), - kernel_dilation_(weights_shape.w, weights_shape.h, attr.dilations.w, - attr.dilations.h), + stride_(attr.strides.w, attr.strides.h, 1, 1), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), + kernel_size_(weights_shape.w, weights_shape.h, 1, 1), + dilation_(attr.dilations.w, attr.dilations.h, 1, 1), conv_params_(GuessBestParams(device_info, definition, attr, weights_shape, dst_shape)) {} @@ -154,25 +176,45 @@ ConvPowerVR::ConvPowerVR(const OperationDef& definition, const FullyConnectedAttributes& attr, const DeviceInfo& device_info, const BHWC* dst_shape) : GPUOperation(definition), - stride_padding_(1, 1, 0, 0), - kernel_dilation_(1, 1, 1, 1), + stride_(1, 1, 1, 1), + padding_(0, 0, 0, 0), + kernel_size_(1, 1, 1, 1), + dilation_(1, 1, 1, 1), conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {} ConvPowerVR::ConvPowerVR(const OperationDef& definition) : GPUOperation(definition), - stride_padding_(1, 1, 0, 0), - kernel_dilation_(1, 1, 1, 1) {} + stride_(1, 1, 1, 1), + padding_(0, 0, 0, 0), + kernel_size_(1, 1, 1, 1), + dilation_(1, 1, 1, 1) {} ConvPowerVR::ConvPowerVR(ConvPowerVR&& operation) : GPUOperation(std::move(operation)), - stride_padding_(operation.stride_padding_), - kernel_dilation_(operation.kernel_dilation_), + stride_(operation.stride_), + padding_(operation.padding_), + kernel_size_(operation.kernel_size_), + dilation_(operation.dilation_), conv_params_(operation.conv_params_) {} +ConvPowerVR::ConvPowerVR(const OperationDef& definition, + const 
Convolution3DAttributes& attr, + const DeviceInfo& device_info, const BHWDC* dst_shape) + : GPUOperation(definition), + stride_(attr.strides.w, attr.strides.h, attr.strides.d, 1), + padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, + -attr.padding.prepended.d, 0), + kernel_size_(attr.weights.shape.w, attr.weights.shape.h, + attr.weights.shape.d, 1), + dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 1), + conv_params_(GuessBestParams(device_info, definition, attr, dst_shape)) {} + ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) { if (this != &operation) { - std::swap(stride_padding_, operation.stride_padding_); - std::swap(kernel_dilation_, operation.kernel_dilation_); + std::swap(stride_, operation.stride_); + std::swap(padding_, operation.padding_); + std::swap(kernel_size_, operation.kernel_size_); + std::swap(dilation_, operation.dilation_); std::swap(conv_params_, operation.conv_params_); GPUOperation::operator=(std::move(operation)); } @@ -180,63 +222,88 @@ ConvPowerVR& ConvPowerVR::operator=(ConvPowerVR&& operation) { } void ConvPowerVR::GenerateCode(const DeviceInfo& device_info) { + if (conv_params_.linear_spatial) { + grid_dimension_ = 2; + } const bool stride_correction = - definition_.IsBatchSupported() && stride_padding_.x != 1; + definition_.IsBatchSupported() && stride_.x != 1; code_ = GenerateConv(device_info, definition_, stride_correction, conv_params_); if (definition_.precision == CalculationsPrecision::F16 && device_info.IsPowerVR()) { compiler_options_.push_back(CompilerOptions::POWERVR_FP16); } - if (conv_params_.IsPrivateMemBroadcast()) { + if (conv_params_.IsPrivateMemBroadcast() && device_info.IsCL20OrHigher()) { compiler_options_.push_back(CompilerOptions::CL_2_0); } + bool kernel_is_trivial = + conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1; + if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) { + kernel_is_trivial = kernel_is_trivial & conv_params_.z_kernel_is_1; + } + if (device_info.IsAdreno3xx() && + definition_.precision == CalculationsPrecision::F16 && + kernel_is_trivial) { + compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); + } } -absl::Status ConvPowerVR::BindArguments() { - if (!conv_params_.x_kernel_is_1 || !conv_params_.y_kernel_is_1) { - RETURN_IF_ERROR(args_.SetInt("stride_x", stride_padding_.x)); - RETURN_IF_ERROR(args_.SetInt("stride_y", stride_padding_.y)); - RETURN_IF_ERROR( - args_.SetInt("padding_x", stride_padding_.z * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("padding_y", stride_padding_.w)); - RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_dilation_.x)); - RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_dilation_.y)); - RETURN_IF_ERROR( - args_.SetInt("dilation_x", kernel_dilation_.z * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("dilation_y", kernel_dilation_.w)); +absl::Status ConvPowerVR::BindArguments(ArgumentsBinder* args) { + if (!conv_params_.x_kernel_is_1) { + RETURN_IF_ERROR(args->SetInt("stride_x", stride_.x)); + RETURN_IF_ERROR(args->SetInt("padding_x", padding_.x * src_[0]->Batch())); + RETURN_IF_ERROR(args->SetInt("kernel_size_x", kernel_size_.x)); + RETURN_IF_ERROR(args->SetInt("dilation_x", dilation_.x * src_[0]->Batch())); } - if (conv_params_.linear_hw) { + if (!conv_params_.y_kernel_is_1) { + RETURN_IF_ERROR(args->SetInt("stride_y", stride_.y)); + RETURN_IF_ERROR(args->SetInt("padding_y", padding_.y)); + RETURN_IF_ERROR(args->SetInt("kernel_size_y", kernel_size_.y)); + RETURN_IF_ERROR(args->SetInt("dilation_y", dilation_.y)); 
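The task_size_x / task_size_y integers bound here exist so the kernel can undo the spatial flattening performed by GenerateBlockCoords when linear_spatial is enabled. A host-side sketch of that unpacking for the DEPTH case, ignoring the per-axis block_size scaling the generated code applies afterwards; UnpackLinearSpatial is an illustrative name.

void UnpackLinearSpatial(int linear_spatial, int task_size_x, int task_size_y,
                         int* x, int* y, int* z) {
  // Mirrors the generated OpenCL: X varies fastest, then Y, then Z.
  *x = linear_spatial % task_size_x;
  linear_spatial /= task_size_x;
  *y = linear_spatial % task_size_y;
  *z = linear_spatial / task_size_y;
}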
+ } + if (definition_.src_tensors[0].HasAxis(Axis::DEPTH) && + !conv_params_.z_kernel_is_1) { + RETURN_IF_ERROR(args->SetInt("stride_z", stride_.z)); + RETURN_IF_ERROR(args->SetInt("padding_z", padding_.z)); + RETURN_IF_ERROR(args->SetInt("kernel_size_z", kernel_size_.z)); + RETURN_IF_ERROR(args->SetInt("dilation_z", dilation_.z)); + } + if (conv_params_.linear_spatial) { const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), conv_params_.block_size.x); - RETURN_IF_ERROR(args_.SetInt("task_size_x", grid_x)); + RETURN_IF_ERROR(args->SetInt("task_size_x", grid_x)); + } + if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) { + const int task_size_y = + DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y); + RETURN_IF_ERROR(args->SetInt("task_size_y", task_size_y)); } return absl::OkStatus(); } int3 ConvPowerVR::GetGridSize() const { - const int grid_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), - conv_params_.block_size.x); - const int grid_y = + const int task_size_x = DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), + conv_params_.block_size.x); + const int task_size_y = DivideRoundUp(dst_[0]->Height(), conv_params_.block_size.y); - const int grid_z = - DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.z); + const int task_size_z = + DivideRoundUp(dst_[0]->Depth(), conv_params_.block_size.z); + const int task_size_s = + DivideRoundUp(dst_[0]->Slices(), conv_params_.block_size.w); int3 wg; - if (conv_params_.linear_hw) { - wg.x = DivideRoundUp(grid_x * grid_y, work_group_size_.x); - wg.y = DivideRoundUp(grid_z, work_group_size_.y); - return int3( - wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x, - wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, 1); + if (conv_params_.linear_spatial) { + int grid_x = task_size_x * task_size_y; + if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) { + grid_x *= task_size_z; + } + return int3(grid_x, task_size_s, 1); } else { - wg.x = DivideRoundUp(grid_x, work_group_size_.x); - wg.y = DivideRoundUp(grid_y, work_group_size_.y); - wg.z = DivideRoundUp(grid_z, work_group_size_.z); - return int3( - wg[conv_params_.work_group_launch_order[0]] * work_group_size_.x, - wg[conv_params_.work_group_launch_order[1]] * work_group_size_.y, - wg[conv_params_.work_group_launch_order[2]] * work_group_size_.z); + int grid_y = task_size_y; + if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) { + grid_y *= task_size_z; + } + return int3(task_size_x, grid_y, task_size_s); } } @@ -251,14 +318,8 @@ void ConvPowerVR::GetPossibleKernelWorkGroups( work_groups->push_back(work_group_size_); return; } - if (conv_params_.work_group_launch_order[0] == 0 && - conv_params_.work_group_launch_order[1] == 1 && - conv_params_.work_group_launch_order[2] == 2) { - GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, - work_groups); - } else { - work_groups->push_back(work_group_size_); - } + GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, + work_groups); } std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, @@ -284,31 +345,80 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, AddSrcBuffer("weights", desc); } + const auto& src_def = op_def.src_tensors[0]; + + auto generate_id = [&](const std::string& x, const std::string& y, + const std::string& z) { + std::string id; + if (src_def.HasAxis(Axis::WIDTH)) { + id += "_w" + x; + } + if (src_def.HasAxis(Axis::HEIGHT)) { + id += "_h" + y; + } + if (src_def.HasAxis(Axis::DEPTH)) { + id += 
"_d" + z; + } + return id; + }; + + auto generate_id_full = [&](const std::string& x, const std::string& y, + const std::string& z, const std::string& s) { + return generate_id(x, y, z) + "_s" + s; + }; + + auto generate_check = [&](const std::string& x, const std::string& y, + const std::string& z) { + std::string check; + const std::vector axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH}; + const std::vector names{"in_x", "in_y", "in_z"}; + const std::vector is_1{conv_params_.x_kernel_is_1, + conv_params_.y_kernel_is_1, + conv_params_.z_kernel_is_1}; + const std::vector coords{x, y, z}; + for (int i = 0; i < axes.size(); ++i) { + const auto& axis = axes[i]; + if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) && + !is_1[i]) { + if (!check.empty()) { + check += " && "; + } + check += names[i] + coords[i]; + } + } + return check; + }; + auto dst_desc = op_def.dst_tensors[0]; if (op_def.IsBatchSupported()) { dst_desc.SetStateVar("BatchedWidth", "true"); } AddDstTensor("dst_tensor", dst_desc); - const bool is1x1 = conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1; - if (!is1x1) { + if (!conv_params_.x_kernel_is_1) { args_.AddInt("stride_x"); - args_.AddInt("stride_y"); args_.AddInt("padding_x"); - args_.AddInt("padding_y"); args_.AddInt("kernel_size_x"); - args_.AddInt("kernel_size_y"); args_.AddInt("dilation_x"); + } + if (!conv_params_.y_kernel_is_1) { + args_.AddInt("stride_y"); + args_.AddInt("padding_y"); + args_.AddInt("kernel_size_y"); args_.AddInt("dilation_y"); } - if (conv_params_.linear_hw) { + if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) { + args_.AddInt("stride_z"); + args_.AddInt("padding_z"); + args_.AddInt("kernel_size_z"); + args_.AddInt("dilation_z"); + } + if (conv_params_.linear_spatial) { args_.AddInt("task_size_x"); } - - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - const bool buffer_type = src_tensor_type == TensorStorageType::BUFFER || - src_tensor_type == TensorStorageType::IMAGE_BUFFER; - const bool manual_clamp = buffer_type && !is1x1; + if (src_def.HasAxis(Axis::DEPTH)) { + args_.AddInt("task_size_y"); + } const bool need_local_mem = conv_params.weights_upload_type == @@ -317,10 +427,10 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; const int local_mem_size = - conv_params.block_size.z * 4 * conv_params.src_depth_loop_size; + conv_params.block_size.w * 4 * conv_params.src_depth_loop_size; const bool use_simd_broadcast = conv_params.IsPrivateMemBroadcast(); - const int simd_size = conv_params.GetSimdSize(); + const int simd_size = conv_params.simd_size; const bool late_oob_check = need_local_mem || use_simd_broadcast; @@ -340,9 +450,11 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, if (use_simd_broadcast) { if (device_info.cl_version == OpenCLVersion::CL_2_0) { c += "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"; + } else if (device_info.SupportsExtension("cl_intel_subgroups")) { + c += "#pragma OPENCL EXTENSION cl_intel_subgroups : enable\n"; } } - const int3 block_size = conv_params.block_size; + const int4 block_size = conv_params.block_size; if (conv_params.fixed_work_group_size) { c += "__attribute__((reqd_work_group_size(" + std::to_string(work_group_size_.x) + ", " + @@ -353,28 +465,41 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, c += "__attribute__((intel_reqd_sub_group_size(" + std::to_string(simd_size) + ")))\n"; } + std::string dst_oob_check; + if 
(src_def.HasAxis(Axis::DEPTH)) { + if (conv_params.linear_spatial) { + dst_oob_check = + "DST_Z >= args.dst_tensor.Depth() || DST_S >= " + "args.dst_tensor.Slices()"; + } else { + dst_oob_check = + "DST_X >= args.dst_tensor.Width() || DST_Z >= " + "args.dst_tensor.Depth() || DST_S >= args.dst_tensor.Slices()"; + } + } else { + if (conv_params.linear_spatial) { + dst_oob_check = + "DST_Y >= args.dst_tensor.Height() || DST_S >= " + "args.dst_tensor.Slices()"; + } else { + dst_oob_check = + "DST_X >= args.dst_tensor.Width() || DST_Y >= " + "args.dst_tensor.Height() || DST_S >= args.dst_tensor.Slices()"; + } + } c += "__kernel void main_function(\n"; c += "$0) {\n"; - c += GenerateBlockCoords(conv_params.block_size, - conv_params.work_group_launch_order, - conv_params.linear_hw); - std::vector dst_x(conv_params.block_size.x); - for (int x = 0; x < conv_params.block_size.x; ++x) { - dst_x[x] = "(X + " + std::to_string(x) + ")"; - } - std::vector dst_y(conv_params.block_size.y); - for (int y = 0; y < conv_params.block_size.y; ++y) { - dst_y[y] = "(Y + " + std::to_string(y) + ")"; - } + c += GenerateBlockCoords(conv_params.block_size, work_group_launch_order_, + conv_params.linear_spatial, + src_def.HasAxis(Axis::DEPTH)); if (!late_oob_check) { - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " - "|| Z >= args.dst_tensor.Slices()) {\n"; + c += " if (" + dst_oob_check + ") {\n"; c += " return;\n"; c += " }\n"; } if (conv_params.weights_upload_type == ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - if (conv_params.linear_hw) { + if (conv_params.linear_spatial) { c += " int lid = get_local_id(0);\n"; } else { c += " int lid = get_local_id(1) * " + @@ -384,135 +509,263 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, if (use_simd_broadcast) { c += " int simd_id = get_sub_group_local_id();\n"; } - for (int z = 0; z < block_size.z; ++z) { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - c += " ACCUM_FLT4 r" + std::to_string(z) + std::to_string(y) + - std::to_string(x) + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; + for (int s = 0; s < block_size.w; ++s) { + const std::string sind = std::to_string(s); + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + c += " ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) + + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; + } } } } - if (!is1x1) { + if (!conv_params_.x_kernel_is_1) { for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string xc = "(DST_X + " + xind + ")"; if (stride_correction) { - c += " int xc" + std::to_string(x) + " = " + - GetXStrideCorrected(dst_x[x], "args.src_tensor.Batch()", - "args.stride_x", "args.padding_x") + + c += " int xc" + xind + " = " + + GetXStrideCorrected(xc, "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int xc" + std::to_string(x) + " = " + dst_x[x] + + c += " int xc" + xind + " = " + xc + " * args.stride_x + args.padding_x;\n"; } } + } else { + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + c += " int xc" + xind + " = DST_X + " + xind + ";\n"; + if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) { + c += " xc" + xind + " = clamp(xc" + xind + + ", 0, args.src_tensor.Width() - 1);\n"; 
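// Border-handling summary for the coordinate setup in this block: when the
// kernel size along an axis is 1, the source coordinate is simply the
// destination coordinate and is clamped only if the storage type cannot read
// outside the tensor (CanReadOutOfBorder() is false). Axes with a real kernel
// and a storage that cannot zero-clamp get an in_x/in_y/in_z predicate (see
// generate_check above), which either zeroes the read (a conditional read or
// a multiply by the predicate) or, for linear storages that return zero for a
// -1 address, is folded into the address via select(-1, addr, check).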
+ } + } + } + if (!conv_params_.y_kernel_is_1) { for (int y = 0; y < block_size.y; ++y) { - c += " int yc" + std::to_string(y) + " = " + dst_y[y] + + const std::string yind = std::to_string(y); + const std::string yc = "(DST_Y + " + yind + ")"; + c += " int yc" + yind + " = " + yc + " * args.stride_y + args.padding_y;\n"; } + } else { + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + c += " int yc" + yind + " = DST_Y + " + yind + ";\n"; + if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) { + c += " yc" + yind + " = clamp(yc" + yind + + ", 0, args.src_tensor.Height() - 1);\n"; + } + } + } + if (src_def.HasAxis(Axis::DEPTH)) { + if (!conv_params_.z_kernel_is_1) { + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + const std::string zc = "(DST_Z + " + zind + ")"; + c += " int zc" + zind + " = " + zc + + " * args.stride_z + args.padding_z;\n"; + } + } else { + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + c += " int zc" + zind + " = DST_Z + " + zind + ";\n"; + if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) { + c += " zc" + zind + " = clamp(zc" + zind + + ", 0, args.src_tensor.Depth() - 1);\n"; + } + } + } + } + bool trivial_kernel_size = + conv_params_.x_kernel_is_1 && conv_params_.y_kernel_is_1; + if (src_def.HasAxis(Axis::DEPTH)) { + trivial_kernel_size = trivial_kernel_size && conv_params_.z_kernel_is_1; } if (need_local_mem) { c += " __local " + weights_data_type + " weights_cache[" + std::to_string(local_mem_size) + "];\n"; - } else { + } else if (conv_params.AreWeightsBuffer()) { c += " " + weights_global_ptr + " weights_cache;\n"; + } else if (!trivial_kernel_size) { + c += " int filter_offset = 0;\n"; } - if (is1x1) { + if (conv_params.AreWeightsBuffer()) { if (conv_params.different_weights_for_height) { c += " " + weights_global_ptr + - " filters_loc = args.weights.GetPtr() + (Z * " - "args.src_tensor.Height() + Y * " + - std::to_string(block_size.z) + ") * 4 * args.src_tensor.Slices();\n"; + " filters_loc = args.weights.GetPtr() + (DST_S * " + "args.src_tensor.Height() + DST_Y * " + + std::to_string(block_size.w) + ") * 4 * args.src_tensor.Slices();\n"; } else { + std::string kernel_spatial_offset = ""; + if (!conv_params_.x_kernel_is_1) { + kernel_spatial_offset += " * args.kernel_size_x"; + } + if (!conv_params_.y_kernel_is_1) { + kernel_spatial_offset += " * args.kernel_size_y"; + } + if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) { + kernel_spatial_offset += " * args.kernel_size_z"; + } c += " " + weights_global_ptr + - " filters_loc = args.weights.GetPtr() + Z * 4 * " - "args.src_tensor.Slices();\n"; + " filters_loc = args.weights.GetPtr() + DST_S * 4 * " + "args.src_tensor.Slices()" + + kernel_spatial_offset + ";\n"; } - } else { - c += " " + weights_global_ptr + - " filters_loc = args.weights.GetPtr() + Z * 4 * " - "args.src_tensor.Slices() *args.kernel_size_x * args.kernel_size_y;\n"; } - if (buffer_type) { - c += " const int src_layer_offset = args.src_tensor.SliceStride();\n"; + if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) { + c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zck = "zck" + std::to_string(z); + c += " int zck" + std::to_string(z) + " = kz * args.dilation_z + zc" + + std::to_string(z) + ";\n"; + if (!src_def.SupportsZeroClamp(Axis::DEPTH)) { + c += " bool in_z" + std::to_string(z) + " = " + zck + " >= 0 && " + + zck + " < 
args.src_tensor.Depth();\n"; + if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) { + c += " " + zck + " = clamp(" + zck + + ", 0, args.src_tensor.Depth() - 1);\n"; + } + } + } } - if (!is1x1) { + if (!conv_params_.y_kernel_is_1) { c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; for (int y = 0; y < block_size.y; ++y) { const std::string yck = "yck" + std::to_string(y); c += " int " + yck + " = ky * args.dilation_y + yc" + std::to_string(y) + ";\n"; - if (manual_clamp) { - c += " bool my" + std::to_string(y) + " = " + yck + " >= 0 && " + yck + - " < args.src_tensor.Height();\n"; - c += " " + yck + " = clamp(" + yck + - ", 0, args.src_tensor.Height() - 1);\n"; + if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) { + c += " bool in_y" + std::to_string(y) + " = " + yck + " >= 0 && " + + yck + " < args.src_tensor.Height();\n"; + if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) { + c += " " + yck + " = clamp(" + yck + + ", 0, args.src_tensor.Height() - 1);\n"; + } } } + } + if (!conv_params_.x_kernel_is_1) { c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; for (int x = 0; x < block_size.x; ++x) { const std::string xck = "xck" + std::to_string(x); c += " int xck" + std::to_string(x) + " = kx * args.dilation_x + xc" + std::to_string(x) + ";\n"; - if (manual_clamp) { - c += " bool mx" + std::to_string(x) + " = " + xck + " >= 0 && " + xck + - " < args.src_tensor.Width();\n"; - c += " " + xck + " = clamp(" + xck + - ", 0, args.src_tensor.Width() - 1);\n"; + if (!src_def.SupportsZeroClamp(Axis::WIDTH)) { + c += " bool in_x" + std::to_string(x) + " = " + xck + " >= 0 && " + + xck + " < args.src_tensor.Width();\n"; + if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) { + c += " " + xck + " = clamp(" + xck + + ", 0, args.src_tensor.Width() - 1);\n"; + } } } } - if (buffer_type) { + const bool need_multiple_slice_strides = + src_def.ReturnsZeroForNegOneRead() && !trivial_kernel_size; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); for (int y = 0; y < block_size.y; ++y) { - const std::string yck = "yck" + std::to_string(y); + const std::string yind = std::to_string(y); for (int x = 0; x < block_size.x; ++x) { - const std::string xck = "xck" + std::to_string(x); - std::string xc = - is1x1 ? "min(" + dst_x[x] + ", args.src_tensor.Width() - 1)" : xck; - std::string yc = - is1x1 ? "min(" + dst_y[y] + ", args.src_tensor.Height() - 1)" : yck; - std::string id = std::to_string(y) + std::to_string(x); - c += " int src_a_" + id + " = " + yc + - " * args.src_tensor.Width() + " + xc + ";\n"; + const std::string xind = std::to_string(x); + std::string xc = conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind; + std::string yc = conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind; + const std::string id = generate_id(xind, yind, zind); + std::string coords = "" + xc + ", " + yc; + if (src_def.HasAxis(Axis::DEPTH)) { + std::string zc = + conv_params.z_kernel_is_1 ? 
"zc" + zind : "zck" + zind; + coords += ", " + zc; + } + if (src_def.IsLinear()) { + c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + + ", 0);\n"; + if (need_multiple_slice_strides) { + const std::string check = generate_check(xind, yind, zind); + c += " addr" + id + " = select(-1, addr" + id + ", (" + check + + "));\n"; + c += " int ds" + id + + " = select(0, args.src_tensor.SliceStride(), (" + check + + "));\n"; + } + } } } } + if (src_def.IsLinear() && !need_multiple_slice_strides) { + c += " int ds = args.src_tensor.SliceStride();\n"; + } auto declare_src = [&]() { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - const std::string id = std::to_string(y) + std::to_string(x); - c += " " + weights_data_type + " src" + id + ";\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string id = generate_id(xind, yind, zind); + c += " " + weights_data_type + " src" + id + ";\n"; + } } } }; const bool conditional_read = device_info.IsMali(); auto read_src = [&]() { const std::string cl_type = ToCLDataType(conv_params.weights_data_type); - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - if (buffer_type) { - std::string id = std::to_string(y) + std::to_string(x); - if (is1x1) { - c += " src" + id + " = args.src_tensor.Read<" + cl_type + - ">(src_a_" + id + ");\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + std::string id = generate_id(xind, yind, zind); + const std::string check = generate_check(xind, yind, zind); + std::string address; + if (src_def.IsLinear()) { + address = "addr" + id; } else { - std::string condition = - "mx" + std::to_string(x) + " && my" + std::to_string(y); - if (conditional_read) { - c += " src" + id + " = " + condition + - " ? args.src_tensor.Read<" + cl_type + ">(src_a_" + id + - ") : (FLT4)(0.0f);\n"; + std::string xc = + conv_params.x_kernel_is_1 ? "xc" + xind : "xck" + xind; + std::string yc = + conv_params.y_kernel_is_1 ? "yc" + yind : "yck" + yind; + address = "" + xc + ", " + yc; + if (src_def.HasAxis(Axis::DEPTH)) { + std::string zc = + conv_params.z_kernel_is_1 ? "zc" + zind : "zck" + zind; + address += ", " + zc; + } + address += ", s"; + } + if (src_def.ReturnsZeroForNegOneRead()) { + c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + + address + ");\n"; + const std::string ds = trivial_kernel_size ? "ds" : "ds" + id; + c += " " + address + " += " + ds + ";\n"; + } else { + if (!check.empty()) { + if (conditional_read) { + c += " src" + id + " = " + check + + " ? 
args.src_tensor.Read<" + cl_type + ">(" + address + + ") : (FLT4)(0.0f);\n"; + } else { + c += " src" + id + " = args.src_tensor.Read<" + cl_type + + ">(" + address + ") * (FLT)(" + check + ");\n"; + } } else { c += " src" + id + " = args.src_tensor.Read<" + cl_type + - ">(src_a_" + id + ") * (FLT)(" + condition + ");\n"; + ">(" + address + ");\n"; + } + if (src_def.IsLinear()) { + c += " " + address + " += ds;\n"; } } - c += " src_a_" + id + " += src_layer_offset;\n"; - } else { - std::string id = std::to_string(y) + std::to_string(x); - const std::string xc = is1x1 ? dst_x[x] : "xck" + std::to_string(x); - const std::string yc = is1x1 ? dst_y[y] : "yck" + std::to_string(y); - c += " src" + id + " = args.src_tensor.Read<" + cl_type + ">(" + - xc + ", " + yc + ", s);\n"; } } } @@ -522,59 +775,80 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, conv_params.weights_data_type == DataType::FLOAT16); auto conv_core = [&](int shared_offset) { const std::string channels[] = {"x", "y", "z", "w"}; - for (int z = 0; z < block_size.z; ++z) { + for (int s = 0; s < block_size.w; ++s) { + const std::string sind = std::to_string(s); if (weights_type_as_accum_type) { for (int ch = 0; ch < 4; ++ch) { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - std::string id = std::to_string(y) + std::to_string(x); - if (use_simd_broadcast) { - int simd_id = (z * 4 + ch + shared_offset) / simd_size; - int thread_id = (z * 4 + ch + shared_offset) % simd_size; - std::string w_val_x = "sub_group_broadcast(simd_w" + - std::to_string(simd_id) + ".x, " + - std::to_string(thread_id) + "u)"; - std::string w_val_y = "sub_group_broadcast(simd_w" + - std::to_string(simd_id) + ".y, " + - std::to_string(thread_id) + "u)"; - std::string w_val_z = "sub_group_broadcast(simd_w" + - std::to_string(simd_id) + ".z, " + - std::to_string(thread_id) + "u)"; - std::string w_val_w = "sub_group_broadcast(simd_w" + - std::to_string(simd_id) + ".w, " + - std::to_string(thread_id) + "u)"; - c += " r" + std::to_string(z) + id + ".x += " + w_val_x + - " * src" + id + "." + channels[ch] + ";\n"; - c += " r" + std::to_string(z) + id + ".y += " + w_val_y + - " * src" + id + "." + channels[ch] + ";\n"; - c += " r" + std::to_string(z) + id + ".z += " + w_val_z + - " * src" + id + "." + channels[ch] + ";\n"; - c += " r" + std::to_string(z) + id + ".w += " + w_val_w + - " * src" + id + "." + channels[ch] + ";\n"; - } else { - std::string w_val = "weights_cache[" + - std::to_string(z * 4 + ch + shared_offset) + - "]"; - c += " r" + std::to_string(z) + id + " += " + w_val + - " * src" + id + "." 
+ channels[ch] + ";\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + std::string R = "r" + generate_id_full(xind, yind, zind, sind); + std::string S = "src" + generate_id(xind, yind, zind); + if (use_simd_broadcast) { + int simd_id = (s * 4 + ch + shared_offset) / simd_size; + int thread_id = (s * 4 + ch + shared_offset) % simd_size; + std::string w_val_x = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".x, " + + std::to_string(thread_id) + "u)"; + std::string w_val_y = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".y, " + + std::to_string(thread_id) + "u)"; + std::string w_val_z = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".z, " + + std::to_string(thread_id) + "u)"; + std::string w_val_w = "sub_group_broadcast(simd_w" + + std::to_string(simd_id) + ".w, " + + std::to_string(thread_id) + "u)"; + c += " " + R + ".x += " + w_val_x + " * " + S + "." + + channels[ch] + ";\n"; + c += " " + R + ".y += " + w_val_y + " * " + S + "." + + channels[ch] + ";\n"; + c += " " + R + ".z += " + w_val_z + " * " + S + "." + + channels[ch] + ";\n"; + c += " " + R + ".w += " + w_val_w + " * " + S + "." + + channels[ch] + ";\n"; + } else { + const std::string weight_id = + std::to_string(s * 4 + ch + shared_offset); + std::string w_val; + if (conv_params.AreWeightsBuffer()) { + w_val = "weights_cache[" + weight_id + "]"; + } else { + w_val = "f" + weight_id; + } + c += " " + R + " += " + w_val + " * " + S + "." + + channels[ch] + ";\n"; + } } } } } } else { // F32_F16 precision and weights type is float16 - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - std::string id = std::to_string(y) + std::to_string(x); - std::string R = "r" + std::to_string(z) + id; - std::string S = "src" + id; - const int dz = z * 4 + shared_offset; - std::string f0 = "weights_cache[" + std::to_string(dz + 0) + "]"; - std::string f1 = "weights_cache[" + std::to_string(dz + 1) + "]"; - std::string f2 = "weights_cache[" + std::to_string(dz + 2) + "]"; - std::string f3 = "weights_cache[" + std::to_string(dz + 3) + "]"; - c += " " + R + " += convert_float4(" + S + ".x * " + f0 + " + " + - S + ".y * " + f1 + " + " + S + ".z * " + f2 + " + " + S + - ".w * " + f3 + ");\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + std::string R = "r" + generate_id_full(xind, yind, zind, sind); + std::string S = "src" + generate_id(xind, yind, zind); + std::vector F(4); + for (int i = 0; i < 4; ++i) { + std::string weight_id = + std::to_string(s * 4 + i + shared_offset); + if (conv_params.AreWeightsBuffer()) { + F[i] = "weights_cache[" + weight_id + "]"; + } else { + F[i] = "f" + weight_id; + } + } + c += " " + R + " += convert_float4(" + S + ".x * " + F[0] + + " + " + S + ".y * " + F[1] + " + " + S + ".z * " + F[2] + + " + " + S + ".w * " + F[3] + ");\n"; + } } } } @@ -611,8 +885,26 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, "];\n"; c += " }\n"; } - } else { // GLOBAL_MEM/CONSTANT_MEM + } else if (conv_params.AreWeightsBuffer()) { // GLOBAL_MEM/CONSTANT_MEM c += " weights_cache = filters_loc;\n"; + } else { 
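Before the texture-backed branch below, a minimal standalone sketch of the assumed weights layout: the rearranged weights are split across four 2D textures, and texture i supplies the FLT4 that multiplies the i-th component of the source vector in conv_core (read below as f0..f3 per output slice of the block). The names WeightsTextureShape and TexturesMemX4Shape are local to this sketch, not real TFLite GPU helpers; compare with UploadWeights further down in the patch.

// Sketch only: geometry of the four weight textures used by the
// TEXTURES_MEM_X4 path.
struct WeightsTextureShape {
  int width;   // destination slices, aligned to block_size.w
  int height;  // source slices * number of kernel taps
};

inline WeightsTextureShape TexturesMemX4Shape(int dst_slices_aligned,
                                              int src_slices, int kernel_h,
                                              int kernel_w, int kernel_d = 1) {
  // Each texture i is read at (DST_S + dst_s, filter_offset) -- or at
  // (DST_S + dst_s, s) when the kernel is spatially trivial.
  return {dst_slices_aligned, src_slices * kernel_h * kernel_w * kernel_d};
}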
// TEXTURES_MEM + for (int dst_s = 0; dst_s < block_size.w; ++dst_s) { + std::string f_y = trivial_kernel_size ? "s" : "filter_offset"; + if (conv_params.different_weights_for_height) { + f_y = "DST_Y * args.src_tensor.Slices() + s"; + } + c += absl::Substitute( + R"( FLT4 f$2 = args.weights0.Read(DST_S + $0, $1); + FLT4 f$3 = args.weights1.Read(DST_S + $0, $1); + FLT4 f$4 = args.weights2.Read(DST_S + $0, $1); + FLT4 f$5 = args.weights3.Read(DST_S + $0, $1); +)", + dst_s, f_y, dst_s * 4 + 0, dst_s * 4 + 1, dst_s * 4 + 2, + dst_s * 4 + 3); + } + if (!trivial_kernel_size) { + c += " filter_offset++;\n"; + } } read_src(); c += " s += 1;\n"; @@ -623,61 +915,96 @@ std::string ConvPowerVR::GenerateConv(const DeviceInfo& device_info, conv_core(0); for (int i = 1; i < conv_params.src_depth_loop_size; ++i) { read_src(); - conv_core(i * block_size.z * 4); + conv_core(i * block_size.w * 4); c += " s += 1;\n"; } - c += " filters_loc += " + std::to_string(local_mem_size) + ";\n"; + if (conv_params.AreWeightsBuffer()) { + c += " filters_loc += " + std::to_string(local_mem_size) + ";\n"; + } c += " } while (s < args.src_tensor.Slices());\n"; - if (!is1x1) { - c += " };\n"; + if (!conv_params.x_kernel_is_1) { c += " };\n"; } - if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { - c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "Z", - block_size.z); - } else if (conv_params.weights_upload_type == - ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) { - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", "Z", - "lid", total_work_items, block_size.z); - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - } else { - c += " weights_cache = args.biases.GetPtr() + Z;\n"; + if (!conv_params.y_kernel_is_1) { + c += " };\n"; + } + if (src_def.HasAxis(Axis::DEPTH) && !conv_params_.z_kernel_is_1) { + c += " };\n"; + } + if (conv_params.AreWeightsBuffer()) { + if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP) { + c += GenerateAsyncUpload("weights_cache", "args.biases.GetPtr()", "DST_S", + block_size.w); + } else if (conv_params.weights_upload_type == + ConvPowerVR::WeightsUploadType::LOCAL_MEM_BY_THREADS) { + c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; + c += GenerateUploadByThreads("weights_cache", "args.biases.GetPtr()", + "DST_S", "lid", total_work_items, + block_size.w); + c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; + } else { + c += " weights_cache = args.biases.GetPtr() + DST_S;\n"; + } } if (late_oob_check) { - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " - "|| Z >= args.dst_tensor.Slices()) {\n"; + c += " if (" + dst_oob_check + ") {\n"; c += " return;\n"; c += " }\n"; } - for (int z = 0; z < block_size.z; ++z) { - const std::string sz = std::to_string(z); - c += " if (Z + " + sz + " >= args.dst_tensor.Slices()) return;\n"; - c += " {\n"; - c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sz + "]);\n"; - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - const std::string xs = dst_x[x]; - const std::string ys = dst_y[y]; - const std::string zs = "Z + " + sz; - const std::string r_id = sz + std::to_string(y) + std::to_string(x); - bool need_x_check = x != 0; - bool need_y_check = y != 0; - if (need_x_check && need_y_check) { - c += " if (" + xs + " < args.dst_tensor.Width() && " + ys + - " < args.dst_tensor.Height()) {\n"; - } else if (need_x_check && !need_y_check) { - c += " if (" + 
xs + " < args.dst_tensor.Width()) {\n"; - } else if (!need_x_check && need_y_check) { - c += " if (" + ys + " < args.dst_tensor.Height()) {\n"; - } else { - c += " {\n"; + + auto generate_dst_check = [&](int x, int y, int z) { + std::string check; + const std::vector axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH}; + const std::vector names{"Width()", "Height()", "Depth()"}; + std::vector coords(3); + coords[0] = "DST_X + " + std::to_string(x); + coords[1] = "DST_Y + " + std::to_string(y); + coords[2] = "DST_Z + " + std::to_string(z); + const std::vector ids{x, y, z}; + for (int i = 0; i < axes.size(); ++i) { + const auto& axis = axes[i]; + if (src_def.HasAxis(axis) && ids[i] != 0) { + if (!check.empty()) { + check += " && "; + } + check += coords[i] + " < args.dst_tensor." + names[i]; + } + } + return check; + }; + + for (int s = 0; s < block_size.w; ++s) { + const std::string sind = std::to_string(s); + c += " if (DST_S + " + sind + " >= args.dst_tensor.Slices()) return;\n"; + c += " {\n"; + if (conv_params.AreWeightsBuffer()) { + c += " FLT4 bias_val = TO_FLT4(weights_cache[" + sind + "]);\n"; + } else { + c += " FLT4 bias_val = args.biases.Read(DST_S + " + sind + ");\n"; + } + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string id = generate_id_full(xind, yind, zind, sind); + const std::string check = generate_dst_check(x, y, z); + std::string coords = "DST_X + " + xind + ", DST_Y + " + yind; + if (src_def.HasAxis(Axis::DEPTH)) { + coords += ", DST_Z + " + zind; + } + coords += ", DST_S + " + sind; + if (!check.empty()) { + c += " if (" + check + ") {\n"; + } else { + c += " {\n"; + } + c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; + c += " args.dst_tensor.Write(res, " + coords + ");\n"; + c += " }\n"; } - c += " FLT4 res = TO_FLT4(r" + r_id + ") + bias_val;\n"; - c += " args.dst_tensor.Write(res, " + xs + ", " + ys + ", " + zs + - ");\n"; - c += " }\n"; } } c += " }\n"; @@ -691,7 +1018,7 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( int src_depth, int dst_depth, bool x_kernel_is_1, bool y_kernel_is_1, bool different_weights_for_height, const BHWC* dst_shape) { ConvParams conv_params; - conv_params.linear_hw = false; + conv_params.linear_spatial = false; conv_params.weights_data_type = DeduceDataTypeFromPrecision(definition.precision); conv_params.x_kernel_is_1 = x_kernel_is_1; @@ -700,84 +1027,84 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( if (device_info.IsNvidia()) { if (different_weights_for_height) { work_group_size_ = int3(32, 1, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); + work_group_launch_order_ = int3(2, 0, 1); conv_params.fixed_work_group_size = true; } else { - conv_params.linear_hw = true; + conv_params.linear_spatial = true; work_group_size_ = int3(32, 1, 1); - conv_params.work_group_launch_order = int3(1, 0, 2); + work_group_launch_order_ = int3(1, 0, 2); conv_params.fixed_work_group_size = true; } - conv_params.block_size = int3(2, 1, 4); + conv_params.block_size = int4(2, 1, 1, 4); conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; if (dst_depth % 4 == 0 || dst_depth >= 8) { - conv_params.block_size.z = 4; + conv_params.block_size.w = 4; } else if (dst_depth % 2 == 0 || dst_depth >= 4) { - conv_params.block_size.z = 2; + 
conv_params.block_size.w = 2; } else { - conv_params.block_size.z = dst_depth; + conv_params.block_size.w = dst_depth; } if (dst_shape) { int task_size = dst_shape->w * dst_shape->b * dst_shape->h * dst_depth; float task_size_per_cu = static_cast(task_size) / device_info.compute_units_count; int block_size = conv_params.block_size.x * conv_params.block_size.y * - conv_params.block_size.z; + conv_params.block_size.w; float threads_per_cu = task_size_per_cu / block_size; float warps_per_cu = threads_per_cu / 32 /*warp_size*/; if (warps_per_cu < 8.0f) { conv_params.block_size.x = 1; } - if (warps_per_cu < 4.0f && conv_params.block_size.z >= 4) { - conv_params.block_size.z /= 2; + if (warps_per_cu < 4.0f && conv_params.block_size.w >= 4) { + conv_params.block_size.w /= 2; } - if (warps_per_cu < 2.0f && conv_params.block_size.z >= 2) { - conv_params.block_size.z /= 2; + if (warps_per_cu < 2.0f && conv_params.block_size.w >= 2) { + conv_params.block_size.w /= 2; } } if (src_depth % 2 == 0) { conv_params.src_depth_loop_size = 2; } - if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) { + if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) { conv_params.src_depth_loop_size = 4; } } else if (device_info.IsPowerVR()) { if (different_weights_for_height) { work_group_size_ = int3(32, 1, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); + work_group_launch_order_ = int3(2, 0, 1); conv_params.fixed_work_group_size = true; } else { - conv_params.linear_hw = true; + conv_params.linear_spatial = true; work_group_size_ = int3(32, 1, 1); - conv_params.work_group_launch_order = int3(1, 0, 2); + work_group_launch_order_ = int3(1, 0, 2); conv_params.fixed_work_group_size = true; } conv_params.weights_data_type = definition.precision == CalculationsPrecision::F16 ? 
DataType::FLOAT16 : DataType::FLOAT32; - conv_params.block_size = int3(1, 1, 4); + conv_params.block_size = int4(1, 1, 1, 4); conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_ASYNC_SUBGROUP; if (dst_depth % 8 == 0 || dst_depth >= 32) { - conv_params.block_size.z = 8; + conv_params.block_size.w = 8; } else if (dst_depth % 4 == 0 || dst_depth >= 8) { - conv_params.block_size.z = 4; + conv_params.block_size.w = 4; } else if (dst_depth % 2 == 0 || dst_depth >= 4) { - conv_params.block_size.z = 2; + conv_params.block_size.w = 2; } else { - conv_params.block_size.z = dst_depth; + conv_params.block_size.w = dst_depth; } if (definition.precision == CalculationsPrecision::F16) { - conv_params.block_size.z = std::min(4, conv_params.block_size.z); + conv_params.block_size.w = std::min(4, conv_params.block_size.w); if (src_depth % 2 == 0) { conv_params.src_depth_loop_size = 2; } - if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) { + if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) { conv_params.src_depth_loop_size = 4; } - if (conv_params.block_size.z == 1) { + if (conv_params.block_size.w == 1) { if (src_depth % 2 == 0) { conv_params.src_depth_loop_size = 2; } @@ -793,28 +1120,28 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( } else if (device_info.IsAMD()) { if (different_weights_for_height) { work_group_size_ = int3(32, 1, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); + work_group_launch_order_ = int3(2, 0, 1); conv_params.fixed_work_group_size = true; } else { work_group_size_ = int3(8, 4, 1); - conv_params.work_group_launch_order = int3(2, 0, 1); + work_group_launch_order_ = int3(2, 0, 1); conv_params.fixed_work_group_size = true; } - conv_params.block_size = int3(2, 1, 1); + conv_params.block_size = int4(2, 1, 1, 1); if (x_kernel_is_1 && y_kernel_is_1) { conv_params.block_size.y = 2; } conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::CONSTANT_MEM; if (dst_depth % 8 == 0 || dst_depth >= 32) { - conv_params.block_size.z = 8; + conv_params.block_size.w = 8; } else if (dst_depth % 4 == 0 || dst_depth >= 8) { - conv_params.block_size.z = 4; + conv_params.block_size.w = 4; } else if (dst_depth % 2 == 0 || dst_depth >= 4) { - conv_params.block_size.z = 2; + conv_params.block_size.w = 2; } else { - conv_params.block_size.z = 1; + conv_params.block_size.w = 1; } if (src_depth % 2 == 0 && src_depth >= 16) { conv_params.src_depth_loop_size = 2; @@ -831,20 +1158,20 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( } if (block_size == 8) { if (dst_depth == 1 || dst_depth == 3) { - conv_params.block_size = int3(2, 2, 1); + conv_params.block_size = int4(2, 2, 1, 1); } else { - conv_params.block_size = int3(2, 2, 2); + conv_params.block_size = int4(2, 2, 1, 2); } } else if (block_size == 4) { if (dst_depth == 1 || dst_depth == 3) { - conv_params.block_size = int3(2, 2, 1); + conv_params.block_size = int4(2, 2, 1, 1); } else { - conv_params.block_size = int3(2, 1, 2); + conv_params.block_size = int4(2, 1, 1, 2); } } else if (block_size == 2) { - conv_params.block_size = int3(2, 1, 1); + conv_params.block_size = int4(2, 1, 1, 1); } else { - conv_params.block_size = int3(1, 1, 1); + conv_params.block_size = int4(1, 1, 1, 1); } conv_params.src_depth_loop_size = 1; MaliInfo mali_info = device_info.mali_info; @@ -856,70 +1183,88 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( conv_params.src_depth_loop_size = 4; } work_group_size_ = int3(4, 4, 1); - 
conv_params.work_group_launch_order = int3(0, 1, 2); + work_group_launch_order_ = int3(0, 1, 2); conv_params.fixed_work_group_size = false; conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; } else if (device_info.IsAdreno()) { - conv_params.block_size = int3(2, 2, 1); + conv_params.block_size = int4(2, 2, 1, 2); + if (device_info.IsAdreno3xx()) { + if (definition.precision == CalculationsPrecision::F16) { + conv_params.block_size = int4(2, 2, 1, 2); + } else if (definition.precision == CalculationsPrecision::F32_F16) { + conv_params.block_size = int4(2, 1, 1, 2); + } else { // F32 + conv_params.block_size = int4(2, 2, 1, 1); + } + } work_group_size_ = int3(8, 2, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); + work_group_launch_order_ = int3(0, 1, 2); conv_params.fixed_work_group_size = false; conv_params.src_depth_loop_size = 1; - conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; + if (definition.src_tensors.size() == 2) { + // dynamic weights supported only with buffers. + conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; + } else { + conv_params.weights_upload_type = WeightsUploadType::TEXTURES_MEM_X4; + } } else if (device_info.IsIntel()) { if (different_weights_for_height) { work_group_size_ = int3(16, 1, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); + work_group_launch_order_ = int3(0, 1, 2); conv_params.fixed_work_group_size = true; } else { - conv_params.linear_hw = true; + conv_params.linear_spatial = true; work_group_size_ = int3(16, 1, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); + work_group_launch_order_ = int3(0, 1, 2); conv_params.fixed_work_group_size = true; } - conv_params.block_size = int3(1, 1, 4); + conv_params.block_size = int4(1, 1, 1, 4); conv_params.src_depth_loop_size = 1; + int sub_group_size = 16; + const bool supports_subgroups = + device_info.SupportsExtension("cl_khr_subgroups") || + device_info.SupportsExtension("cl_intel_subgroups"); if (definition.precision != CalculationsPrecision::F32_F16 && - device_info.SupportsExtension("cl_khr_subgroups") && + supports_subgroups && device_info.SupportsExtension("cl_intel_required_subgroup_size") && - device_info.IsCL20OrHigher() && - device_info.SupportsSubGroupWithSize(16)) { + device_info.SupportsSubGroupWithSize(sub_group_size)) { conv_params.weights_upload_type = - WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST; + WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST; + conv_params.simd_size = sub_group_size; } else { conv_params.weights_upload_type = WeightsUploadType::LOCAL_MEM_BY_THREADS; } if (dst_depth % 4 == 0 || dst_depth >= 8) { - conv_params.block_size.z = 4; + conv_params.block_size.w = 4; } else if (dst_depth % 2 == 0 || dst_depth >= 4) { - conv_params.block_size.z = 2; + conv_params.block_size.w = 2; } else { - conv_params.block_size.z = dst_depth; + conv_params.block_size.w = dst_depth; } if (src_depth % 2 == 0) { conv_params.src_depth_loop_size = 2; } - if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) { + if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) { conv_params.src_depth_loop_size = 4; } } else { - conv_params.block_size = int3(1, 1, 4); + conv_params.block_size = int4(1, 1, 1, 4); work_group_size_ = int3(8, 2, 1); - conv_params.work_group_launch_order = int3(0, 1, 2); + work_group_launch_order_ = int3(0, 1, 2); conv_params.fixed_work_group_size = false; conv_params.src_depth_loop_size = 1; conv_params.weights_upload_type = WeightsUploadType::GLOBAL_MEM; if (dst_depth % 4 == 0 || dst_depth >= 8) { - 
conv_params.block_size.z = 4; + conv_params.block_size.w = 4; } else if (dst_depth % 2 == 0 || dst_depth >= 4) { - conv_params.block_size.z = 2; + conv_params.block_size.w = 2; } else { - conv_params.block_size.z = dst_depth; + conv_params.block_size.w = dst_depth; } if (src_depth % 2 == 0) { conv_params.src_depth_loop_size = 2; } - if (src_depth % 4 == 0 && conv_params.block_size.z <= 2) { + if (src_depth % 4 == 0 && conv_params.block_size.w <= 2) { conv_params.src_depth_loop_size = 4; } } @@ -944,6 +1289,41 @@ ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( x_kernel_is_1, y_kernel_is_1, false, dst_shape); } +ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( + const DeviceInfo& device_info, const OperationDef& definition, + const Convolution3DAttributes& attr, const BHWDC* dst_shape) { + const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); + const int src_depth = DivideRoundUp(attr.weights.shape.i, 4); + const bool x_kernel_is_1 = attr.weights.shape.w == 1 && attr.strides.w == 1 && + attr.dilations.w == 1 && + attr.padding.prepended.w == 0 && + attr.padding.appended.w == 0; + const bool y_kernel_is_1 = attr.weights.shape.h == 1 && attr.strides.h == 1 && + attr.dilations.h == 1 && + attr.padding.prepended.h == 0 && + attr.padding.appended.h == 0; + const bool z_kernel_is_1 = attr.weights.shape.d == 1 && attr.strides.d == 1 && + attr.dilations.d == 1 && + attr.padding.prepended.d == 0 && + attr.padding.appended.d == 0; + + ConvPowerVR::ConvParams result; + BHWC shape; + if (dst_shape) { + shape.b = dst_shape->b; + shape.h = dst_shape->h * dst_shape->d; + shape.w = dst_shape->w; + shape.c = dst_shape->c; + result = GuessBestParams(device_info, definition, src_depth, dst_depth, + x_kernel_is_1, y_kernel_is_1, false, &shape); + } else { + result = GuessBestParams(device_info, definition, src_depth, dst_depth, + x_kernel_is_1, y_kernel_is_1, false, nullptr); + } + result.z_kernel_is_1 = z_kernel_is_1; + return result; +} + ConvPowerVR::ConvParams ConvPowerVR::GuessBestParams( const DeviceInfo& device_info, const OperationDef& definition, const Convolution2DAttributes& attr, const BHWC& weights_shape, @@ -1031,6 +1411,17 @@ ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info, return result; } +ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info, + const OperationDef& definition, + const Convolution3DAttributes& attr, + const BHWDC* dst_shape) { + ConvPowerVR result(definition, attr, device_info, dst_shape); + result.GenerateCode(device_info); + result.UploadWeights(attr.weights); + result.UploadBias(attr.bias); + return result; +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h index bceb25044f7..30e412cd923 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_POWERVR_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_POWERVR_H_ +#include #include #include "tensorflow/lite/delegates/gpu/cl/buffer.h" @@ -25,6 +26,7 @@ limitations under the License. 
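The Convolution3DAttributes overload of GuessBestParams above reuses the 2D per-vendor heuristics by folding the depth of the destination shape into its height and then attaching z_kernel_is_1 afterwards. A minimal standalone sketch of that flattening follows; BHWC/BHWDC here are simplified stand-ins and FlattenDepthIntoHeight is a name local to this sketch, not the real tflite::gpu types.

// Sketch only: how the 3D destination shape is presented to the 2D heuristics.
struct BHWC  { int b, h, w, c; };
struct BHWDC { int b, h, w, d, c; };

inline BHWC FlattenDepthIntoHeight(const BHWDC& dst) {
  BHWC shape;
  shape.b = dst.b;
  shape.h = dst.h * dst.d;  // depth is folded into height
  shape.w = dst.w;
  shape.c = dst.c;
  return shape;
}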
#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" #include "tensorflow/lite/delegates/gpu/cl/tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" @@ -45,13 +47,13 @@ class ConvPowerVR : public GPUOperation { TuningType tuning_type, const DeviceInfo& device_info, const KernelInfo& kernel_info, std::vector* work_groups) const override; - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; ConvWeightsDescription GetConvWeightsDescription() const { ConvWeightsDescription desc; desc.layout = ConvWeightsLayout::kOHWIOGroupI4O4; - desc.output_group_size = conv_params_.block_size.z; + desc.output_group_size = conv_params_.block_size.w; return desc; } @@ -67,11 +69,8 @@ class ConvPowerVR : public GPUOperation { LOCAL_MEM_BY_THREADS, GLOBAL_MEM, CONSTANT_MEM, - PRIVATE_MEM_SIMD8_BROADCAST, - PRIVATE_MEM_SIMD16_BROADCAST, - PRIVATE_MEM_SIMD32_BROADCAST, - PRIVATE_MEM_SIMD64_BROADCAST, - PRIVATE_MEM_SIMD128_BROADCAST, + PRIVATE_MEM_SIMD_BROADCAST, + TEXTURES_MEM_X4, // 4 textures for weights }; struct ConvParams { @@ -83,47 +82,26 @@ class ConvPowerVR : public GPUOperation { // weights, so for PowerVR in this kernel we have F32 weights for // F32_F16 precision mode DataType weights_data_type; // used for weights and biases - int3 block_size; - int3 work_group_launch_order; + int4 block_size; // WHDS bool fixed_work_group_size; - bool linear_hw; + bool linear_spatial; // spatial dimensions are Width/Height/Depth bool different_weights_for_height; int src_depth_loop_size; WeightsUploadType weights_upload_type; bool x_kernel_is_1; bool y_kernel_is_1; + bool z_kernel_is_1; + + // used only with PRIVATE_MEM_SIMD_BROADCAST + int simd_size = 1; + + bool AreWeightsBuffer() const { + return weights_upload_type != WeightsUploadType::TEXTURES_MEM_X4; + } bool IsPrivateMemBroadcast() const { return weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST || - weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST || - weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST || - weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST || - weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST; - } - - int GetSimdSize() const { - if (weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD8_BROADCAST) { - return 8; - } else if (weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD16_BROADCAST) { - return 16; - } else if (weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD32_BROADCAST) { - return 32; - } else if (weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD64_BROADCAST) { - return 64; - } else if (weights_upload_type == - WeightsUploadType::PRIVATE_MEM_SIMD128_BROADCAST) { - return 128; - } - return 1; + WeightsUploadType::PRIVATE_MEM_SIMD_BROADCAST; } }; @@ -137,6 +115,9 @@ class ConvPowerVR : public GPUOperation { const FullyConnectedAttributes& attr, const DeviceInfo& device_info, const BHWC* dst_shape = nullptr); explicit ConvPowerVR(const OperationDef& definition); + ConvPowerVR(const OperationDef& definition, + const Convolution3DAttributes& attr, + const DeviceInfo& device_info, const BHWDC* dst_shape = nullptr); void GenerateCode(const DeviceInfo& 
device_info); @@ -150,6 +131,9 @@ class ConvPowerVR : public GPUOperation { template void UploadWeights(const tflite::gpu::Tensor& weights); + template + void UploadWeights(const tflite::gpu::Tensor& weights); + template void UploadBias(const tflite::gpu::Tensor& bias); @@ -172,6 +156,11 @@ class ConvPowerVR : public GPUOperation { const DeviceInfo& device_info, const OperationDef& definition, const Convolution2DAttributes& attr, const BHWC* dst_shape); + friend ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info, + const OperationDef& definition, + const Convolution3DAttributes& attr, + const BHWDC* dst_shape); + ConvParams GuessBestParams(const DeviceInfo& device_info, const OperationDef& definition, const Convolution2DAttributes& attr, @@ -189,6 +178,10 @@ class ConvPowerVR : public GPUOperation { const OperationDef& definition, const Convolution2DAttributes& attr, const BHWC* dst_shape = nullptr); + ConvParams GuessBestParams(const DeviceInfo& device_info, + const OperationDef& definition, + const Convolution3DAttributes& attr, + const BHWDC* dst_shape = nullptr); ConvParams GuessBestParams(const DeviceInfo& device_info, const OperationDef& definition, int src_depth, int dst_depth, bool x_kernel_is_1, @@ -200,8 +193,10 @@ class ConvPowerVR : public GPUOperation { const OperationDef& op_def, bool stride_correction, const ConvParams& conv_params); - int4 stride_padding_; - int4 kernel_dilation_; + int4 stride_; + int4 padding_; + int4 kernel_size_; + int4 dilation_; ConvParams conv_params_; }; @@ -236,7 +231,7 @@ void ConvPowerVR::UploadBias(const tflite::gpu::Tensor& bias) { const int float_size = conv_params_.weights_data_type == DataType::FLOAT32 ? sizeof(float) : sizeof(half); - int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.z); + int aligned_channels = AlignByN(bias.shape.v, 4 * conv_params_.block_size.w); desc.size = float_size * aligned_channels; desc.data.resize(desc.size); if (conv_params_.weights_data_type == DataType::FLOAT32) { @@ -256,37 +251,125 @@ void ConvPowerVR::UploadBias(const tflite::gpu::Tensor& bias) { template void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor& weights) { - const int dst_depth = DivideRoundUp(weights.shape.o, 4); - const int src_depth = DivideRoundUp(weights.shape.i, 4); + const int dst_slices = + AlignByN(DivideRoundUp(weights.shape.o, 4), conv_params_.block_size.w); + const int src_slices = DivideRoundUp(weights.shape.i, 4); const bool f32_weights = conv_params_.weights_data_type == DataType::FLOAT32; const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4); - const int dst_depth_aligned = AlignByN(dst_depth, conv_params_.block_size.z); const int elements_count = - weights.shape.h * weights.shape.w * src_depth * dst_depth_aligned * 4; + weights.shape.h * weights.shape.w * src_slices * dst_slices * 4; - BufferDescriptor desc; - desc.element_type = conv_params_.weights_data_type; - desc.element_size = 4; - desc.memory_type = conv_params_.weights_upload_type == - ConvPowerVR::WeightsUploadType::CONSTANT_MEM - ? 
MemoryType::CONSTANT - : MemoryType::GLOBAL; - desc.size = float4_size * elements_count; - desc.data.resize(desc.size); + std::vector data(float4_size * elements_count); if (f32_weights) { - float4* ptr = reinterpret_cast(desc.data.data()); - RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z, - absl::MakeSpan(ptr, elements_count)); + float4* ptr = reinterpret_cast(data.data()); + if (conv_params_.AreWeightsBuffer()) { + RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } } else { - half4* ptr = reinterpret_cast(desc.data.data()); - RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.z, - absl::MakeSpan(ptr, elements_count)); + half4* ptr = reinterpret_cast(data.data()); + if (conv_params_.AreWeightsBuffer()) { + RearrangeWeightsToOHWIOGroupI4O4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4HWIOOGroupO4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } + } + if (conv_params_.AreWeightsBuffer()) { + BufferDescriptor desc; + desc.element_type = conv_params_.weights_data_type; + desc.element_size = 4; + desc.memory_type = conv_params_.weights_upload_type == + ConvPowerVR::WeightsUploadType::CONSTANT_MEM + ? MemoryType::CONSTANT + : MemoryType::GLOBAL; + desc.size = float4_size * elements_count; + desc.data = std::move(data); + args_.AddObject("weights", + absl::make_unique(std::move(desc))); + } else { + const int texture_width = dst_slices; + const int texture_height = src_slices * weights.shape.h * weights.shape.w; + const int sub_size = float4_size * texture_width * texture_height; + for (int i = 0; i < 4; ++i) { + Texture2DDescriptor desc; + desc.element_type = conv_params_.weights_data_type; + desc.size = int2(texture_width, texture_height); + desc.data.resize(sub_size); + std::memcpy(desc.data.data(), data.data() + sub_size * i, sub_size); + const std::string name = "weights" + std::to_string(i); + args_.AddObject(name, + absl::make_unique(std::move(desc))); + } + } +} + +template +void ConvPowerVR::UploadWeights(const tflite::gpu::Tensor& weights) { + const int block_size = conv_params_.block_size.w; + const int dst_slices = + AlignByN(DivideRoundUp(weights.shape.o, 4), block_size); + const int src_slices = DivideRoundUp(weights.shape.i, 4); + + const int elements_count = weights.shape.d * weights.shape.h * + weights.shape.w * src_slices * dst_slices * 4; + const bool f32_weights = definition_.precision == CalculationsPrecision::F32; + + const int float4_size = f32_weights ? 
16 : 8; + + std::vector data(float4_size * elements_count); + + if (f32_weights) { + float4* ptr = reinterpret_cast(data.data()); + if (conv_params_.AreWeightsBuffer()) { + RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } + } else { + half4* ptr = reinterpret_cast(data.data()); + if (conv_params_.AreWeightsBuffer()) { + RearrangeWeightsToODHWIOGroupI4O4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4DHWIOOGroupO4(weights, conv_params_.block_size.w, + absl::MakeSpan(ptr, elements_count)); + } + } + + if (conv_params_.AreWeightsBuffer()) { + BufferDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + desc.size = float4_size * elements_count; + desc.data = std::move(data); + args_.AddObject("weights", + absl::make_unique(std::move(desc))); + } else { + const int texture_width = dst_slices; + const int texture_height = + src_slices * weights.shape.d * weights.shape.h * weights.shape.w; + int sub_size = float4_size * texture_width * texture_height; + for (int i = 0; i < 4; ++i) { + Texture2DDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.size = int2(texture_width, texture_height); + desc.data.resize(sub_size); + memcpy(desc.data.data(), data.data() + sub_size * i, sub_size); + const std::string name = "weights" + std::to_string(i); + args_.AddObject(name, + absl::make_unique(std::move(desc))); + } } - args_.AddObject("weights", - absl::make_unique(std::move(desc))); } ConvPowerVR CreateConvPowerVR(const DeviceInfo& device_info, @@ -310,6 +393,11 @@ ConvPowerVR CreateConvPowerVRWino4x4To6x6(const DeviceInfo& device_info, const Convolution2DAttributes& attr, const BHWC* dst_shape = nullptr); +ConvPowerVR CreateConvPowerVR3D(const DeviceInfo& device_info, + const OperationDef& definition, + const Convolution3DAttributes& attr, + const BHWDC* dst_shape = nullptr); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc deleted file mode 100644 index bff328772d7..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.cc +++ /dev/null @@ -1,461 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h" - -#include -#include -#include - -#include "absl/strings/substitute.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/cl/precision.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" - -namespace tflite { -namespace gpu { -namespace cl { -namespace { -bool UseFP16SIMD(const DeviceInfo& device_info, CalculationsPrecision precision, - bool kernel1x1) { - if (!device_info.IsAdreno()) { - return false; - } - switch (precision) { - case CalculationsPrecision::F32: - case CalculationsPrecision::F32_F16: - return false; - case CalculationsPrecision::F16: - return device_info.IsAdreno3xx() && kernel1x1; - } -} -} // namespace - -ConvTexture::ConvTexture(const OperationDef& definition, - const Convolution2DAttributes& attr) - : GPUOperation(definition), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h), - stride_(attr.strides.w, attr.strides.h), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h), - dilation_(attr.dilations.w, attr.dilations.h), - different_weights_for_height_(false), - block_size_(2, 2, 2) { - work_group_size_ = int3(4, 4, 2); -} - -ConvTexture::ConvTexture(const OperationDef& definition) - : GPUOperation(definition), - kernel_size_(1, 1), - stride_(1, 1), - padding_(0, 0), - dilation_(1, 1), - different_weights_for_height_(false), - block_size_(4, 1, 2) { - work_group_size_ = int3(16, 1, 2); -} - -ConvTexture::ConvTexture(ConvTexture&& operation) - : GPUOperation(std::move(operation)), - kernel_size_(operation.kernel_size_), - stride_(operation.stride_), - padding_(operation.padding_), - dilation_(operation.dilation_), - different_weights_for_height_(operation.different_weights_for_height_), - block_size_(operation.block_size_) {} - -ConvTexture& ConvTexture::operator=(ConvTexture&& operation) { - if (this != &operation) { - std::swap(kernel_size_, operation.kernel_size_); - std::swap(stride_, operation.stride_); - std::swap(padding_, operation.padding_); - std::swap(dilation_, operation.dilation_); - std::swap(different_weights_for_height_, - operation.different_weights_for_height_); - std::swap(block_size_, operation.block_size_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -std::string ConvTexture::GenerateConvCode(const OperationDef& op_def, - const int3& block_size, bool is1x1, - bool adreno4xx_optimization, - bool stride_correction, - bool different_weights_for_height) { - auto src_desc = op_def.src_tensors[0]; - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - if (op_def.IsBatchSupported()) { - src_desc.SetStateVar("BatchedWidth", "true"); - } - AddSrcTensor("src_tensor", src_desc); - - auto dst_desc = op_def.dst_tensors[0]; - if (op_def.IsBatchSupported()) { - dst_desc.SetStateVar("BatchedWidth", "true"); - } - AddDstTensor("dst_tensor", dst_desc); - - if (!is1x1) { - args_.AddInt("kernel_size_x"); - args_.AddInt("kernel_size_y"); - args_.AddInt("dilation_x"); - args_.AddInt("dilation_y"); - } - args_.AddInt("stride_x"); - args_.AddInt("stride_y"); - args_.AddInt("padding_x"); - args_.AddInt("padding_y"); - - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - const bool is_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER || - 
src_tensor_type == TensorStorageType::BUFFER; - - std::vector xs(block_size.x); - for (int x = 0; x < block_size.x; ++x) { - xs[x] = std::to_string(x); - } - - std::vector ys(block_size.y); - for (int y = 0; y < block_size.y; ++y) { - ys[y] = std::to_string(y); - } - - std::vector zs(block_size.z); - for (int z = 0; z < block_size.z; ++z) { - zs[z] = std::to_string(z); - } - - std::string c = GetCommonDefines(op_def.precision); - for (int z = 0; z < block_size.z; ++z) { - const std::string f0 = std::to_string(z * 4 + 0); - const std::string f1 = std::to_string(z * 4 + 1); - const std::string f2 = std::to_string(z * 4 + 2); - const std::string f3 = std::to_string(z * 4 + 3); - switch (op_def.precision) { - case CalculationsPrecision::F32: - case CalculationsPrecision::F16: - c += "#define CONV" + zs[z] + "(R, S) \\\n"; - c += "R += S.x * f" + f0 + "; \\\n"; - c += "R += S.y * f" + f1 + "; \\\n"; - c += "R += S.z * f" + f2 + "; \\\n"; - c += "R += S.w * f" + f3 + "; \n"; - break; - case CalculationsPrecision::F32_F16: - c += "#define CONV" + zs[z] + "(R, S) \\\n"; - c += "R += convert_float4(S.x * f" + f0 + " + S.y * f" + f1 + - " + S.z * f" + f2 + " + S.w * f" + f3 + ");\n"; - break; - } - } - - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int X = get_global_id(0) * " + std::to_string(block_size.x) + ";\n"; - c += " int Y = get_global_id(1) * " + std::to_string(block_size.y) + ";\n"; - c += " int Z = get_global_id(2) * " + std::to_string(block_size.z) + ";\n"; - c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() " - "|| Z >= args.dst_tensor.Slices()) return;\n"; - std::vector s_x(block_size.x); - std::vector s_y(block_size.y); - for (int x = 0; x < block_size.x; ++x) { - if (stride_correction) { - c += " int xc" + xs[x] + " = " + - GetXStrideCorrected("X + " + xs[x], "args.src_tensor.Batch()", - "args.stride_x", "args.padding_x") + - ";\n"; - } else { - c += " int xc" + xs[x] + " = (X +" + xs[x] + - ") * args.stride_x + args.padding_x;\n"; - } - s_x[x] = is1x1 ? "xc" + xs[x] : "cx" + xs[x]; - } - for (int y = 0; y < block_size.y; ++y) { - c += " int yc" + ys[y] + " = (Y +" + ys[y] + - ") * args.stride_y + args.padding_y;\n"; - s_y[y] = is1x1 ? "yc" + ys[y] : "cy" + ys[y]; - } - for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) { - c += " ACCUM_FLT4 r" + std::to_string(i) + - " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - } - std::string f_y = is1x1 ? 
"s" : "filter_offset"; - if (different_weights_for_height) { - f_y = "Y * args.src_tensor.Slices() + s"; - } - if (!is1x1) { - for (int x = 0; x < block_size.x; ++x) { - c += " int cx" + xs[x] + ";\n"; - } - for (int y = 0; y < block_size.y; ++y) { - c += " int cy" + ys[y] + ";\n"; - } - c += " int filter_offset = 0;\n"; - c += " for (int y = 0; y < args.kernel_size_y; ++y) {\n"; - for (int y = 0; y < block_size.y; ++y) { - c += " cy" + ys[y] + " = y * args.dilation_y + yc" + ys[y] + ";\n"; - } - if (is_buffer) { - for (int y = 0; y < block_size.y; ++y) { - c += " bool in_y" + ys[y] + " = cy" + ys[y] + " >= 0 && cy" + ys[y] + - " < args.src_tensor.Height();\n"; - if (src_tensor_type == TensorStorageType::BUFFER) { - c += " cy" + ys[y] + " = clamp(cy" + ys[y] + - ", 0, args.src_tensor.Height() - 1);\n"; - } - } - } - c += " for (int x = 0; x < args.kernel_size_x; ++x) {\n"; - for (int x = 0; x < block_size.x; ++x) { - c += " cx" + xs[x] + " = x * args.dilation_x + xc" + xs[x] + ";\n"; - } - if (is_buffer) { - for (int x = 0; x < block_size.x; ++x) { - c += " bool in_x" + xs[x] + " = cx" + xs[x] + " >= 0 && cx" + xs[x] + - " < args.src_tensor.Width();\n"; - if (src_tensor_type == TensorStorageType::BUFFER) { - c += " cx" + xs[x] + " = clamp(cx" + xs[x] + - ", 0, args.src_tensor.Width() - 1);\n"; - } - } - for (int x = 0; x < block_size.x; ++x) { - for (int y = 0; y < block_size.y; ++y) { - const std::string id = std::to_string(y * block_size.x + x); - if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - c += absl::Substitute( - " int addr_$0 = select(-1, cy$2 * args.src_tensor.Width() + " - "cx$1, (in_x$1 " - "&& " - "in_y$2));\n", - y * block_size.x + x, x, y); - c += absl::Substitute( - " int dz_$0 = select(0, args.src_tensor.Width() * " - "args.src_tensor.Height(), (in_x$1 && " - "in_y$2));\n", - y * block_size.x + x, x, y); - } else { - c += absl::Substitute( - " int addr_$0 = cy$2 * args.src_tensor.Width() + cx$1;\n", - y * block_size.x + x, x, y); - } - } - } - if (src_tensor_type == TensorStorageType::BUFFER) { - c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n"; - } - } - } else if (is_buffer) { - for (int y = 0; y < block_size.y; ++y) { - c += " bool in_y" + ys[y] + " = yc" + ys[y] + " >= 0 && yc" + ys[y] + - " < args.src_tensor.Height();\n"; - } - for (int x = 0; x < block_size.x; ++x) { - c += " bool in_x" + xs[x] + " = xc" + xs[x] + " >= 0 && xc" + xs[x] + - " < args.src_tensor.Width();\n"; - } - for (int x = 0; x < block_size.x; ++x) { - for (int y = 0; y < block_size.y; ++y) { - const std::string id = std::to_string(y * block_size.x + x); - if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - c += absl::Substitute( - " int addr_$0 = select(-1, yc$2 * args.src_tensor.Width() + " - "xc$1, (in_x$1 && " - "in_y$2));\n", - y * block_size.x + x, x, y); - c += absl::Substitute( - " int dz_$0 = select(0, args.src_tensor.Width() * " - "args.src_tensor.Height(), (in_x$1 && " - "in_y$2));\n", - y * block_size.x + x, x, y); - } else { - c += absl::Substitute( - " int addr_$0 = yc$2 * args.src_tensor.Width() + xc$1;\n", - y * block_size.x + x, x, y); - } - } - } - if (src_tensor_type == TensorStorageType::BUFFER) { - c += " int dz = args.src_tensor.Width() * args.src_tensor.Height();\n"; - } - } - c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n"; - if (is_buffer) { - if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - for (int index = 0; index < block_size.x * block_size.y; ++index) { - const std::string id = std::to_string(index); - 
c += - " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + ");\n"; - } - } else { - for (int x = 0; x < block_size.x; ++x) { - for (int y = 0; y < block_size.y; ++y) { - const std::string id = std::to_string(y * block_size.x + x); - c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + - ") * (FLT)(in_x" + xs[x] + " && in_y" + ys[y] + "); addr_" + id + - " += dz;\n"; - } - } - } - } - for (int z = 0; z < block_size.z; ++z) { - c += absl::Substitute(R"( FLT4 f$2 = args.weights0.Read($0, $1); - FLT4 f$3 = args.weights1.Read($0, $1); - FLT4 f$4 = args.weights2.Read($0, $1); - FLT4 f$5 = args.weights3.Read($0, $1); -)", - "Z + " + zs[z], f_y, z * 4 + 0, z * 4 + 1, z * 4 + 2, - z * 4 + 3); - } - if (!is_buffer) { - for (int x = 0; x < block_size.x; ++x) { - for (int y = 0; y < block_size.y; ++y) { - const std::string id = std::to_string(y * block_size.x + x); - c += " FLT4 src" + id + " = args.src_tensor.Read(" + s_x[x] + ", " + - s_y[y] + ", s);\n"; - } - } - } - for (int z = 0; z < block_size.z; ++z) { - for (int i = 0; i < block_size.x * block_size.y; ++i) { - c += " CONV" + zs[z] + "(r" + - std::to_string(i + z * block_size.x * block_size.y) + ", src" + - std::to_string(i) + ");\n"; - } - } - if (!is1x1) { - c += " filter_offset++;\n"; - } - if (is_buffer) { - if (src_tensor_type == TensorStorageType::IMAGE_BUFFER) { - for (int index = 0; index < block_size.x * block_size.y; ++index) { - const std::string id = std::to_string(index); - c += " addr_" + id + " += dz_" + id + ";\n"; - } - } - } - c += " }\n"; // args.src_tensor.Slices() - if (!is1x1) { - c += " }\n"; // kernel_size_x - c += " }\n"; // kernel_size_y - } - // when is1x1 && adreno4xx_optimization is true, xc0 == X and yc0 == Y - std::string dst_x = is1x1 && adreno4xx_optimization ? "xc0" : "X"; - std::string dst_y = is1x1 && adreno4xx_optimization ? 
"yc0" : "Y"; - for (int z = 0; z < block_size.z; ++z) { - c += " if (Z < args.dst_tensor.Slices()) {\n"; - c += " FLT4 bias_val = args.biases.Read(Z);\n"; - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - const std::string id = - std::to_string((z * block_size.y + y) * block_size.x + x); - c += " {\n"; - c += " int xc = " + dst_x + " + " + xs[x] + ";\n"; - c += " int yc = " + dst_y + " + " + ys[y] + ";\n"; - c += " if (xc < args.dst_tensor.Width() && yc < " - "args.dst_tensor.Height()) {\n"; - c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; - c += " args.dst_tensor.Write(res, xc, yc, Z);\n"; - c += " }\n"; - c += " }\n"; - } - } - c += " }\n"; - c += " Z++;\n"; - } - c += "}\n"; - return c; -} - -void ConvTexture::GenerateCode(const DeviceInfo& device_info) { - auto storage_type = definition_.GetPrimaryStorageType(); - bool is1x1 = kernel_size_.x == 1 && kernel_size_.y == 1; - bool adreno4xx_optimization = - stride_.x == 1 && stride_.y == 1 && padding_.x == 0 && padding_.y == 0 && - device_info.IsAdreno4xx() && - storage_type == TensorStorageType::TEXTURE_ARRAY && - definition_.precision == CalculationsPrecision::F16; - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - code_ = - GenerateConvCode(definition_, block_size_, is1x1, adreno4xx_optimization, - stride_correction, different_weights_for_height_); - - if (UseFP16SIMD(device_info, definition_.precision, is1x1)) { - compiler_options_.push_back(CompilerOptions::ADRENO_FULL_SIMD_LINE); - } -} - -absl::Status ConvTexture::BindArguments() { - if (!(kernel_size_.x == 1 && kernel_size_.y == 1)) { - RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x)); - RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y)); - RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y)); - } - RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); - RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); - RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); - return absl::OkStatus(); -} - -int3 ConvTexture::GetGridSize() const { - const int grid_x = - DivideRoundUp(dst_[0]->Width() * dst_[0]->Batch(), block_size_.x); - const int grid_y = DivideRoundUp(dst_[0]->Height(), block_size_.y); - const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z); - return int3(grid_x, grid_y, grid_z); -} - -void ConvTexture::GetPossibleKernelWorkGroups( - TuningType tuning_type, const DeviceInfo& device_info, - const KernelInfo& kernel_info, std::vector* work_groups) const { - GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, - work_groups); -} - -ConvTexture CreateConvTexture(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr) { - ConvTexture result(definition, attr); - result.GenerateCode(device_info); - result.UploadData(attr.weights, attr.bias); - return result; -} - -ConvTexture CreateConvTexture(const DeviceInfo& device_info, - const OperationDef& definition, - const FullyConnectedAttributes& attr) { - ConvTexture result(definition); - result.GenerateCode(device_info); - result.UploadData(attr.weights, attr.bias); - return result; -} - -ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr) { - ConvTexture result(definition); - 
result.different_weights_for_height_ = true; - result.block_size_ = {4, 1, 2}; - result.GenerateCode(device_info); - result.UploadDataForWinograd4x4To6x6(attr.weights); - return result; -} - -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h deleted file mode 100644 index 3ebd43bf32b..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" -#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" -#include "tensorflow/lite/delegates/gpu/cl/util.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" -#include "tensorflow/lite/delegates/gpu/common/winograd_util.h" - -namespace tflite { -namespace gpu { -namespace cl { - -// This convolution process BLOCK_SIZE(XxYxZ) of FLT4 values per thread. 
-class ConvTexture : public GPUOperation { - public: - ConvTexture() = default; - void GetPossibleKernelWorkGroups( - TuningType tuning_type, const DeviceInfo& device_info, - const KernelInfo& kernel_info, - std::vector* work_groups) const override; - absl::Status BindArguments() override; - int3 GetGridSize() const override; - - // Move only - ConvTexture(ConvTexture&& operation); - ConvTexture& operator=(ConvTexture&& operation); - ConvTexture(const ConvTexture&) = delete; - ConvTexture& operator=(const ConvTexture&) = delete; - - private: - friend ConvTexture CreateConvTexture(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr); - friend ConvTexture CreateConvTexture(const DeviceInfo& device_info, - const OperationDef& definition, - const FullyConnectedAttributes& attr); - - friend ConvTexture CreateConvTextureWino4x4To6x6( - const DeviceInfo& device_info, const OperationDef& definition, - const Convolution2DAttributes& attr); - - ConvTexture(const OperationDef& definition, - const Convolution2DAttributes& attr); - explicit ConvTexture(const OperationDef& definition); - template - void UploadData(const tflite::gpu::Tensor& weights, - const tflite::gpu::Tensor& biases); - - template - void UploadDataForWinograd4x4To6x6( - const tflite::gpu::Tensor& weights); - - template - void UploadWeights(const tflite::gpu::Tensor& weights); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst_0, absl::Span dst_1, - absl::Span dst_2, absl::Span dst_3); - - void GenerateCode(const DeviceInfo& device_info); - - std::string GenerateConvCode(const OperationDef& op_def, - const int3& block_size, bool is1x1, - bool adreno4xx_optimization, - bool stride_correction, - bool different_weights_for_height); - - int2 kernel_size_; - int2 stride_; - int2 padding_; - int2 dilation_; - - // By default in 2d convolution we have the same weights for WH dims, but in - // some cases we need separate weights for H dimension and convolution kernel - // requires very small modifications to support it. 
- bool different_weights_for_height_; - - int3 block_size_ = int3(2, 2, 2); -}; - -template -void ConvTexture::UploadData(const tflite::gpu::Tensor& weights, - const tflite::gpu::Tensor& biases) { - UploadWeights(weights); - - TensorLinearDescriptor desc; - desc.storage_type = LinearStorageType::TEXTURE_2D; - desc.element_type = definition_.GetDataType(); - desc.UploadLinearData(biases); - args_.AddObject("biases", - absl::make_unique(std::move(desc))); -} - -template -void ConvTexture::UploadDataForWinograd4x4To6x6( - const tflite::gpu::Tensor& weights) { - tflite::gpu::Tensor wino_weights; - RearrangeWeightsToWinograd4x4To6x6Weights(weights, &wino_weights); - UploadWeights(wino_weights); - - tflite::gpu::Tensor bias; - bias.shape = Linear(1); - bias.data = {0.0f}; - TensorLinearDescriptor desc; - desc.storage_type = LinearStorageType::TEXTURE_2D; - desc.element_type = definition_.GetDataType(); - desc.UploadLinearData(bias); - args_.AddObject("biases", - absl::make_unique(std::move(desc))); -} - -template -void ConvTexture::UploadWeights(const tflite::gpu::Tensor& weights) { - int dst_depth = DivideRoundUp(weights.shape.o, 4); - dst_depth = AlignByN(dst_depth, block_size_.z); - const int src_depth = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - int texture_width = dst_depth; - int texture_height = src_depth * kernel_x * kernel_y; - - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - DataType data_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - - const int elements_count = texture_width * texture_height; - const int float4_size = f32_weights ? sizeof(float4) : sizeof(half4); - - Texture2DDescriptor desc0; - desc0.element_type = data_type; - desc0.size = int2(texture_width, texture_height); - desc0.data.resize(elements_count * float4_size); - - Texture2DDescriptor desc1; - desc1.element_type = data_type; - desc1.size = int2(texture_width, texture_height); - desc1.data.resize(elements_count * float4_size); - - Texture2DDescriptor desc2; - desc2.element_type = data_type; - desc2.size = int2(texture_width, texture_height); - desc2.data.resize(elements_count * float4_size); - - Texture2DDescriptor desc3; - desc3.element_type = data_type; - desc3.size = int2(texture_width, texture_height); - desc3.data.resize(elements_count * float4_size); - - if (f32_weights) { - float4* ptr0 = reinterpret_cast(desc0.data.data()); - float4* ptr1 = reinterpret_cast(desc1.data.data()); - float4* ptr2 = reinterpret_cast(desc2.data.data()); - float4* ptr3 = reinterpret_cast(desc3.data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr0, elements_count), - absl::MakeSpan(ptr1, elements_count), - absl::MakeSpan(ptr2, elements_count), - absl::MakeSpan(ptr3, elements_count)); - } else { - half4* ptr0 = reinterpret_cast(desc0.data.data()); - half4* ptr1 = reinterpret_cast(desc1.data.data()); - half4* ptr2 = reinterpret_cast(desc2.data.data()); - half4* ptr3 = reinterpret_cast(desc3.data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr0, elements_count), - absl::MakeSpan(ptr1, elements_count), - absl::MakeSpan(ptr2, elements_count), - absl::MakeSpan(ptr3, elements_count)); - } - - args_.AddObject("weights0", - absl::make_unique(std::move(desc0))); - args_.AddObject("weights1", - absl::make_unique(std::move(desc1))); - args_.AddObject("weights2", - absl::make_unique(std::move(desc2))); - args_.AddObject("weights3", - absl::make_unique(std::move(desc3))); -} - -template -void 
ConvTexture::RearrangeWeightsData( - const tflite::gpu::Tensor& weights, absl::Span dst_0, - absl::Span dst_1, absl::Span dst_2, absl::Span dst_3) { - int dst_depth = DivideRoundUp(weights.shape.o, 4); - dst_depth = AlignByN(dst_depth, block_size_.z); - const int src_depth = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - int texture_width = dst_depth; - - for (int d = 0; d < dst_depth / block_size_.z; ++d) { - for (int y = 0; y < kernel_y; ++y) { - for (int x = 0; x < kernel_x; ++x) { - for (int s = 0; s < src_depth; ++s) { - for (int sub_d = 0; sub_d < block_size_.z; ++sub_d) { - T filters[4]; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - const int s_ch = s * 4 + j; - const int d_ch = (d * block_size_.z + sub_d) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) { - const int f_index = - weights.shape.LinearIndex({d_ch, y, x, s_ch}); - filters[j][i] = weights.data[f_index]; - } else { - filters[j][i] = 0.0f; - } - } - } - int x_coord = d * block_size_.z + sub_d; - int y_coord = (y * kernel_x + x) * src_depth + s; - int offset = y_coord * texture_width + x_coord; - dst_0[offset] = filters[0]; - dst_1[offset] = filters[1]; - dst_2[offset] = filters[2]; - dst_3[offset] = filters[3]; - } - } - } - } - } -} - -ConvTexture CreateConvTexture(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr); - -ConvTexture CreateConvTexture(const DeviceInfo& device_info, - const OperationDef& definition, - const FullyConnectedAttributes& attr); - -ConvTexture CreateConvTextureWino4x4To6x6(const DeviceInfo& device_info, - const OperationDef& definition, - const Convolution2DAttributes& attr); - -} // namespace cl -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONV_TEXTURE_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture_test.cc deleted file mode 100644 index 2a92573b689..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_texture_test.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h" - -#include - -#include -#include -#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" - -using ::testing::FloatNear; -using ::testing::Pointwise; - -namespace tflite { -namespace gpu { -namespace cl { -namespace { - -TEST_F(OpenCLOperationTest, ConvTextureSimpleWeights) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 2, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; - - Convolution2DAttributes attr; - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(1, 1); - attr.strides = HW(1, 1); - attr.dilations = HW(1, 1); - attr.weights.shape = OHWI(1, 2, 2, 2); - attr.weights.data = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}; - attr.bias.shape = Linear(1); - attr.bias.data = {0.0f}; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - ConvTexture operation = - CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 2, 1), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {28.0f, 18.0f, 22.0f, 13.0f})); - } - } -} - -TEST_F(OpenCLOperationTest, ConvTexture) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 2, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; - - Convolution2DAttributes attr; - attr.padding.prepended = HW(0, 0); - attr.padding.appended = HW(1, 1); - attr.strides = HW(1, 1); - attr.dilations = HW(1, 1); - attr.weights.shape = OHWI(2, 2, 2, 2); - attr.weights.data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, - 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; - attr.bias.shape = Linear(2); - attr.bias.data = {0.5f, -0.5f}; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - ConvTexture operation = - CreateConvTexture(creation_context_.GetDeviceInfo(), op_def, attr); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 2, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {168.5f, 391.5f, 80.5f, 223.5f, - 60.5f, 235.5f, 20.5f, 123.5f})); - } - } -} - -} // namespace -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc index d6e17ce2a86..521cbefd885 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.cc @@ -110,12 +110,12 @@ std::string ConverterToConvWeights::GetConverterToConvWeightsCode( return c; } -absl::Status ConverterToConvWeights::BindArguments() { +absl::Status ConverterToConvWeights::BindArguments(ArgumentsBinder* args) { float4 mask = GetMaskForLastPlane(src_[0]->Channels()); - RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x)); - RETURN_IF_ERROR(args_.SetFloat("mask_y", mask.y)); - RETURN_IF_ERROR(args_.SetFloat("mask_z", mask.z)); - return args_.SetFloat("mask_w", mask.w); + RETURN_IF_ERROR(args->SetFloat("mask_x", mask.x)); + RETURN_IF_ERROR(args->SetFloat("mask_y", mask.y)); + RETURN_IF_ERROR(args->SetFloat("mask_z", mask.z)); + return args->SetFloat("mask_w", mask.w); } int3 ConverterToConvWeights::GetGridSize() const { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h index fe814d296fa..3c7314ea6c9 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h @@ -31,7 +31,7 @@ class ConverterToConvWeights : public GPUOperation { public: ConverterToConvWeights(const OperationDef& definition, const ConvWeightsDescription& conv_weights_desc); - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/converter.cc b/tensorflow/lite/delegates/gpu/cl/kernels/converter.cc index d52efb43a08..77ac946637d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/converter.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/converter.cc @@ -46,9 +46,11 @@ class OpenClConverterImpl : public TensorObjectConverter { RETURN_IF_ERROR(kernel_.SetMemoryAuto(buffer_mem)); RETURN_IF_ERROR(args_.SetObjectRef("tensor", tensor)); RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); - int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(), - tensor->Slices()); - return queue_->DispatchImplicit(kernel_, grid, {16, 8, 1}); + const int3 grid = int3(tensor->Width() * tensor->Batch(), tensor->Height(), + tensor->Slices()); + const int3 work_group_size = {16, 8, 1}; + const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size); + return queue_->Dispatch(kernel_, work_groups_count, work_group_size); } Arguments args_; @@ -63,47 +65,168 @@ bool IsSupportedDataType(DataType type) { return type == 
DataType::FLOAT16 || type == DataType::FLOAT32; } -// Implements conversion from OpenCL-specific tensor layout to BHWC. -class FromTensorConverter : public OpenClConverterImpl { +bool IsBHWCOpenCLBuffer(const ObjectDef& def) { + return IsSupportedDataType(def.data_type) && + def.object_type == ObjectType::OPENCL_BUFFER && + def.data_layout == DataLayout::BHWC; +} + +bool IsOpenCLTensor(const ObjectDef& def) { + const bool is_buffer_tensor = def.object_type == ObjectType::OPENCL_BUFFER && + def.data_layout == DataLayout::DHWC4; + const bool is_image2d_tensor = + def.object_type == ObjectType::OPENCL_TEXTURE && + def.data_layout == DataLayout::HDWC4; + const bool is_image2d_array_tensor = + def.object_type == ObjectType::OPENCL_TEXTURE && + def.data_layout == DataLayout::DHWC4; + const bool is_single_image_tensor = + def.object_type == ObjectType::OPENCL_TEXTURE && + def.data_layout == DataLayout::BHWC; + return IsSupportedDataType(def.data_type) && + (is_buffer_tensor || is_image2d_tensor || is_image2d_array_tensor || + is_single_image_tensor); +} + +absl::Status GetOpenCLMemory(const TensorObject& obj, cl_mem* memory) { + auto texture = absl::get_if(&obj); + auto buffer = absl::get_if(&obj); + if (texture && texture->memobj) { + *memory = texture->memobj; + } else if (buffer && buffer->memobj) { + *memory = buffer->memobj; + } else { + return absl::InvalidArgumentError("Missing OpenCL object."); + } + return absl::OkStatus(); +} + +// Implements conversion from OpenCL tensor to another OpenCL tensor. +class TensorToTensorConverter : public OpenClConverterImpl { public: static bool IsSupported(const ObjectDef& input, const ObjectDef& output) { - return IsSupportedDataType(input.data_type) && - IsSupportedDataType(output.data_type) && - // Output is always Buffer/(BHWC|DHWC4) - output.object_type == ObjectType::OPENCL_BUFFER && - (output.data_layout == DataLayout::BHWC || - output.data_layout == DataLayout::DHWC4) && - // Texture2D/HDWC4 -> - ((input.object_type == ObjectType::OPENCL_TEXTURE && - input.data_layout == DataLayout::HDWC4) || - // SingleTextureArray/BHWC -> - (input.object_type == ObjectType::OPENCL_TEXTURE && - input.data_layout == DataLayout::BHWC) || - // TextureArray/DHWC4 -> - (input.object_type == ObjectType::OPENCL_TEXTURE && - input.data_layout == DataLayout::DHWC4) || - // Buffer/DHWC4 -> - (input.object_type == ObjectType::OPENCL_BUFFER && - input.data_layout == DataLayout::DHWC4)); + return IsOpenCLTensor(input) && IsOpenCLTensor(output); } - std::pair GetToDhwc4Kernel( - const TensorObjectDef& input_def, - const TensorObjectDef& output_def) const { - return std::make_pair("__global " + - ToCLDataType(output_def.object_def.data_type, 4) + - "* dst", - "dst[(d * args.tensor.Height() + y) * " - "args.tensor.Width() + x] = input;"); + absl::Status Init(const TensorObjectDef& input_def, + const TensorObjectDef& output_def, + Environment* environment) final { + src_tensor_descriptor_.layout = Layout::BHWC; + src_tensor_descriptor_.storage_type = ToTensorStorageType( + input_def.object_def.object_type, input_def.object_def.data_layout); + src_tensor_descriptor_.data_type = input_def.object_def.data_type; + args_.AddObjectRef( + "src_tensor", AccessType::READ, + absl::make_unique(src_tensor_descriptor_)); + + dst_tensor_descriptor_.layout = Layout::BHWC; + dst_tensor_descriptor_.storage_type = ToTensorStorageType( + output_def.object_def.object_type, output_def.object_def.data_layout); + dst_tensor_descriptor_.data_type = output_def.object_def.data_type; + 
args_.AddObjectRef( + "dst_tensor", AccessType::WRITE, + absl::make_unique(dst_tensor_descriptor_)); + + const bool need_fp16_support = + input_def.object_def.data_type == DataType::FLOAT16 || + output_def.object_def.data_type == DataType::FLOAT16; + const std::string out_data_type = + ToCLDataType(output_def.object_def.data_type); + std::string shader_src; + if (need_fp16_support) { + shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + } + shader_src += + R"(__kernel void tensor_to_tensor($0) { + int linear_id = get_global_id(0); + int x = linear_id / args.dst_tensor.Batch(); + int b = linear_id % args.dst_tensor.Batch(); + int y = get_global_id(1); + int d = get_global_id(2); + if (x >= args.dst_tensor.Width() || y >= args.dst_tensor.Height() || d >= args.dst_tensor.Slices()) return; +)"; + shader_src += " " + out_data_type + "4 input = args.src_tensor.Read<" + + out_data_type + ">(x, y, d, b);\n"; + shader_src += " args.dst_tensor.Write(input, x, y, d, b);\n}"; + queue_ = environment->queue(); + context_ = &environment->context(); + shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, + input_def.dimensions.w, input_def.dimensions.c); + RETURN_IF_ERROR( + args_.TransformToCLCode(environment->device().info_, {}, &shader_src)); + return environment->program_cache()->GetOrCreateCLKernel( + shader_src, "tensor_to_tensor", environment->context(), + environment->device(), &kernel_); } - std::pair GetToBhwcKernel( - const TensorObjectDef& input_def, - const TensorObjectDef& output_def) const { - return std::make_pair( - "__global " + ToCLDataType(output_def.object_def.data_type) + "* dst", - R"( - int c = d * 4; + absl::Status Convert(const TensorObject& input_obj, + const TensorObject& output_obj) override { + cl_mem in_memory; + RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory)); + cl_mem out_memory; + RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory)); + + Tensor src_tensor; + RETURN_IF_ERROR(CreateSharedTensor(*context_, in_memory, shape_, + src_tensor_descriptor_, &src_tensor)); + Tensor dst_tensor; + RETURN_IF_ERROR(CreateSharedTensor(*context_, out_memory, shape_, + dst_tensor_descriptor_, &dst_tensor)); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", &src_tensor)); + RETURN_IF_ERROR(args_.SetObjectRef("dst_tensor", &dst_tensor)); + RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); + const int3 grid = int3(dst_tensor.Width() * dst_tensor.Batch(), + dst_tensor.Height(), dst_tensor.Slices()); + const int3 work_group_size = {16, 8, 1}; + const int3 work_groups_count = GetWorkGroupsCount(grid, work_group_size); + return queue_->Dispatch(kernel_, work_groups_count, work_group_size); + } + + private: + TensorDescriptor src_tensor_descriptor_; + TensorDescriptor dst_tensor_descriptor_; +}; + +// Implements conversion from OpenCL-specific tensor layout to BHWC OpenCL +// buffer. 
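+// The generated tensor_to_bhwc kernel reads one FLT4 value per (x, y, slice)
+// position and scatters its channels into the destination BHWC buffer,
+// guarding the tail so channels beyond args.tensor.Channels() are not written.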
+class TensorToBHWCBufferConverter : public OpenClConverterImpl { + public: + static bool IsSupported(const ObjectDef& input, const ObjectDef& output) { + return IsOpenCLTensor(input) && IsBHWCOpenCLBuffer(output); + } + + absl::Status Init(const TensorObjectDef& input_def, + const TensorObjectDef& output_def, + Environment* environment) final { + TensorStorageType src_tensor_type = ToTensorStorageType( + input_def.object_def.object_type, input_def.object_def.data_layout); + tensor_descriptor_.layout = Layout::BHWC; + tensor_descriptor_.storage_type = src_tensor_type; + tensor_descriptor_.data_type = input_def.object_def.data_type; + args_.AddObjectRef("tensor", AccessType::READ, + absl::make_unique(tensor_descriptor_)); + + const bool need_fp16_support = + input_def.object_def.data_type == DataType::FLOAT16 || + output_def.object_def.data_type == DataType::FLOAT16; + std::string shader_src; + if (need_fp16_support) { + shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + } + const std::string out_data_type = + ToCLDataType(output_def.object_def.data_type); + shader_src += "__kernel void tensor_to_bhwc("; + shader_src += "__global " + out_data_type + "* dst, $0) {\n"; + shader_src += R"( int linear_id = get_global_id(0); + int x = linear_id / args.tensor.Batch(); + int b = linear_id % args.tensor.Batch(); + int y = get_global_id(1); + int d = get_global_id(2); + if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return; +)"; + shader_src += " " + out_data_type + "4 input = args.tensor.Read<" + + out_data_type + ">(x, y, d, b);\n"; + shader_src += R"( int c = d * 4; int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c; dst[index] = input.x; @@ -115,39 +238,8 @@ class FromTensorConverter : public OpenClConverterImpl { } if (c + 3 < args.tensor.Channels()) { dst[index + 3] = input.w; - })"); } - - absl::Status Init(const TensorObjectDef& input_def, - const TensorObjectDef& output_def, - Environment* environment) final { - auto params_kernel = output_def.object_def.data_layout == DataLayout::BHWC - ? 
GetToBhwcKernel(input_def, output_def) - : GetToDhwc4Kernel(input_def, output_def); - - TensorStorageType src_tensor_type = ToTensorStorageType( - input_def.object_def.object_type, input_def.object_def.data_layout); - tensor_descriptor_.layout = Layout::BHWC; - tensor_descriptor_.storage_type = src_tensor_type; - tensor_descriptor_.data_type = input_def.object_def.data_type; - args_.AddObjectRef("tensor", AccessType::READ, - absl::make_unique(tensor_descriptor_)); - std::string shader_src = - R"( -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -__kernel void from_tensor()" + - params_kernel.first + R"(, $0) { - int linear_id = get_global_id(0); - int x = linear_id / args.tensor.Batch(); - int b = linear_id % args.tensor.Batch(); - int y = get_global_id(1); - int d = get_global_id(2); - if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return; - )" + ToCLDataType(output_def.object_def.data_type, 4) + - " input = args.tensor.Read<" + - ToCLDataType(output_def.object_def.data_type) + ">(x, y, d, b);\n" + - params_kernel.second + "\n}"; +})"; queue_ = environment->queue(); context_ = &environment->context(); shape_ = BHWC(input_def.dimensions.b, input_def.dimensions.h, @@ -155,7 +247,7 @@ __kernel void from_tensor()" + RETURN_IF_ERROR( args_.TransformToCLCode(environment->device().info_, {}, &shader_src)); return environment->program_cache()->GetOrCreateCLKernel( - shader_src, "from_tensor", environment->context(), + shader_src, "tensor_to_bhwc", environment->context(), environment->device(), &kernel_); } @@ -164,64 +256,24 @@ __kernel void from_tensor()" + auto output = absl::get_if(&output_obj); if (!output || !output->memobj) { return absl::InvalidArgumentError( - "Missing output in from_tensor converter"); - } - cl_mem memory = nullptr; - auto input_texture = absl::get_if(&input_obj); - if (input_texture && input_texture->memobj) { - memory = input_texture->memobj; - } - auto input_buffer = absl::get_if(&input_obj); - if (input_buffer && input_buffer->memobj) { - memory = input_buffer->memobj; - } - if (!memory) { - return absl::InvalidArgumentError( - "Missing input in from_tensor converter"); + "Missing output in tensor_to_bhwc converter"); } + + cl_mem in_memory; + RETURN_IF_ERROR(GetOpenCLMemory(input_obj, &in_memory)); Tensor tensor; - RETURN_IF_ERROR(CreateSharedTensor(*context_, memory, shape_, + RETURN_IF_ERROR(CreateSharedTensor(*context_, in_memory, shape_, tensor_descriptor_, &tensor)); return DispatchKernel(output->memobj, &tensor); } }; -// Implements conversion from BHWC to OpenCL-specific tensor layout. -class ToTensorConverter : public OpenClConverterImpl { +// Implements conversion from BHWC OpenCL buffer to OpenCL-specific tensor +// layout. 
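+// The generated bhwc_to_tensor kernel gathers up to four consecutive channels
+// from the source BHWC buffer into one FLT4 value and writes it to the
+// destination tensor at (x, y, slice, batch); missing tail channels are padded.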
+class BHWCBufferToTensorConverter : public OpenClConverterImpl { public: static bool IsSupported(const ObjectDef& input, const ObjectDef& output) { - return IsSupportedDataType(input.data_type) && - IsSupportedDataType(output.data_type) && - // Input is always Buffer/BHWC - input.object_type == ObjectType::OPENCL_BUFFER && - (input.data_layout == DataLayout::BHWC || - input.data_layout == DataLayout::DHWC4) && - // -> Texture2D/HDWC4 - ((output.object_type == ObjectType::OPENCL_TEXTURE && - output.data_layout == DataLayout::HDWC4) || - // -> TextureArray/DHWC4 - (output.object_type == ObjectType::OPENCL_TEXTURE && - output.data_layout == DataLayout::DHWC4) || - // -> SingleTextureArray/BHWC - (output.object_type == ObjectType::OPENCL_TEXTURE && - output.data_layout == DataLayout::BHWC) || - // -> Buffer/DHWC4 - (output.object_type == ObjectType::OPENCL_BUFFER && - output.data_layout == DataLayout::DHWC4)); - } - - std::pair GetFromDhwc4Kernel( - const TensorObjectDef& input_def, - const TensorObjectDef& output_def) const { - return std::make_pair( - "__global " + ToCLDataType(input_def.object_def.data_type, 4) + "* src", - output_def.object_def.data_type == input_def.object_def.data_type - ? "result = src[(d * args.tensor.Height() + y) * " - "args.tensor.Width() + x];" - : "result = convert_" + - ToCLDataType(output_def.object_def.data_type, 4) + - "(src[(d * args.tensor.Height() + y) * args.tensor.Width() + " - "x]);"); + return IsBHWCOpenCLBuffer(input) && IsOpenCLTensor(output); } std::pair GetFromBhwcKernel( @@ -241,9 +293,8 @@ class ToTensorConverter : public OpenClConverterImpl { absl::Status Init(const TensorObjectDef& input_def, const TensorObjectDef& output_def, Environment* environment) final { - auto params_kernel = input_def.object_def.data_layout == DataLayout::BHWC - ? 
GetFromBhwcKernel(input_def, output_def) - : GetFromDhwc4Kernel(input_def, output_def); + auto params_kernel = GetFromBhwcKernel(input_def, output_def); + TensorStorageType dst_tensor_type = ToTensorStorageType( output_def.object_def.object_type, output_def.object_def.data_layout); tensor_descriptor_.layout = Layout::BHWC; @@ -251,23 +302,38 @@ class ToTensorConverter : public OpenClConverterImpl { tensor_descriptor_.data_type = output_def.object_def.data_type; args_.AddObjectRef("tensor", AccessType::WRITE, absl::make_unique(tensor_descriptor_)); - std::string shader_src = - R"( -#pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -__kernel void to_tensor()" + - params_kernel.first + R"(, $0) { - int linear_id = get_global_id(0); + const bool need_fp16_support = + input_def.object_def.data_type == DataType::FLOAT16 || + output_def.object_def.data_type == DataType::FLOAT16; + std::string shader_src; + if (need_fp16_support) { + shader_src += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + } + const std::string in_data_type = + ToCLDataType(input_def.object_def.data_type); + const std::string out_data_type = + ToCLDataType(output_def.object_def.data_type); + shader_src += "__kernel void bhwc_to_tensor("; + shader_src += "__global " + in_data_type + "* src, $0) {\n"; + + shader_src += R"( int linear_id = get_global_id(0); int x = linear_id / args.tensor.Batch(); int b = linear_id % args.tensor.Batch(); int y = get_global_id(1); int d = get_global_id(2); if (x >= args.tensor.Width() || y >= args.tensor.Height() || d >= args.tensor.Slices()) return; - )" + ToCLDataType(output_def.object_def.data_type, 4) + - " result;\n" + params_kernel.second + "\n " + - "args.tensor.Write(result, x, y, d, b);\n}"; +)"; + shader_src += " " + out_data_type + "4 result;\n"; + shader_src += R"( int c = d * 4; + int index = ((b * args.tensor.Height() + y) * args.tensor.Width() + x) * args.tensor.Channels() + c; + result.x = src[index]; + result.y = c + 1 < args.tensor.Channels() ? src[index + 1] : 1; + result.z = c + 2 < args.tensor.Channels() ? src[index + 2] : 2; + result.w = c + 3 < args.tensor.Channels() ? 
src[index + 3] : 3; +)"; + shader_src += " args.tensor.Write(result, x, y, d, b);\n}"; queue_ = environment->queue(); context_ = &environment->context(); shape_ = BHWC(output_def.dimensions.b, output_def.dimensions.h, @@ -275,31 +341,21 @@ __kernel void to_tensor()" + RETURN_IF_ERROR( args_.TransformToCLCode(environment->device().info_, {}, &shader_src)); return environment->program_cache()->GetOrCreateCLKernel( - shader_src, "to_tensor", environment->context(), environment->device(), - &kernel_); + shader_src, "bhwc_to_tensor", environment->context(), + environment->device(), &kernel_); } absl::Status Convert(const TensorObject& input_obj, const TensorObject& output_obj) override { auto input = absl::get_if(&input_obj); if (!input || !input->memobj) { - return absl::InvalidArgumentError("Missing input in to_tensor converter"); - } - cl_mem memory = nullptr; - auto output_texture = absl::get_if(&output_obj); - if (output_texture && output_texture->memobj) { - memory = output_texture->memobj; - } - auto output_buffer = absl::get_if(&output_obj); - if (output_buffer && output_buffer->memobj) { - memory = output_buffer->memobj; - } - if (!memory) { return absl::InvalidArgumentError( - "Missing output in to_tensor converter"); + "Missing input in bhwc_to_tensor converter"); } + cl_mem out_memory; + RETURN_IF_ERROR(GetOpenCLMemory(output_obj, &out_memory)); Tensor tensor; - RETURN_IF_ERROR(CreateSharedTensor(*context_, memory, shape_, + RETURN_IF_ERROR(CreateSharedTensor(*context_, out_memory, shape_, tensor_descriptor_, &tensor)); return DispatchKernel(input->memobj, &tensor); } @@ -465,9 +521,10 @@ class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder { const auto& output_def = output.object_def; return input.dimensions == output.dimensions && (TrivialCopier::IsSupported(input_def, output_def) || + TensorToTensorConverter::IsSupported(input_def, output_def) || CpuCopier::IsSupported(input_def, output_def) || - FromTensorConverter::IsSupported(input_def, output_def) || - ToTensorConverter::IsSupported(input_def, output_def)); + TensorToBHWCBufferConverter::IsSupported(input_def, output_def) || + BHWCBufferToTensorConverter::IsSupported(input_def, output_def)); } absl::Status MakeConverter( @@ -478,12 +535,16 @@ class OpenClTensorConverterBuilder : public TensorObjectConverterBuilder { const auto& output_def = output.object_def; if (TrivialCopier::IsSupported(input_def, output_def)) { impl = absl::make_unique(); + } else if (TensorToTensorConverter::IsSupported(input_def, output_def)) { + impl = absl::make_unique(); } else if (CpuCopier::IsSupported(input_def, output_def)) { impl = absl::make_unique(); - } else if (FromTensorConverter::IsSupported(input_def, output_def)) { - impl = absl::make_unique(); - } else if (ToTensorConverter::IsSupported(input_def, output_def)) { - impl = absl::make_unique(); + } else if (TensorToBHWCBufferConverter::IsSupported(input_def, + output_def)) { + impl = absl::make_unique(); + } else if (BHWCBufferToTensorConverter::IsSupported(input_def, + output_def)) { + impl = absl::make_unique(); } else { return absl::UnimplementedError("Unsupported conversion"); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc index 18522239a47..b2bf5216f8e 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.cc @@ -17,12 +17,14 @@ limitations under the License. 
#include #include +#include #include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { @@ -33,23 +35,23 @@ ConvolutionTransposed::ConvolutionTransposed( const OperationDef& definition, const ConvolutionTransposedAttributes& attr, const DeviceInfo& device_info) : GPUOperation(definition), - stride_(attr.stride.w, attr.stride.h), - block_size_(2, 2, 2) { + stride_(attr.stride.w, attr.stride.h, 1, 1), + block_size_(2, 2, 1, 2) { const bool weights_are_buffer = device_info.IsMali(); const bool is_f16 = definition.precision == CalculationsPrecision::F16; if (device_info.IsMali()) { if (device_info.mali_info.IsMidgard()) { - block_size_ = is_f16 ? int3(2, 1, 2) : int3(2, 1, 1); + block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1); } else { - block_size_ = is_f16 ? int3(2, 2, 2) : int3(2, 2, 1); + block_size_ = is_f16 ? int4(2, 2, 1, 2) : int4(2, 2, 1, 1); } } const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); if (dst_depth == 1 || dst_depth == 3) { if (!device_info.IsMali()) { - block_size_.y *= block_size_.z; + block_size_.y *= block_size_.w; } - block_size_.z = 1; + block_size_.w = 1; } args_.AddInt("stride_x", stride_.x); @@ -63,6 +65,45 @@ ConvolutionTransposed::ConvolutionTransposed( UploadWeights(attr.weights, weights_are_buffer); } +ConvolutionTransposed::ConvolutionTransposed( + const OperationDef& definition, + const ConvolutionTransposed3DAttributes& attr, + const DeviceInfo& device_info) + : GPUOperation(definition), + stride_(attr.stride.w, attr.stride.h, attr.stride.d, 1), + block_size_(2, 2, 1, 2) { + const bool weights_are_buffer = device_info.IsMali(); + const bool is_f16 = definition.precision == CalculationsPrecision::F16; + if (device_info.IsMali()) { + if (device_info.mali_info.IsMidgard()) { + block_size_ = is_f16 ? int4(2, 1, 1, 2) : int4(2, 1, 1, 1); + } else { + block_size_ = is_f16 ? 
int4(2, 2, 1, 2) : int4(2, 2, 1, 1); + } + } + const int dst_depth = DivideRoundUp(attr.weights.shape.o, 4); + if (dst_depth == 1 || dst_depth == 3) { + if (!device_info.IsMali()) { + block_size_.y *= block_size_.w; + } + block_size_.w = 1; + } + + args_.AddInt("stride_x", stride_.x); + args_.AddInt("stride_y", stride_.y); + args_.AddInt("stride_z", stride_.z); + args_.AddInt("padding_x", attr.padding.prepended.w); + args_.AddInt("padding_y", attr.padding.prepended.h); + args_.AddInt("padding_z", attr.padding.prepended.d); + args_.AddInt("kernel_size_x", attr.weights.shape.w); + args_.AddInt("kernel_size_y", attr.weights.shape.h); + args_.AddInt("kernel_size_z", attr.weights.shape.d); + args_.AddInt("grid_size_y"); + code_ = GenerateConvolutionTransposedCode(definition_, device_info, + weights_are_buffer, block_size_); + UploadWeights(attr.weights, weights_are_buffer); +} + ConvolutionTransposed::ConvolutionTransposed(ConvolutionTransposed&& operation) : GPUOperation(std::move(operation)), stride_(operation.stride_), @@ -80,50 +121,85 @@ ConvolutionTransposed& ConvolutionTransposed::operator=( std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( const OperationDef& op_def, const DeviceInfo& device_info, - bool weights_are_buffer, const int3& block_size) { + bool weights_are_buffer, const int4& block_size) { auto src_desc = op_def.src_tensors[0]; src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); AddSrcTensor("src_tensor", src_desc); - AddDstTensor("dst_tensor", op_def.dst_tensors[0]); - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - bool image_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER; - bool manual_clamp = - image_buffer || src_tensor_type == TensorStorageType::BUFFER; + const auto& src_def = op_def.src_tensors[0]; std::string c = GetCommonDefines(op_def.precision); - for (int z = 0; z < block_size.z; ++z) { + for (int s = 0; s < block_size.w; ++s) { const std::string f0 = - weights_are_buffer ? "weights_cache[" + std::to_string(z) + "].s0123" - : "f" + std::to_string(z * 4 + 0); + weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s0123" + : "f" + std::to_string(s * 4 + 0); const std::string f1 = - weights_are_buffer ? "weights_cache[" + std::to_string(z) + "].s4567" - : "f" + std::to_string(z * 4 + 1); + weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s4567" + : "f" + std::to_string(s * 4 + 1); const std::string f2 = - weights_are_buffer ? "weights_cache[" + std::to_string(z) + "].s89ab" - : "f" + std::to_string(z * 4 + 2); + weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s89ab" + : "f" + std::to_string(s * 4 + 2); const std::string f3 = - weights_are_buffer ? "weights_cache[" + std::to_string(z) + "].scdef" - : "f" + std::to_string(z * 4 + 3); + weights_are_buffer ? 
"weights_cache[" + std::to_string(s) + "].scdef" + : "f" + std::to_string(s * 4 + 3); switch (op_def.precision) { case CalculationsPrecision::F32: case CalculationsPrecision::F16: - c += "#define CONV" + std::to_string(z) + "(R, S) \\\n"; + c += "#define CONV" + std::to_string(s) + "(R, S) \\\n"; c += "R += S.x * " + f0 + "; \\\n"; c += "R += S.y * " + f1 + "; \\\n"; c += "R += S.z * " + f2 + "; \\\n"; c += "R += S.w * " + f3 + "; \n"; break; case CalculationsPrecision::F32_F16: - c += "#define CONV" + std::to_string(z) + "(R, S) \\\n"; + c += "#define CONV" + std::to_string(s) + "(R, S) \\\n"; c += "R += convert_float4(S.x * " + f0 + " + S.y * " + f1 + " + S.z * " + f2 + " + S.w * " + f3 + ");\n"; break; } } + auto generate_id = [&](const std::string& x, const std::string& y, + const std::string& z) { + std::string id; + if (src_def.HasAxis(Axis::WIDTH)) { + id += "_w" + x; + } + if (src_def.HasAxis(Axis::HEIGHT)) { + id += "_h" + y; + } + if (src_def.HasAxis(Axis::DEPTH)) { + id += "_d" + z; + } + return id; + }; + + auto generate_id_full = [&](const std::string& x, const std::string& y, + const std::string& z, const std::string& s) { + return generate_id(x, y, z) + "_s" + s; + }; + + auto generate_check = [&](const std::string& x, const std::string& y, + const std::string& z) { + std::string check; + const std::vector axes{Axis::WIDTH, Axis::HEIGHT, Axis::DEPTH}; + const std::vector names{"in_x", "in_y", "in_z"}; + const std::vector coords{x, y, z}; + for (int i = 0; i < axes.size(); ++i) { + const auto& axis = axes[i]; + if (src_def.HasAxis(axis) && !src_def.SupportsZeroClamp(axis) && + block_size[i] != 1) { + if (!check.empty()) { + check += " && "; + } + check += names[i] + coords[i]; + } + } + return check; + }; + switch (op_def.precision) { case CalculationsPrecision::F32: c += "#define FLT16 float16\n"; @@ -149,23 +225,48 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( c += " int ceil_x = dst_x / args.stride_x;\n"; c += " dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) + " + rem_x;\n"; - c += " int dst_y = get_global_id(1);\n"; + if (src_def.HasAxis(Axis::DEPTH)) { + c += " int linear_id_y = get_global_id(1);\n"; + c += " int dst_y = linear_id_y % args.grid_size_y;\n"; + c += " int dst_z = linear_id_y / args.grid_size_y;\n"; + c += " int rem_z = dst_z % args.stride_z;\n"; + c += " int ceil_z = dst_z / args.stride_z;\n"; + c += " dst_z = ceil_z * args.stride_z * " + std::to_string(block_size.z) + + " + rem_z;\n"; + c += " if (dst_z >= args.dst_tensor.Depth()) return;\n"; + } else { + c += " int dst_y = get_global_id(1);\n"; + } c += " int rem_y = dst_y % args.stride_y;\n"; c += " int ceil_y = dst_y / args.stride_y;\n"; c += " dst_y = ceil_y * args.stride_y * " + std::to_string(block_size.y) + " + rem_y;\n"; - c += " int dst_z = get_global_id(2) * " + std::to_string(block_size.z) + + c += " int dst_s = get_global_id(2) * " + std::to_string(block_size.w) + ";\n"; c += " if (dst_x >= args.dst_tensor.Width() || dst_y >= " - "args.dst_tensor.Height() || dst_z >= " + "args.dst_tensor.Height() || dst_s >= " "args.dst_tensor.Slices()) return;\n"; if (weights_are_buffer) { - c += " int f_base = dst_z * args.src_tensor.Slices() * args.kernel_size_x " - "* args.kernel_size_y;\n"; + c += " int f_base = dst_s * args.src_tensor.Slices() * args.kernel_size_x " + "* args.kernel_size_y"; + if (src_def.HasAxis(Axis::DEPTH)) { + c += " * args.kernel_size_z"; + } + c += ";\n"; } - for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) { - c += " 
ACCUM_FLT4 r" + std::to_string(i) + - " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; + for (int s = 0; s < block_size.w; ++s) { + const std::string sind = std::to_string(s); + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + c += " ACCUM_FLT4 r" + generate_id_full(xind, yind, zind, sind) + + " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; + } + } + } } c += " int kernel_first_dst_x = dst_x + args.padding_x;\n"; c += " int kernel_first_dst_y = dst_y + args.padding_y;\n"; @@ -181,21 +282,59 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( c += " int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - " "offset_y;\n"; - c += " int src_as_dst_y = src_y * args.stride_y;\n"; - c += " for (;src_as_dst_y > kernel_last_dst_y; src_y -= 1, src_as_dst_y -= " - "args.stride_y) {\n"; + if (src_def.HasAxis(Axis::DEPTH)) { + c += " int kernel_first_dst_z = dst_z + args.padding_z;\n"; + c += " int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n"; + c += " int offset_z = abs(args.padding_z);\n"; + c += " int offset_z_strided = offset_z * args.stride_z;\n"; + c += " int src_z = (kernel_first_dst_z + offset_z_strided) / " + "args.stride_z - offset_z;\n"; + c += " int src_as_dst_z = src_z * args.stride_z;\n"; + c += + " for (;src_as_dst_z > kernel_last_dst_z; src_z -= 1, src_as_dst_z -= " + "args.stride_z) {\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zindex = std::to_string(z); + c += " int sz" + zindex + " = src_z + " + zindex + ";\n"; + if (!src_def.SupportsZeroClamp(Axis::DEPTH)) { + c += " bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" + + zindex + " < args.src_tensor.Depth();\n"; + if (!src_def.CanReadOutOfBorder(Axis::DEPTH)) { + c += " sz" + zindex + " = clamp(sz" + zindex + + ", 0, args.src_tensor.Depth() - 1);\n"; + } + } + } + if (block_size.z == 1 && !src_def.SupportsZeroClamp(Axis::DEPTH)) { + c += " if (!in_z0) continue;\n"; + } + c += " int kernel_z = kernel_first_dst_z - src_as_dst_z;\n"; + c += " int src_as_dst_y = src_y * args.stride_y;\n"; + c += " int src_y_copy = src_y;\n"; + c += " for (;src_as_dst_y > kernel_last_dst_y; src_y_copy -= 1, " + "src_as_dst_y -= args.stride_y) {\n"; + } else { + c += " int src_as_dst_y = src_y * args.stride_y;\n"; + c += " for (;src_as_dst_y > kernel_last_dst_y; src_y -= 1, src_as_dst_y " + "-= args.stride_y) {\n"; + } for (int y = 0; y < block_size.y; ++y) { const std::string yindex = std::to_string(y); - c += " int sy" + yindex + " = src_y + " + yindex + ";\n"; - if (manual_clamp) { + const std::string src_y = + src_def.HasAxis(Axis::DEPTH) ? 
"src_y_copy" : "src_y"; + c += " int sy" + yindex + " = " + src_y + " + " + yindex + ";\n"; + if (!src_def.SupportsZeroClamp(Axis::HEIGHT)) { c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" + yindex + " < args.src_tensor.Height();\n"; - if (!image_buffer) { + if (!src_def.CanReadOutOfBorder(Axis::HEIGHT)) { c += " sy" + yindex + " = clamp(sy" + yindex + ", 0, args.src_tensor.Height() - 1);\n"; } } } + if (block_size.y == 1 && !src_def.SupportsZeroClamp(Axis::HEIGHT)) { + c += " if (!in_y0) continue;\n"; + } c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n"; c += " int src_as_dst_x = src_x * args.stride_x;\n"; c += " int src_x_copy = src_x;\n"; @@ -205,132 +344,196 @@ std::string ConvolutionTransposed::GenerateConvolutionTransposedCode( for (int x = 0; x < block_size.x; ++x) { const std::string xindex = std::to_string(x); c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n"; - if (manual_clamp) { + if (!src_def.SupportsZeroClamp(Axis::WIDTH)) { c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" + xindex + " < args.src_tensor.Width();\n"; - if (!image_buffer) { + if (!src_def.CanReadOutOfBorder(Axis::WIDTH)) { c += " sx" + xindex + " = clamp(sx" + xindex + ", 0, args.src_tensor.Width() - 1);\n"; } } } - for (int y = 0; y < block_size.y; ++y) { - const std::string yindex = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xindex = std::to_string(x); - const std::string id = std::to_string(y * block_size.x + x); - c += " args.src_tensor.GetAddress(addr_" + id + ", sx" + xindex + - ", sy" + yindex + ", 0);\n"; - if (image_buffer) { - c += " addr_" + id + " = select(-1, addr_" + id + ", (in_x" + - xindex + " && in_y" + yindex + "));\n"; - c += absl::Substitute( - " int dz_$0 = select(0, args.src_tensor.SliceStride(), " - "(in_x$1 && in_y$2));\n", - y * block_size.x + x, x, y); + if (block_size.x == 1 && !src_def.SupportsZeroClamp(Axis::WIDTH)) { + c += " if (!in_x0) continue;\n"; + } + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string id = generate_id(xind, yind, zind); + const std::string check = generate_check(xind, yind, zind); + std::string coords = "sx" + xind + ", sy" + yind; + if (src_def.HasAxis(Axis::DEPTH)) { + coords += ", sz" + zind; + } + if (src_def.IsLinear()) { + c += " args.src_tensor.GetAddress(addr" + id + ", " + coords + + ", 0);\n"; + } + if (src_def.ReturnsZeroForNegOneRead()) { + c += " addr" + id + " = select(-1, addr" + id + ", (" + check + + "));\n"; + c += " int ds" + id + + " = select(0, args.src_tensor.SliceStride(), (" + check + + "));\n"; + } } } } - if (src_tensor_type == TensorStorageType::BUFFER) { - c += " int dz = args.src_tensor.SliceStride();\n"; - } - if (block_size.x == 1 && block_size.y == 1 && manual_clamp) { - c += " if (!in_x0 || !in_y0) continue;\n"; + if (src_def.storage_type == TensorStorageType::BUFFER) { + c += " int ds = args.src_tensor.SliceStride();\n"; } c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n"; - c += " int kernel_index = kernel_y * args.kernel_size_x + kernel_x;\n"; + if (src_def.HasAxis(Axis::DEPTH)) { + c += " int kernel_index = (kernel_z * args.kernel_size_y + kernel_y) " + "* args.kernel_size_x + kernel_x;\n"; + } else { + c += " int kernel_index = kernel_y * args.kernel_size_x + kernel_x;\n"; + } if 
(weights_are_buffer) { c += " int f_offset = f_base + kernel_index * " "args.src_tensor.Slices() * " + - std::to_string(block_size.z) + ";\n"; + std::to_string(block_size.w) + ";\n"; } else { c += " int x_c = kernel_index * args.src_tensor.Slices();\n"; } c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n"; const bool conditional_read = device_info.IsMali(); - for (int y = 0; y < block_size.y; ++y) { - const std::string yindex = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xindex = std::to_string(x); - const std::string id = std::to_string(y * block_size.x + x); - if (image_buffer) { - c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + - "); addr_" + id + " += dz_" + id + ";\n"; - } else if (manual_clamp) { - if (conditional_read) { - c += " FLT4 src" + id + " = in_x" + xindex + " && in_y" + - yindex + " ? args.src_tensor.Read(addr_" + id + - ") : (FLT4)(0.0f); addr_" + id + " += dz;\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string id = generate_id(xind, yind, zind); + std::string address; + if (src_def.IsLinear()) { + address = "addr" + id; } else { - c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + - ") * (FLT)(in_x" + xindex + " && in_y" + yindex + "); addr_" + - id + " += dz;\n"; + address = "sx" + xind + ", sy" + yind; + if (src_def.HasAxis(Axis::DEPTH)) { + address += ", sz" + zind; + } + address += ", s"; + } + if (src_def.ReturnsZeroForNegOneRead()) { + c += " FLT4 src" + id + " = args.src_tensor.Read(" + address + + "); " + address + " += ds" + id + ";\n"; + } else { + const std::string check = generate_check(xind, yind, zind); + if (!check.empty()) { + if (conditional_read) { + c += " FLT4 src" + id + " = " + check + + " ? 
args.src_tensor.Read(" + address + ") : (FLT4)(0.0f);\n"; + } else { + c += " FLT4 src" + id + " = args.src_tensor.Read(" + + address + ") * (FLT)(" + check + ");\n"; + } + } else { + c += " FLT4 src" + id + " = args.src_tensor.Read(" + + address + ");\n"; + } + if (src_def.IsLinear()) { + c += " addr" + id + " += ds;\n"; + } } - } else { - c += " FLT4 src" + id + " = args.src_tensor.Read(sx" + xindex + - ", sy" + yindex + ", s);\n"; } } } if (weights_are_buffer) { c += " __global FLT16* weights_cache = " "args.weights.GetPtr(f_offset);\n"; - c += " f_offset += " + std::to_string(block_size.z) + ";\n"; + c += " f_offset += " + std::to_string(block_size.w) + ";\n"; } else { - for (int z = 0; z < block_size.z; ++z) { + for (int s = 0; s < block_size.w; ++s) { c += absl::Substitute( - R"( FLT4 f$1 = args.weights0.Read(dst_z + $0, x_c); - FLT4 f$2 = args.weights1.Read(dst_z + $0, x_c); - FLT4 f$3 = args.weights2.Read(dst_z + $0, x_c); - FLT4 f$4 = args.weights3.Read(dst_z + $0, x_c); + R"( FLT4 f$1 = args.weights0.Read(dst_s + $0, x_c); + FLT4 f$2 = args.weights1.Read(dst_s + $0, x_c); + FLT4 f$3 = args.weights2.Read(dst_s + $0, x_c); + FLT4 f$4 = args.weights3.Read(dst_s + $0, x_c); )", - z, z * 4 + 0, z * 4 + 1, z * 4 + 2, z * 4 + 3); + s, s * 4 + 0, s * 4 + 1, s * 4 + 2, s * 4 + 3); } c += " x_c++;\n"; } - for (int z = 0; z < block_size.z; ++z) { - for (int i = 0; i < block_size.x * block_size.y; ++i) { - c += " CONV" + std::to_string(z) + "(r" + - std::to_string(i + z * block_size.x * block_size.y) + ", src" + - std::to_string(i) + ");\n"; + for (int s = 0; s < block_size.w; ++s) { + const std::string sind = std::to_string(s); + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string id = generate_id(xind, yind, zind); + const std::string full_id = generate_id_full(xind, yind, zind, sind); + c += " CONV" + sind + "(r" + full_id + ", src" + id + ");\n"; + } + } } } c += " }\n"; c += " }\n"; c += " }\n"; - for (int z = 0; z < block_size.z; ++z) { - c += " if (dst_z < args.dst_tensor.Slices()) {\n"; - c += " FLT4 bias_val = args.biases.Read(dst_z);\n"; - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - const std::string id = - std::to_string((z * block_size.y + y) * block_size.x + x); - c += " {\n"; - c += " int xc = dst_x + args.stride_x * " + std::to_string(x) + - ";\n"; - c += " int yc = dst_y + args.stride_y * " + std::to_string(y) + - ";\n"; - c += " if (xc < args.dst_tensor.Width() && yc < " - "args.dst_tensor.Height()) {\n"; - c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; - c += " args.dst_tensor.Write(res, xc, yc, dst_z);\n"; - c += " }\n"; - c += " }\n"; + if (src_def.HasAxis(Axis::DEPTH)) { + c += " }\n"; + } + for (int s = 0; s < block_size.w; ++s) { + const std::string sind = std::to_string(s); + c += " if (dst_s < args.dst_tensor.Slices()) {\n"; + c += " FLT4 bias_val = args.biases.Read(dst_s);\n"; + for (int z = 0; z < block_size.z; ++z) { + const std::string zind = std::to_string(z); + for (int y = 0; y < block_size.y; ++y) { + const std::string yind = std::to_string(y); + for (int x = 0; x < block_size.x; ++x) { + const std::string xind = std::to_string(x); + const std::string id = generate_id_full(xind, yind, zind, sind); + std::string checks = + "xc < args.dst_tensor.Width() && yc < 
args.dst_tensor.Height()"; + std::string coords = "xc, yc"; + c += " {\n"; + c += " int xc = dst_x + args.stride_x * " + xind + ";\n"; + c += " int yc = dst_y + args.stride_y * " + yind + ";\n"; + if (src_def.HasAxis(Axis::DEPTH)) { + c += " int zc = dst_z + args.stride_z * " + zind + ";\n"; + checks += " && zc < args.dst_tensor.Depth()"; + coords += ", zc"; + } + c += " if (" + checks + ") {\n"; + c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; + c += " args.dst_tensor.Write(res, " + coords + ", dst_s);\n"; + c += " }\n"; + c += " }\n"; + } } } c += " }\n"; - c += " dst_z++;\n"; + c += " dst_s++;\n"; } c += "}\n"; return c; } +absl::Status ConvolutionTransposed::BindArguments(ArgumentsBinder* args) { + if (definition_.src_tensors[0].HasAxis(Axis::DEPTH)) { + const int aligned_h = + AlignByN(dst_[0]->Height(), stride_.y * block_size_.y); + RETURN_IF_ERROR( + args->SetInt("grid_size_y", DivideRoundUp(aligned_h, block_size_.y))); + } + return absl::OkStatus(); +} + int3 ConvolutionTransposed::GetGridSize() const { const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x); const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y); + const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z); const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch(); - const int grid_y = DivideRoundUp(aligned_h, block_size_.y); - const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.z); + const int grid_y = DivideRoundUp(aligned_h, block_size_.y) * + DivideRoundUp(aligned_d, block_size_.z); + const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w); return int3(grid_x, grid_y, grid_z); } @@ -356,6 +559,21 @@ ConvolutionTransposed CreateConvolutionTransposed( return result; } +ConvolutionTransposed CreateConvolutionTransposed3D( + const DeviceInfo& device_info, const OperationDef& definition, + const ConvolutionTransposed3DAttributes& attr) { + ConvolutionTransposed result(definition, attr, device_info); + + TensorLinearDescriptor desc; + desc.storage_type = + DeduceLinearStorageType(definition.GetPrimaryStorageType()); + desc.element_type = definition.GetDataType(); + desc.UploadLinearData(attr.bias); + result.args_.AddObject( + "biases", absl::make_unique(std::move(desc))); + return result; +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h index 7939236409e..5aa86f33e5a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed.h @@ -16,10 +16,12 @@ limitations under the License. 
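// Editor's note: a minimal host-side sketch (not part of the patch, invented
// sizes) of the grid folding introduced above. When the source has a DEPTH
// axis, GetGridSize() multiplies the aligned-height tile count by the
// aligned-depth tile count, and BindArguments() publishes the height tile
// count as "grid_size_y" so the kernel can split get_global_id(1) back into
// dst_y and dst_z. DivUp/AlignUp below are local stand-ins for the real
// DivideRoundUp/AlignByN helpers.
#include <cstdio>

namespace {
int DivUp(int n, int d) { return (n + d - 1) / d; }
int AlignUp(int n, int a) { return DivUp(n, a) * a; }
}  // namespace

int main() {
  // Hypothetical destination shape and tuning parameters.
  const int dst_h = 17, dst_d = 5;
  const int stride_y = 2, stride_z = 2;
  const int block_y = 2, block_z = 1;

  const int aligned_h = AlignUp(dst_h, stride_y * block_y);    // 20
  const int aligned_d = AlignUp(dst_d, stride_z * block_z);    // 6
  const int grid_size_y = DivUp(aligned_h, block_y);           // 10 (bound as "grid_size_y")
  const int grid_y = grid_size_y * DivUp(aligned_d, block_z);  // 10 * 6 = 60 items in dim 1

  // Device side, per work item: undo the folding (the real kernel then applies
  // the usual rem/ceil stride expansion to dst_y and dst_z).
  for (int linear_id_y = 0; linear_id_y < grid_y; ++linear_id_y) {
    const int dst_y = linear_id_y % grid_size_y;
    const int dst_z = linear_id_y / grid_size_y;
    if (linear_id_y < 3) std::printf("id=%d -> y=%d z=%d\n", linear_id_y, dst_y, dst_z);
  }
  return 0;
}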
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_H_ +#include #include #include "tensorflow/lite/delegates/gpu/cl/buffer.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" #include "tensorflow/lite/delegates/gpu/cl/tensor.h" #include "tensorflow/lite/delegates/gpu/cl/texture2d.h" @@ -42,6 +44,7 @@ class ConvolutionTransposed : public GPUOperation { TuningType tuning_type, const DeviceInfo& device_info, const KernelInfo& kernel_info, std::vector* work_groups) const override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only @@ -54,30 +57,37 @@ class ConvolutionTransposed : public GPUOperation { friend ConvolutionTransposed CreateConvolutionTransposed( const DeviceInfo& device_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); - explicit ConvolutionTransposed(const OperationDef& definition, - const ConvolutionTransposedAttributes& attr, - const DeviceInfo& device_info); + friend ConvolutionTransposed CreateConvolutionTransposed3D( + const DeviceInfo& device_info, const OperationDef& definition, + const ConvolutionTransposed3DAttributes& attr); + ConvolutionTransposed(const OperationDef& definition, + const ConvolutionTransposedAttributes& attr, + const DeviceInfo& device_info); + ConvolutionTransposed(const OperationDef& definition, + const ConvolutionTransposed3DAttributes& attr, + const DeviceInfo& device_info); + template void UploadWeights(const tflite::gpu::Tensor& weights, bool weights_are_buffer); - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst, bool weights_are_buffer); + template + void UploadWeights(const tflite::gpu::Tensor& weights, + bool weights_are_buffer); std::string GenerateConvolutionTransposedCode(const OperationDef& op_def, const DeviceInfo& device_info, bool weights_are_buffer, - const int3& block_size); - int2 stride_; - int3 block_size_ = int3(1, 1, 1); + const int4& block_size); + int4 stride_; + int4 block_size_ = int4(1, 1, 1, 1); // WHDS }; template void ConvolutionTransposed::UploadWeights( const tflite::gpu::Tensor& weights, bool weights_are_buffer) { const int dst_depth = - AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z); + AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w); const int src_depth = DivideRoundUp(weights.shape.i, 4); const int kernel_x = weights.shape.w; const int kernel_y = weights.shape.h; @@ -90,12 +100,22 @@ void ConvolutionTransposed::UploadWeights( if (f32_weights) { float4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count), - weights_are_buffer); + if (weights_are_buffer) { + RearrangeWeightsToOHWIOGroupI4O4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } } else { half4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count), - weights_are_buffer); + if (weights_are_buffer) { + RearrangeWeightsToOHWIOGroupI4O4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4HWIOOGroupO4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } } if (weights_are_buffer) { @@ -107,90 
+127,80 @@ void ConvolutionTransposed::UploadWeights( args_.AddObject("weights", absl::make_unique(std::move(desc))); } else { - int sub_size = float4_size * elements_count / 4; - Texture2DDescriptor desc0; - desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc0.size = int2(dst_depth, src_depth * kernel_x * kernel_y); - desc0.data.resize(sub_size); - memcpy(desc0.data.data(), data.data(), sub_size); - args_.AddObject("weights0", - absl::make_unique(std::move(desc0))); - - Texture2DDescriptor desc1; - desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc1.size = int2(dst_depth, src_depth * kernel_x * kernel_y); - desc1.data.resize(sub_size); - memcpy(desc1.data.data(), data.data() + sub_size, sub_size); - args_.AddObject("weights1", - absl::make_unique(std::move(desc1))); - - Texture2DDescriptor desc2; - desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc2.size = int2(dst_depth, src_depth * kernel_x * kernel_y); - desc2.data.resize(sub_size); - memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size); - args_.AddObject("weights2", - absl::make_unique(std::move(desc2))); - - Texture2DDescriptor desc3; - desc3.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc3.size = int2(dst_depth, src_depth * kernel_x * kernel_y); - desc3.data.resize(sub_size); - memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size); - args_.AddObject("weights3", - absl::make_unique(std::move(desc3))); + int texture_width = dst_depth; + int texture_height = src_depth * kernel_x * kernel_y; + int sub_size = float4_size * texture_width * texture_height; + for (int i = 0; i < 4; ++i) { + Texture2DDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.size = int2(texture_width, texture_height); + desc.data.resize(sub_size); + memcpy(desc.data.data(), data.data() + sub_size * i, sub_size); + const std::string name = "weights" + std::to_string(i); + args_.AddObject(name, + absl::make_unique(std::move(desc))); + } } } -template -void ConvolutionTransposed::RearrangeWeightsData( - const tflite::gpu::Tensor& weights, absl::Span dst, - bool weights_are_buffer) { +template +void ConvolutionTransposed::UploadWeights( + const tflite::gpu::Tensor& weights, bool weights_are_buffer) { const int dst_depth = - AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z); + AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w); const int src_depth = DivideRoundUp(weights.shape.i, 4); const int kernel_x = weights.shape.w; const int kernel_y = weights.shape.h; - int texture_width = dst_depth; - int texture_height = src_depth * kernel_x * kernel_y; + const int kernel_z = weights.shape.d; - int counter = 0; - for (int d = 0; d < dst_depth / block_size_.z; ++d) { - for (int y = 0; y < kernel_y; ++y) { - for (int x = 0; x < kernel_x; ++x) { - for (int s = 0; s < src_depth; ++s) { - for (int sub_d = 0; sub_d < block_size_.z; ++sub_d) { - T filters[4]; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - const int s_ch = s * 4 + j; - const int d_ch = (d * block_size_.z + sub_d) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) { - const int f_index = - weights.shape.LinearIndex({d_ch, y, x, s_ch}); - filters[j][i] = weights.data[f_index]; - } else { - filters[j][i] = 0.0f; - } - } - } - if (weights_are_buffer) { - dst[counter++] = filters[0]; - dst[counter++] = filters[1]; - dst[counter++] = filters[2]; - dst[counter++] = filters[3]; - } else { - int 
x_coord = d * block_size_.z + sub_d; - int y_coord = (y * kernel_x + x) * src_depth + s; - int offset = y_coord * dst_depth + x_coord; - dst[offset + texture_width * texture_height * 0] = filters[0]; - dst[offset + texture_width * texture_height * 1] = filters[1]; - dst[offset + texture_width * texture_height * 2] = filters[2]; - dst[offset + texture_width * texture_height * 3] = filters[3]; - } - } - } - } + const int elements_count = + kernel_x * kernel_y * kernel_z * src_depth * dst_depth * 4; + const bool f32_weights = definition_.precision == CalculationsPrecision::F32; + + const int float4_size = f32_weights ? 16 : 8; + std::vector data(float4_size * elements_count); + + if (f32_weights) { + float4* ptr = reinterpret_cast(data.data()); + if (weights_are_buffer) { + RearrangeWeightsToODHWIOGroupI4O4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4DHWIOOGroupO4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } + } else { + half4* ptr = reinterpret_cast(data.data()); + if (weights_are_buffer) { + RearrangeWeightsToODHWIOGroupI4O4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } else { + RearrangeWeightsToI4DHWIOOGroupO4(weights, block_size_.w, + absl::MakeSpan(ptr, elements_count)); + } + } + + if (weights_are_buffer) { + BufferDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 16; + desc.size = float4_size * elements_count; + desc.data = std::move(data); + args_.AddObject("weights", + absl::make_unique(std::move(desc))); + } else { + int texture_width = dst_depth; + int texture_height = src_depth * kernel_x * kernel_y * kernel_z; + int sub_size = float4_size * texture_width * texture_height; + for (int i = 0; i < 4; ++i) { + Texture2DDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.size = int2(texture_width, texture_height); + desc.data.resize(sub_size); + memcpy(desc.data.data(), data.data() + sub_size * i, sub_size); + const std::string name = "weights" + std::to_string(i); + args_.AddObject(name, + absl::make_unique(std::move(desc))); } } } @@ -199,6 +209,10 @@ ConvolutionTransposed CreateConvolutionTransposed( const DeviceInfo& device_info, const OperationDef& definition, const ConvolutionTransposedAttributes& attr); +ConvolutionTransposed CreateConvolutionTransposed3D( + const DeviceInfo& device_info, const OperationDef& definition, + const ConvolutionTransposed3DAttributes& attr); + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc deleted file mode 100644 index b2a85a89ef0..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.cc +++ /dev/null @@ -1,402 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
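// Editor's note: an illustrative sketch (not part of the patch) of the texture
// upload path in UploadWeights() above. The rearranged weight stream stores
// four planes back to back -- one per lane of the O4 output group -- and each
// plane becomes its own dst_depth x (src_depth * kernel_x * kernel_y [* kernel_z])
// texture registered as weights0..weights3. Types and sizes here are invented;
// only the slicing arithmetic mirrors the patch.
#include <cstring>
#include <vector>

struct PlaneDesc {  // hypothetical stand-in for Texture2DDescriptor
  int width = 0, height = 0;
  std::vector<unsigned char> data;
};

std::vector<PlaneDesc> SplitIntoFourPlanes(const std::vector<unsigned char>& packed,
                                           int texture_width, int texture_height,
                                           int float4_size) {
  const int sub_size = float4_size * texture_width * texture_height;
  std::vector<PlaneDesc> planes(4);
  for (int i = 0; i < 4; ++i) {
    planes[i].width = texture_width;
    planes[i].height = texture_height;
    planes[i].data.resize(sub_size);
    // Plane i starts at byte offset sub_size * i, exactly as in the memcpy above.
    std::memcpy(planes[i].data.data(), packed.data() + sub_size * i, sub_size);
  }
  return planes;
}

int main() {
  const int tw = 8, th = 36, float4_size = 16;  // invented texture geometry
  std::vector<unsigned char> packed(4 * float4_size * tw * th, 0);
  return SplitIntoFourPlanes(packed, tw, th, float4_size).size() == 4 ? 0 : 1;
}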
-==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h" - -#include -#include - -#include "absl/strings/substitute.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" - -namespace tflite { -namespace gpu { -namespace cl { - -ConvolutionTransposed3D::ConvolutionTransposed3D( - const OperationDef& definition, - const ConvolutionTransposed3DAttributes& attr, - const DeviceInfo& device_info) - : GPUOperation(definition), - stride_(attr.stride.w, attr.stride.h, attr.stride.d), - block_size_(2, 2, 1, 2) { - bool weights_are_buffer = device_info.IsMali(); - args_.AddInt("stride_x", stride_.x); - args_.AddInt("stride_y", stride_.y); - args_.AddInt("stride_z", stride_.z); - args_.AddInt("padding_x", attr.padding.prepended.w); - args_.AddInt("padding_y", attr.padding.prepended.h); - args_.AddInt("padding_z", attr.padding.prepended.d); - args_.AddInt("kernel_size_x", attr.weights.shape.w); - args_.AddInt("kernel_size_y", attr.weights.shape.h); - args_.AddInt("kernel_size_z", attr.weights.shape.d); - args_.AddInt("grid_size_s"); - code_ = GenerateConvolutionTransposed3DCode(definition_, weights_are_buffer, - block_size_); - UploadWeights(attr.weights, weights_are_buffer); - if (device_info.IsPowerVR() && block_size_.y != 1) { - bool is_texture3d = definition_.src_tensors[0].storage_type == - TensorStorageType::TEXTURE_3D; - bool is_texture_array = definition_.src_tensors[0].storage_type == - TensorStorageType::TEXTURE_ARRAY; - if (is_texture3d || is_texture_array) { - compiler_options_.push_back(CompilerOptions::CL_OPT_DISABLE); - } - } -} - -ConvolutionTransposed3D::ConvolutionTransposed3D( - ConvolutionTransposed3D&& operation) - : GPUOperation(std::move(operation)), - stride_(operation.stride_), - block_size_(operation.block_size_) {} - -ConvolutionTransposed3D& ConvolutionTransposed3D::operator=( - ConvolutionTransposed3D&& operation) { - if (this != &operation) { - std::swap(stride_, operation.stride_); - std::swap(block_size_, operation.block_size_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -std::string ConvolutionTransposed3D::GenerateConvolutionTransposed3DCode( - const OperationDef& op_def, bool weights_are_buffer, - const int4& block_size) { - auto src_desc = op_def.src_tensors[0]; - src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); - AddSrcTensor("src_tensor", src_desc); - - AddDstTensor("dst_tensor", op_def.dst_tensors[0]); - - const auto src_tensor_type = op_def.src_tensors[0].storage_type; - bool image_buffer = src_tensor_type == TensorStorageType::IMAGE_BUFFER; - bool manual_clamp = - image_buffer || src_tensor_type == TensorStorageType::BUFFER; - - std::string c = GetCommonDefines(op_def.precision); - - for (int s = 0; s < block_size.w; ++s) { - const std::string f0 = - weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s0123" - : "f" + std::to_string(s * 4 + 0); - const std::string f1 = - weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s4567" - : "f" + std::to_string(s * 4 + 1); - const std::string f2 = - weights_are_buffer ? "weights_cache[" + std::to_string(s) + "].s89ab" - : "f" + std::to_string(s * 4 + 2); - const std::string f3 = - weights_are_buffer ? 
"weights_cache[" + std::to_string(s) + "].scdef" - : "f" + std::to_string(s * 4 + 3); - switch (op_def.precision) { - case CalculationsPrecision::F32: - case CalculationsPrecision::F16: - c += "#define CONV" + std::to_string(s) + "(R, S) \\\n"; - c += "R += S.x * " + f0 + "; \\\n"; - c += "R += S.y * " + f1 + "; \\\n"; - c += "R += S.z * " + f2 + "; \\\n"; - c += "R += S.w * " + f3 + "; \n"; - break; - case CalculationsPrecision::F32_F16: - c += "#define CONV" + std::to_string(s) + "(R, S) \\\n"; - c += "R += convert_float4(S.x * " + f0 + " + S.y * " + f1 + - " + S.z * " + f2 + " + S.w * " + f3 + ");\n"; - break; - } - } - - switch (op_def.precision) { - case CalculationsPrecision::F32: - c += "#define FLT16 float16\n"; - break; - case CalculationsPrecision::F32_F16: - case CalculationsPrecision::F16: - c += "#define FLT16 half16\n"; - break; - } - - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - if (op_def.IsBatchSupported()) { - c += " int linear_id = get_global_id(0);\n"; - c += " int dst_x = (linear_id / args.dst_tensor.Batch());\n"; - c += " int B = linear_id % args.dst_tensor.Batch();\n"; - c += " args.dst_tensor.SetBatchRef(B);\n"; - c += " args.src_tensor.SetBatchRef(B);\n"; - } else { - c += " int dst_x = get_global_id(0);\n"; - } - c += " int rem_x = dst_x % args.stride_x;\n"; - c += " int ceil_x = dst_x / args.stride_x;\n"; - c += " dst_x = ceil_x * args.stride_x * " + std::to_string(block_size.x) + - " + rem_x;\n"; - c += " int dst_y = get_global_id(1);\n"; - c += " int rem_y = dst_y % args.stride_y;\n"; - c += " int ceil_y = dst_y / args.stride_y;\n"; - c += " dst_y = ceil_y * args.stride_y * " + std::to_string(block_size.y) + - " + rem_y;\n"; - c += " int linear_id_z = get_global_id(2);\n"; - c += " int S = (linear_id_z % args.grid_size_s) * " + - std::to_string(block_size.w) + ";\n"; - c += " int dst_z = linear_id_z / args.grid_size_s;\n"; - c += " int rem_z = dst_z % args.stride_z;\n"; - c += " int ceil_z = dst_z / args.stride_z;\n"; - c += " dst_z = ceil_z * args.stride_z * " + std::to_string(block_size.z) + - " + rem_z;\n"; - c += " if (dst_x >= args.dst_tensor.Width() || dst_y >= " - "args.dst_tensor.Height() || dst_z >= " - "args.dst_tensor.Depth()) return;\n"; - if (weights_are_buffer) { - c += " int f_base = S * args.src_tensor.Slices() * args.kernel_size_x * " - "args.kernel_size_y * " - "args.kernel_size_z;\n"; - } - for (int i = 0; i < block_size.x * block_size.y * block_size.z * block_size.w; - ++i) { - c += " ACCUM_FLT4 r" + std::to_string(i) + - " = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; - } - c += " int kernel_first_dst_x = dst_x + args.padding_x;\n"; - c += " int kernel_first_dst_y = dst_y + args.padding_y;\n"; - c += " int kernel_first_dst_z = dst_z + args.padding_z;\n"; - c += " int kernel_last_dst_x = kernel_first_dst_x - args.kernel_size_x;\n"; - c += " int kernel_last_dst_y = kernel_first_dst_y - args.kernel_size_y;\n"; - c += " int kernel_last_dst_z = kernel_first_dst_z - args.kernel_size_z;\n"; - c += " int offset_x = abs(args.padding_x);\n"; - c += " int offset_x_strided = offset_x * args.stride_x;\n"; - c += - " int src_x = (kernel_first_dst_x + offset_x_strided) / args.stride_x - " - "offset_x;\n"; - c += " int offset_y = abs(args.padding_y);\n"; - c += " int offset_y_strided = offset_y * args.stride_y;\n"; - c += - " int src_y = (kernel_first_dst_y + offset_y_strided) / args.stride_y - " - "offset_y;\n"; - c += " int offset_z = abs(args.padding_z);\n"; - c += " int offset_z_strided = offset_z * args.stride_z;\n"; - c += - " int src_z 
= (kernel_first_dst_z + offset_z_strided) / args.stride_z - " - "offset_z;\n"; - c += " int src_as_dst_z = src_z * args.stride_z;\n"; - c += " for (;src_as_dst_z > kernel_last_dst_z; src_z -= 1, src_as_dst_z -= " - "args.stride_z) {\n"; - for (int z = 0; z < block_size.z; ++z) { - const std::string zindex = std::to_string(z); - c += " int sz" + zindex + " = src_z + " + zindex + ";\n"; - if (src_tensor_type != TensorStorageType::TEXTURE_3D) { - c += " bool in_z" + zindex + " = sz" + zindex + " >= 0 && sz" + - zindex + " < args.src_tensor.Depth();\n"; - } - } - if (block_size.z == 1 && (src_tensor_type != TensorStorageType::TEXTURE_3D)) { - c += " if (!in_z0) continue;\n"; - } - c += " int kernel_z = kernel_first_dst_z - src_as_dst_z;\n"; - c += " int src_as_dst_y = src_y * args.stride_y;\n"; - c += " int src_y_copy = src_y;\n"; - c += " for (;src_as_dst_y > kernel_last_dst_y; src_y_copy -= 1, " - "src_as_dst_y -= " - "args.stride_y) {\n"; - for (int y = 0; y < block_size.y; ++y) { - const std::string yindex = std::to_string(y); - c += " int sy" + yindex + " = src_y_copy + " + yindex + ";\n"; - if (manual_clamp) { - c += " bool in_y" + yindex + " = sy" + yindex + " >= 0 && sy" + - yindex + " < args.src_tensor.Height();\n"; - if (!image_buffer) { - c += " sy" + yindex + " = clamp(sy" + yindex + - ", 0, args.src_tensor.Height() - 1);\n"; - } - } - } - c += " int kernel_y = kernel_first_dst_y - src_as_dst_y;\n"; - c += " int src_as_dst_x = src_x * args.stride_x;\n"; - c += " int src_x_copy = src_x;\n"; - c += " for (;src_as_dst_x > kernel_last_dst_x; src_x_copy -= 1, " - "src_as_dst_x " - "-= args.stride_x) {\n"; - for (int x = 0; x < block_size.x; ++x) { - const std::string xindex = std::to_string(x); - c += " int sx" + xindex + " = src_x_copy + " + xindex + ";\n"; - if (manual_clamp) { - c += " bool in_x" + xindex + " = sx" + xindex + " >= 0 && sx" + - xindex + " < args.src_tensor.Width();\n"; - if (!image_buffer) { - c += " sx" + xindex + " = clamp(sx" + xindex + - ", 0, args.src_tensor.Width() - 1);\n"; - } - } - } - const std::string layer_offset = "args.src_tensor.SliceStride()"; - for (int z = 0; z < block_size.z; ++z) { - const std::string zindex = std::to_string(z); - for (int y = 0; y < block_size.y; ++y) { - const std::string yindex = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xindex = std::to_string(x); - const std::string id = - std::to_string((z * block_size.y + y) * block_size.x + x); - c += " args.src_tensor.GetAddress(addr_" + id + ", sx" + xindex + - ", sy" + yindex + ", sz" + zindex + ", 0);"; - if (image_buffer) { - c += " addr_" + id + " = select(-1, addr_" + id + ", (in_x" + - xindex + " && in_y" + yindex + "));\n"; - c += absl::Substitute( - " int dz_$0 = select(0, $3, (in_x$1 && " - "in_y$2));\n", - id, x, y, layer_offset); - } - } - } - } - if (src_tensor_type == TensorStorageType::BUFFER) { - c += " int dz = " + layer_offset + ";\n"; - } - if (block_size.x == 1 && block_size.y == 1 && manual_clamp) { - c += " if (!in_x0 || !in_y0) continue;\n"; - } - c += " int kernel_x = kernel_first_dst_x - src_as_dst_x;\n"; - c += " int kernel_index =(kernel_z * args.kernel_size_y + kernel_y) * " - "args.kernel_size_x + kernel_x;\n"; - if (weights_are_buffer) { - c += " int f_offset = f_base + kernel_index * " - "args.src_tensor.Slices() * " + - std::to_string(block_size.w) + ";\n"; - } else { - c += " int x_c = kernel_index * args.src_tensor.Slices();\n"; - } - c += " for (int s = 0; s < args.src_tensor.Slices(); ++s) {\n"; - for (int y = 0; y 
< block_size.y; ++y) { - const std::string yindex = std::to_string(y); - for (int x = 0; x < block_size.x; ++x) { - const std::string xindex = std::to_string(x); - const std::string id = std::to_string(y * block_size.x + x); - if (image_buffer) { - c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + - "); addr_" + id + " += dz_" + id + ";\n"; - } else if (manual_clamp) { - c += " FLT4 src" + id + " = args.src_tensor.Read(addr_" + id + - ") * (FLT)(in_x" + xindex + " && in_y" + yindex + "); addr_" + id + - " += dz;\n"; - } else { - c += " FLT4 src" + id + " = args.src_tensor.Read(sx" + xindex + - ", sy" + yindex + ", sz0, s);\n"; - } - } - } - if (weights_are_buffer) { - c += " __global FLT16* weights_cache = " - "args.weights.GetPtr(f_offset);\n"; - c += " f_offset += " + std::to_string(block_size.w) + ";\n"; - } else { - for (int z = 0; z < block_size.w; ++z) { - c += absl::Substitute( - R"( FLT4 f$1 = args.weights0.Read(S + $0, x_c); - FLT4 f$2 = args.weights1.Read(S + $0, x_c); - FLT4 f$3 = args.weights2.Read(S + $0, x_c); - FLT4 f$4 = args.weights3.Read(S + $0, x_c); -)", - z, z * 4 + 0, z * 4 + 1, z * 4 + 2, z * 4 + 3); - } - c += " x_c++;\n"; - } - for (int z = 0; z < block_size.w; ++z) { - for (int i = 0; i < block_size.x * block_size.y * block_size.z; ++i) { - c += " CONV" + std::to_string(z) + "(r" + - std::to_string(i + z * block_size.x * block_size.y * block_size.z) + - ", src" + std::to_string(i) + ");\n"; - } - } - c += " }\n"; - c += " }\n"; - c += " }\n"; - c += " }\n"; - for (int s = 0; s < block_size.w; ++s) { - c += " if (S < args.dst_tensor.Slices()) {\n"; - c += " FLT4 bias_val = args.biases.Read(S);\n"; - for (int z = 0; z < block_size.z; ++z) { - for (int y = 0; y < block_size.y; ++y) { - for (int x = 0; x < block_size.x; ++x) { - const std::string id = std::to_string( - ((s * block_size.z + z) * block_size.y + y) * block_size.x + x); - c += " {\n"; - c += " int xc = dst_x + args.stride_x * " + std::to_string(x) + - ";\n"; - c += " int yc = dst_y + args.stride_y * " + std::to_string(y) + - ";\n"; - c += " int zc = dst_z + args.stride_z * " + std::to_string(z) + - ";\n"; - c += " if (xc < args.dst_tensor.Width() && yc < " - "args.dst_tensor.Height() && zc < args.dst_tensor.Depth()) {\n"; - c += " FLT4 res = TO_FLT4(r" + id + ") + bias_val;\n"; - c += " args.dst_tensor.Write(res, xc, yc, zc, S)\n"; - c += " }\n"; - c += " }\n"; - } - } - } - c += " }\n"; - c += " S++;\n"; - } - c += "}\n"; - return c; -} - -absl::Status ConvolutionTransposed3D::BindArguments() { - return args_.SetInt("grid_size_s", - DivideRoundUp(dst_[0]->Slices(), block_size_.w)); -} - -int3 ConvolutionTransposed3D::GetGridSize() const { - const int aligned_w = AlignByN(dst_[0]->Width(), stride_.x * block_size_.x); - const int aligned_h = AlignByN(dst_[0]->Height(), stride_.y * block_size_.y); - const int aligned_d = AlignByN(dst_[0]->Depth(), stride_.z * block_size_.z); - const int grid_x = DivideRoundUp(aligned_w, block_size_.x) * dst_[0]->Batch(); - const int grid_y = DivideRoundUp(aligned_h, block_size_.y); - const int grid_z = DivideRoundUp(dst_[0]->Slices(), block_size_.w) * - DivideRoundUp(aligned_d, block_size_.z); - return int3(grid_x, grid_y, grid_z); -} - -void ConvolutionTransposed3D::GetPossibleKernelWorkGroups( - TuningType tuning_type, const DeviceInfo& device_info, - const KernelInfo& kernel_info, std::vector* work_groups) const { - GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, - work_groups); -} - -ConvolutionTransposed3D 
CreateConvolutionTransposed3D( - const DeviceInfo& device_info, const OperationDef& definition, - const ConvolutionTransposed3DAttributes& attr) { - ConvolutionTransposed3D result(definition, attr, device_info); - - TensorLinearDescriptor desc; - desc.storage_type = - DeduceLinearStorageType(definition.GetPrimaryStorageType()); - desc.element_type = definition.GetDataType(); - desc.UploadLinearData(attr.bias); - result.args_.AddObject( - "biases", absl::make_unique(std::move(desc))); - return result; -} - -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h deleted file mode 100644 index ebd674d612b..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3d.h +++ /dev/null @@ -1,215 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3D_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3D_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/cl/buffer.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" -#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" -#include "tensorflow/lite/delegates/gpu/cl/util.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" - -namespace tflite { -namespace gpu { -namespace cl { - -class ConvolutionTransposed3D : public GPUOperation { - public: - ConvolutionTransposed3D() = default; - void GetPossibleKernelWorkGroups( - TuningType tuning_type, const DeviceInfo& device_info, - const KernelInfo& kernel_info, - std::vector* work_groups) const override; - absl::Status BindArguments() override; - int3 GetGridSize() const override; - - // Move only - ConvolutionTransposed3D(ConvolutionTransposed3D&& operation); - ConvolutionTransposed3D& operator=(ConvolutionTransposed3D&& operation); - ConvolutionTransposed3D(const ConvolutionTransposed3D&) = delete; - ConvolutionTransposed3D& operator=(const ConvolutionTransposed3D&) = delete; - - private: - friend ConvolutionTransposed3D CreateConvolutionTransposed3D( - const DeviceInfo& device_info, const OperationDef& definition, - const ConvolutionTransposed3DAttributes& attr); - ConvolutionTransposed3D(const OperationDef& definition, - const ConvolutionTransposed3DAttributes& attr, - const DeviceInfo& device_info); - template - void UploadWeights(const tflite::gpu::Tensor& 
weights, - bool weights_are_buffer); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst, bool weights_are_buffer); - - std::string GenerateConvolutionTransposed3DCode(const OperationDef& op_def, - bool weights_are_buffer, - const int4& block_size); - - int3 stride_; - int4 block_size_ = int4(1, 1, 1, 1); // WHDS -}; - -template -void ConvolutionTransposed3D::UploadWeights( - const tflite::gpu::Tensor& weights, bool weights_are_buffer) { - const int dst_depth = - AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.z); - const int src_depth = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - int texture_width = dst_depth; - int texture_height = src_depth * kernel_x * kernel_y * kernel_z; - - const int elements_count = - kernel_x * kernel_y * kernel_z * src_depth * dst_depth * 4; - const bool f32_weights = definition_.precision == CalculationsPrecision::F32; - - const int float4_size = f32_weights ? 16 : 8; - std::vector data(float4_size * elements_count); - - if (f32_weights) { - float4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count), - weights_are_buffer); - } else { - half4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count), - weights_are_buffer); - } - - if (weights_are_buffer) { - BufferDescriptor desc; - desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 16; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - args_.AddObject("weights", - absl::make_unique(std::move(desc))); - } else { - int sub_size = float4_size * elements_count / 4; - Texture2DDescriptor desc0; - desc0.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc0.size = int2(texture_width, texture_height); - desc0.data.resize(sub_size); - memcpy(desc0.data.data(), data.data(), sub_size); - args_.AddObject("weights0", - absl::make_unique(std::move(desc0))); - - Texture2DDescriptor desc1; - desc1.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc1.size = int2(texture_width, texture_height); - desc1.data.resize(sub_size); - memcpy(desc1.data.data(), data.data() + sub_size, sub_size); - args_.AddObject("weights1", - absl::make_unique(std::move(desc1))); - - Texture2DDescriptor desc2; - desc2.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc2.size = int2(texture_width, texture_height); - desc2.data.resize(sub_size); - memcpy(desc2.data.data(), data.data() + sub_size * 2, sub_size); - args_.AddObject("weights2", - absl::make_unique(std::move(desc2))); - - Texture2DDescriptor desc3; - desc3.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc3.size = int2(texture_width, texture_height); - desc3.data.resize(sub_size); - memcpy(desc3.data.data(), data.data() + sub_size * 3, sub_size); - args_.AddObject("weights3", - absl::make_unique(std::move(desc3))); - } -} - -template -void ConvolutionTransposed3D::RearrangeWeightsData( - const tflite::gpu::Tensor& weights, absl::Span dst, - bool weights_are_buffer) { - const int dst_depth = - AlignByN(DivideRoundUp(weights.shape.o, 4), block_size_.w); - const int src_depth = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - int texture_width = dst_depth; - int texture_height = src_depth * kernel_x * kernel_y * kernel_z; - - int counter = 0; - for (int d = 0; d < dst_depth / block_size_.w; ++d) { - for (int z = 0; z < kernel_z; ++z) { - for (int y = 0; y < kernel_y; ++y) { - for (int x = 0; x < kernel_x; ++x) { - for (int s = 0; s < src_depth; ++s) { - for (int sub_d = 0; sub_d < block_size_.w; ++sub_d) { - T filters[4]; - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - const int s_ch = s * 4 + j; - const int d_ch = (d * block_size_.w + sub_d) * 4 + i; - if (s_ch < weights.shape.i && d_ch < weights.shape.o) { - const int f_index = - weights.shape.LinearIndex({d_ch, y, x, z, s_ch}); - filters[j][i] = weights.data[f_index]; - } else { - filters[j][i] = 0.0f; - } - } - } - if (weights_are_buffer) { - dst[counter++] = filters[0]; - dst[counter++] = filters[1]; - dst[counter++] = filters[2]; - dst[counter++] = filters[3]; - } else { - int x_coord = d * block_size_.w + sub_d; - int y_coord = - ((z * kernel_y + y) * kernel_x + x) * src_depth + s; - int offset = y_coord * dst_depth + x_coord; - dst[offset + texture_width * texture_height * 0] = filters[0]; - dst[offset + texture_width * texture_height * 1] = filters[1]; - dst[offset + texture_width * texture_height * 2] = filters[2]; - dst[offset + texture_width * texture_height * 3] = filters[3]; - } - } - } - } - } - } - } -} - -ConvolutionTransposed3D CreateConvolutionTransposed3D( - const DeviceInfo& device_info, const OperationDef& definition, - const ConvolutionTransposed3DAttributes& attr); - -} // namespace cl -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_CONVOLUTION_TRANSPOSED_3D_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc index af952dd3f78..7880f31013a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.cc @@ -29,10 +29,9 @@ namespace gpu { namespace cl { ConvolutionTransposed3x3::ConvolutionTransposed3x3( const OperationDef& definition, const DeviceInfo& device_info, int2 padding) - : GPUOperation(definition), - padding_(padding), - work_group_launch_order_(2, 0, 1) { + : GPUOperation(definition), padding_(padding) { work_group_size_ = int3(8, 4, 1); + work_group_launch_order_ = int3(2, 0, 1); if (device_info.IsPowerVR()) { weights_upload_type_ = WeightsUploadType::LOCAL_MEM_ASYNC; } else if (device_info.IsNvidia() || device_info.IsIntel()) { @@ -54,14 +53,12 @@ ConvolutionTransposed3x3::ConvolutionTransposed3x3( ConvolutionTransposed3x3&& operation) : GPUOperation(std::move(operation)), padding_(operation.padding_), - work_group_launch_order_(operation.work_group_launch_order_), 
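// Editor's note: a small, self-contained illustration (not part of the patch)
// of what the work_group_launch_order_ = int3(2, 0, 1) assignment above
// expresses. The launch order permutes which logical grid axis each dispatch
// dimension walks; since the member is no longer declared in
// ConvolutionTransposed3x3, it appears to be inherited from GPUOperation, so
// the kernel now only states the order and returns the raw grid from
// GetGridSize() instead of applying the permutation inline, as the code
// removed further down in this patch used to do. Names below are local to
// this sketch.
#include <cstdio>

struct Int3 { int x, y, z; };

namespace {
int DivUp(int n, int d) { return (n + d - 1) / d; }

Int3 PermutedDispatchSize(const Int3& grid, const Int3& wg_size, const Int3& order) {
  // Work-group counts per logical grid axis.
  const int wg_count[3] = {DivUp(grid.x, wg_size.x), DivUp(grid.y, wg_size.y),
                           DivUp(grid.z, wg_size.z)};
  const int ord[3] = {order.x, order.y, order.z};
  // Remap the counts through the launch order, then scale back to work items.
  return Int3{wg_count[ord[0]] * wg_size.x, wg_count[ord[1]] * wg_size.y,
              wg_count[ord[2]] * wg_size.z};
}
}  // namespace

int main() {
  const Int3 grid{100, 40, 16};  // invented: e.g. ceil(W/2)*B, ceil(H/2), slices
  const Int3 wg{8, 4, 1};        // work_group_size_ used by the 3x3 kernel
  const Int3 d = PermutedDispatchSize(grid, wg, Int3{2, 0, 1});
  std::printf("%d %d %d\n", d.x, d.y, d.z);
  return 0;
}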
weights_upload_type_(operation.weights_upload_type_) {} ConvolutionTransposed3x3& ConvolutionTransposed3x3::operator=( ConvolutionTransposed3x3&& operation) { if (this != &operation) { std::swap(padding_, operation.padding_); - std::swap(work_group_launch_order_, operation.work_group_launch_order_); std::swap(weights_upload_type_, operation.weights_upload_type_); GPUOperation::operator=(std::move(operation)); } @@ -305,27 +302,33 @@ std::string ConvolutionTransposed3x3::GenerateConvolutionTransposedCode( return c; } -absl::Status ConvolutionTransposed3x3::BindArguments() { - RETURN_IF_ERROR(args_.SetInt("filter_offset", 4 * 9 * src_[0]->Slices())); +absl::Status ConvolutionTransposed3x3::BindArguments(ArgumentsBinder* args) { + RETURN_IF_ERROR(args->SetInt("filter_offset", 4 * 9 * src_[0]->Slices())); const int padding_x = padding_.x >= 1 ? (padding_.x - 1) / 2 : (padding_.x - 2) / 2; const int padding_y = padding_.y >= 1 ? (padding_.y - 1) / 2 : (padding_.y - 2) / 2; - RETURN_IF_ERROR(args_.SetInt("padding_x", padding_x * src_[0]->Batch())); - return args_.SetInt("padding_y", padding_y); + RETURN_IF_ERROR(args->SetInt("padding_x", padding_x * src_[0]->Batch())); + return args->SetInt("padding_y", padding_y); +} + +void ConvolutionTransposed3x3::GetPossibleKernelWorkGroups( + TuningType tuning_type, const DeviceInfo& device_info, + const KernelInfo& kernel_info, std::vector* work_groups) const { + if (weights_upload_type_ == WeightsUploadType::LOCAL_MEM_ASYNC || + weights_upload_type_ == WeightsUploadType::LOCAL_MEM_BY_THREADS) { + work_groups->push_back(work_group_size_); + return; + } + GetPossibleWorkGroupsConv(tuning_type, device_info, kernel_info, grid_size_, + work_groups); } int3 ConvolutionTransposed3x3::GetGridSize() const { const int grid_x = DivideRoundUp(dst_[0]->Width(), 2) * dst_[0]->Batch(); const int grid_y = DivideRoundUp(dst_[0]->Height(), 2); const int grid_z = dst_[0]->Slices(); - int3 wg; - wg.x = DivideRoundUp(grid_x, work_group_size_.x); - wg.y = DivideRoundUp(grid_y, work_group_size_.y); - wg.z = DivideRoundUp(grid_z, work_group_size_.z); - return int3(wg[work_group_launch_order_[0]] * work_group_size_.x, - wg[work_group_launch_order_[1]] * work_group_size_.y, - wg[work_group_launch_order_[2]] * work_group_size_.z); + return int3(grid_x, grid_y, grid_z); } bool IsConvolutionTransposed3x3Supported( diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h index ad3e459da3e..074fc23b0e7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_3x3.h @@ -40,10 +40,8 @@ class ConvolutionTransposed3x3 : public GPUOperation { void GetPossibleKernelWorkGroups( TuningType tuning_type, const DeviceInfo& device_info, const KernelInfo& kernel_info, - std::vector* work_groups) const override { - work_groups->push_back(work_group_size_); - } - absl::Status BindArguments() override; + std::vector* work_groups) const override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only @@ -78,7 +76,6 @@ class ConvolutionTransposed3x3 : public GPUOperation { int2 padding, int3 work_group_launch_order); int2 padding_; - int3 work_group_launch_order_; WeightsUploadType weights_upload_type_; }; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc 
b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc index d606a822d7e..0f389361724 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.cc @@ -296,8 +296,8 @@ std::string ConvolutionTransposed4x4::GenerateConvolutionTransposedCode( return c; } -absl::Status ConvolutionTransposed4x4::BindArguments() { - return args_.SetInt("filter_offset", 4 * 16 * src_[0]->Slices()); +absl::Status ConvolutionTransposed4x4::BindArguments(ArgumentsBinder* args) { + return args->SetInt("filter_offset", 4 * 16 * src_[0]->Slices()); } int3 ConvolutionTransposed4x4::GetGridSize() const { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h index 2577eb47513..17d63233864 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/convolution_transposed_4x4.h @@ -43,7 +43,7 @@ class ConvolutionTransposed4x4 : public GPUOperation { std::vector* work_groups) const override { work_groups->push_back(work_group_size_); } - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc index 91e26b27cdf..05d5d086bc7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.cc @@ -66,100 +66,25 @@ std::string GetSrcValue(int channel_multiplier, const std::string coords) { return c; } -} // namespace -DepthwiseConvolution::DepthwiseConvolution( - const OperationDef& definition, - const DepthwiseConvolution2DAttributes& attr, bool weights_are_buffer) - : GPUOperation(definition), - weights_are_buffer_(weights_are_buffer), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h, 0, 0), - stride_(attr.strides.w, attr.strides.h, 0, 0), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, 0, 0), - dilation_(attr.dilations.w, attr.dilations.h, 0, 0), - channel_multiplier_(attr.weights.shape.o) { - work_group_size_ = int3(8, 8, 1); - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - code_ = GenerateDepthwiseConvolutionCode( - definition_, stride_correction, channel_multiplier_, weights_are_buffer_); -} - -DepthwiseConvolution::DepthwiseConvolution( - const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, bool weights_are_buffer) - : GPUOperation(definition), - weights_are_buffer_(weights_are_buffer), - kernel_size_(attr.weights.shape.w, attr.weights.shape.h, - attr.weights.shape.d, 0), - stride_(attr.strides.w, attr.strides.h, attr.strides.d, 0), - padding_(-attr.padding.prepended.w, -attr.padding.prepended.h, - -attr.padding.prepended.d, 0), - dilation_(attr.dilations.w, attr.dilations.h, attr.dilations.d, 0), - channel_multiplier_(attr.weights.shape.o) { - work_group_size_ = int3(8, 8, 1); - const bool stride_correction = - definition_.IsBatchSupported() && stride_.x != 1; - code_ = GenerateDepthwiseConvolutionCode( - definition_, stride_correction, channel_multiplier_, weights_are_buffer_); -} - -DepthwiseConvolution::DepthwiseConvolution(DepthwiseConvolution&& operation) - : GPUOperation(std::move(operation)), - weights_are_buffer_(operation.weights_are_buffer_), - 
kernel_size_(operation.kernel_size_), - stride_(operation.stride_), - padding_(operation.padding_), - dilation_(operation.dilation_), - channel_multiplier_(operation.channel_multiplier_) {} - -DepthwiseConvolution& DepthwiseConvolution::operator=( - DepthwiseConvolution&& operation) { - if (this != &operation) { - std::swap(weights_are_buffer_, operation.weights_are_buffer_); - std::swap(kernel_size_, operation.kernel_size_); - std::swap(stride_, operation.stride_); - std::swap(padding_, operation.padding_); - std::swap(dilation_, operation.dilation_); - std::swap(channel_multiplier_, operation.channel_multiplier_); - GPUOperation::operator=(std::move(operation)); - } - return *this; -} - -std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( +std::string GenerateDepthwiseConvolutionCode( const OperationDef& op_def, bool stride_correction, int channel_multiplier, - bool weights_are_buffer) { + bool weights_are_buffer, bool dynamic_weights, GPUOperation* op) { auto src_desc = op_def.src_tensors[0]; src_desc.SetTextureAddressMode(TextureAddressMode::ZERO); if (op_def.IsBatchSupported()) { src_desc.SetStateVar("BatchedWidth", "true"); } - AddSrcTensor("src_tensor", src_desc); + op->AddSrcTensor("src_tensor", src_desc); + if (dynamic_weights) { + op->AddSrcTensor("weights", op_def.src_tensors[1]); + } auto dst_desc = op_def.dst_tensors[0]; if (op_def.IsBatchSupported()) { dst_desc.SetStateVar("BatchedWidth", "true"); } - AddDstTensor("dst_tensor", dst_desc); - - args_.AddInt("kernel_size_x"); - args_.AddInt("stride_x"); - args_.AddInt("padding_x"); - args_.AddInt("dilation_x"); - args_.AddInt("kernel_size_y"); - args_.AddInt("stride_y"); - args_.AddInt("padding_y"); - args_.AddInt("dilation_y"); - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { - args_.AddInt("kernel_size_z"); - args_.AddInt("stride_z"); - args_.AddInt("padding_z"); - args_.AddInt("dilation_z"); - } - if (!IsSpecializedCase(channel_multiplier)) { - args_.AddInt("ch_multiplier"); - } + op->AddDstTensor("dst_tensor", dst_desc); const auto src_tensor_type = op_def.src_tensors[0].storage_type; @@ -171,14 +96,14 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( c += "__kernel void main_function(\n"; c += "$0) {\n"; c += " int X = get_global_id(0);\n"; - c += " int Y = get_global_id(1);\n"; if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { - c += " int linear_id_2 = get_global_id(2);\n"; - c += " int S = linear_id_2 / args.dst_tensor.Depth();\n"; - c += " int Z = linear_id_2 % args.dst_tensor.Depth();\n"; + c += " int linear_id_1 = get_global_id(1);\n"; + c += " int Y = linear_id_1 / args.dst_tensor.Depth();\n"; + c += " int Z = linear_id_1 % args.dst_tensor.Depth();\n"; } else { - c += " int S = get_global_id(2);\n"; + c += " int Y = get_global_id(1);\n"; } + c += " int S = get_global_id(2);\n"; c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height() || " "S >= args.dst_tensor.Slices()) { \n"; c += " return; \n"; @@ -186,23 +111,36 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( c += " ACCUM_FLT4 r = (ACCUM_FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; if (stride_correction) { c += " int x_offseted = " + - GetXStrideCorrected("X", "args.src_tensor.Batch()", "args.stride_x", - "args.padding_x") + + GetXStrideCorrectedV2("X", "args.src_tensor.Batch()", "args.stride_x", + "args.padding_x") + ";\n"; } else { - c += " int x_offseted = X * args.stride_x + args.padding_x;\n"; + if (op_def.IsBatchSupported()) { + c += " int x_offseted = X * args.stride_x + args.padding_x * " 
+ "args.src_tensor.Batch();\n"; + } else { + c += " int x_offseted = X * args.stride_x + args.padding_x;\n"; + } } c += " int y_offseted = Y * args.stride_y + args.padding_y;\n"; - std::string weights_offset = "args.kernel_size_x * args.kernel_size_y"; - if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { - c += " int z_offseted = Z * args.stride_z + args.padding_z;\n"; - weights_offset += " * args.kernel_size_z"; - } - if (weights_are_buffer) { - c += " int fx_c = S * " + weights_offset + ";\n"; - } else { - c += " int fx_c = 0;\n"; + if (!dynamic_weights) { + std::string weights_offset = "args.kernel_size_x * args.kernel_size_y"; + if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { + c += " int z_offseted = Z * args.stride_z + args.padding_z;\n"; + weights_offset += " * args.kernel_size_z"; + } + if (weights_are_buffer) { + c += " int fx_c = S * " + weights_offset + ";\n"; + } else { + c += " int fx_c = 0;\n"; + } } + std::string kernel_size_x = + dynamic_weights ? "args.weights.Width()" : "args.kernel_size_x"; + std::string kernel_size_y = + dynamic_weights ? "args.weights.Height()" : "args.kernel_size_y"; + std::string kernel_size_z = + dynamic_weights ? "args.weights.Depth()" : "args.kernel_size_z"; std::string flat_coords = "x_c, y_c"; if (manual_clamp) { @@ -210,26 +148,35 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { check += " && !outside_z"; flat_coords += ", z_c"; - c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n"; c += " int z_c = z_offseted + kz * args.dilation_z;\n"; c += " bool outside_z = z_c < 0 || z_c >= args.src_tensor.Depth();\n"; } - c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; + c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n"; c += " int y_c = y_offseted + ky * args.dilation_y;\n"; c += " bool outside_y = y_c < 0 || y_c >= args.src_tensor.Height();\n"; - c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; - c += " int x_c = x_offseted + kx * args.dilation_x;\n"; + c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n"; + const std::string dilation_x = + op_def.IsBatchSupported() ? 
"args.dilation_x * args.src_tensor.Batch()" + : "args.dilation_x"; + c += " int x_c = x_offseted + kx * " + dilation_x + ";\n"; c += " bool outside_x = x_c < 0 || x_c >= args.src_tensor.Width();\n"; c += " if (" + check + ") {\n"; - if (weights_are_buffer) { - c += " FLT4 f = args.weights.Read(fx_c);\n"; + if (dynamic_weights) { + c += " FLT4 f = args.weights.Read(kx, ky, S);\n"; } else { - c += " FLT4 f = args.weights.Read(fx_c, S);\n"; + if (weights_are_buffer) { + c += " FLT4 f = args.weights.Read(fx_c);\n"; + } else { + c += " FLT4 f = args.weights.Read(fx_c, S);\n"; + } } c += GetSrcValue(channel_multiplier, flat_coords); c += " r += TO_ACCUM_TYPE(src_final * f);\n"; c += " };\n"; - c += " fx_c++;\n"; + if (!dynamic_weights) { + c += " fx_c++;\n"; + } c += " }\n"; c += " }\n"; if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { @@ -238,7 +185,7 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( } else { // Texture types with ZERO clamping if (op_def.dst_tensors[0].HasAxis(Axis::DEPTH)) { flat_coords += ", z_c"; - c += " for (int kz = 0; kz < args.kernel_size_z; ++kz) {\n"; + c += " for (int kz = 0; kz < " + kernel_size_z + "; ++kz) {\n"; c += " int z_c = z_offseted + kz * args.dilation_z;\n"; if (src_tensor_type != TensorStorageType::TEXTURE_3D) { // Only TEXTURE_3D supports clamping @@ -249,17 +196,24 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( c += " }\n"; } } - c += " for (int ky = 0; ky < args.kernel_size_y; ++ky) {\n"; + c += " for (int ky = 0; ky < " + kernel_size_y + "; ++ky) {\n"; c += " int y_c = y_offseted + ky * args.dilation_y;\n"; - c += " for (int kx = 0; kx < args.kernel_size_x; ++kx) {\n"; - c += " int x_c = x_offseted + kx * args.dilation_x;\n"; + c += " for (int kx = 0; kx < " + kernel_size_x + "; ++kx) {\n"; + const std::string dilation_x = + op_def.IsBatchSupported() ? 
"args.dilation_x * args.src_tensor.Batch()" + : "args.dilation_x"; + c += " int x_c = x_offseted + kx * " + dilation_x + ";\n"; c += GetSrcValue(channel_multiplier, flat_coords); - if (weights_are_buffer) { - c += " FLT4 f = args.weights.Read(fx_c);\n"; + if (dynamic_weights) { + c += " FLT4 f = args.weights.Read(kx, ky, S);\n"; } else { - c += " FLT4 f = args.weights.Read(fx_c, S);\n"; + if (weights_are_buffer) { + c += " FLT4 f = args.weights.Read(fx_c);\n"; + } else { + c += " FLT4 f = args.weights.Read(fx_c, S);\n"; + } + c += " fx_c++;\n"; } - c += " fx_c++;\n"; c += " r += TO_ACCUM_TYPE(src_final * f);\n"; c += " }\n"; c += " }\n"; @@ -277,67 +231,106 @@ std::string DepthwiseConvolution::GenerateDepthwiseConvolutionCode( return c; } +} // namespace -absl::Status DepthwiseConvolution::BindArguments() { - RETURN_IF_ERROR(args_.SetInt("kernel_size_x", kernel_size_.x)); - RETURN_IF_ERROR(args_.SetInt("stride_x", stride_.x)); - RETURN_IF_ERROR(args_.SetInt("padding_x", padding_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("dilation_x", dilation_.x * src_[0]->Batch())); - RETURN_IF_ERROR(args_.SetInt("kernel_size_y", kernel_size_.y)); - RETURN_IF_ERROR(args_.SetInt("stride_y", stride_.y)); - RETURN_IF_ERROR(args_.SetInt("padding_y", padding_.y)); - RETURN_IF_ERROR(args_.SetInt("dilation_y", dilation_.y)); - if (definition_.dst_tensors[0].HasAxis(Axis::DEPTH)) { - RETURN_IF_ERROR(args_.SetInt("kernel_size_z", kernel_size_.z)); - RETURN_IF_ERROR(args_.SetInt("stride_z", stride_.z)); - RETURN_IF_ERROR(args_.SetInt("padding_z", padding_.z)); - RETURN_IF_ERROR(args_.SetInt("dilation_z", dilation_.z)); - } - if (!IsSpecializedCase(channel_multiplier_)) { - RETURN_IF_ERROR(args_.SetInt("ch_multiplier", channel_multiplier_)); - } - return absl::OkStatus(); -} - -int3 DepthwiseConvolution::GetGridSize() const { - const int grid_x = dst_[0]->Width() * dst_[0]->Batch(); - const int grid_y = dst_[0]->Height(); - const int grid_z = dst_[0]->Slices() * dst_[0]->Depth(); - return int3(grid_x, grid_y, grid_z); -} - -DepthwiseConvolution CreateDepthwiseConvolution( +GPUOperation CreateDepthwiseConvolution2D( const DeviceInfo& device_info, const OperationDef& definition, const DepthwiseConvolution2DAttributes& attr) { bool weights_are_buffer = device_info.IsMali(); - DepthwiseConvolution result(definition, attr, weights_are_buffer); - result.UploadWeights(attr.weights); + GPUOperation op(definition); + op.args_.AddInt("kernel_size_x", attr.weights.shape.w); + op.args_.AddInt("stride_x", attr.strides.w); + op.args_.AddInt("padding_x", -attr.padding.prepended.w); + op.args_.AddInt("dilation_x", attr.dilations.w); + op.args_.AddInt("kernel_size_y", attr.weights.shape.h); + op.args_.AddInt("stride_y", attr.strides.h); + op.args_.AddInt("padding_y", -attr.padding.prepended.h); + op.args_.AddInt("dilation_y", attr.dilations.h); + if (!IsSpecializedCase(attr.weights.shape.o)) { + op.args_.AddInt("ch_multiplier", attr.weights.shape.o); + } + const bool stride_correction = + definition.IsBatchSupported() && attr.strides.w != 1; + op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, + attr.weights.shape.o, + weights_are_buffer, false, &op); + UploadWeightsForDWConv2D(attr.weights, weights_are_buffer, + definition.precision, &op); + op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; TensorLinearDescriptor desc; desc.storage_type = weights_are_buffer ? 
LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D; desc.element_type = definition.GetDataType(); desc.UploadLinearData(attr.bias); - result.args_.AddObject( + op.args_.AddObject( "biases", absl::make_unique(std::move(desc))); - return result; + return op; } -DepthwiseConvolution CreateDepthwiseConvolution( +GPUOperation CreateDepthwiseConvolution2DDynamicWeights( + const DeviceInfo& device_info, const OperationDef& definition, + const DepthwiseConvolution2DAttributes& attr) { + GPUOperation op(definition); + op.args_.AddInt("stride_x", attr.strides.w); + op.args_.AddInt("padding_x", -attr.padding.prepended.w); + op.args_.AddInt("dilation_x", attr.dilations.w); + op.args_.AddInt("stride_y", attr.strides.h); + op.args_.AddInt("padding_y", -attr.padding.prepended.h); + op.args_.AddInt("dilation_y", attr.dilations.h); + const bool stride_correction = + definition.IsBatchSupported() && attr.strides.w != 1; + op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, 1, + false, true, &op); + op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; + + TensorLinearDescriptor desc; + desc.storage_type = device_info.IsMali() ? LinearStorageType::BUFFER + : LinearStorageType::TEXTURE_2D; + desc.element_type = definition.GetDataType(); + desc.UploadLinearData(attr.bias); + op.args_.AddObject( + "biases", absl::make_unique(std::move(desc))); + return op; +} + +GPUOperation CreateDepthwiseConvolution3D( const DeviceInfo& device_info, const OperationDef& definition, const DepthwiseConvolution3DAttributes& attr) { bool weights_are_buffer = device_info.IsMali(); - DepthwiseConvolution result(definition, attr, weights_are_buffer); - result.UploadWeights(attr.weights); + GPUOperation op(definition); + op.args_.AddInt("kernel_size_x", attr.weights.shape.w); + op.args_.AddInt("stride_x", attr.strides.w); + op.args_.AddInt("padding_x", -attr.padding.prepended.w); + op.args_.AddInt("dilation_x", attr.dilations.w); + op.args_.AddInt("kernel_size_y", attr.weights.shape.h); + op.args_.AddInt("stride_y", attr.strides.h); + op.args_.AddInt("padding_y", -attr.padding.prepended.h); + op.args_.AddInt("dilation_y", attr.dilations.h); + op.args_.AddInt("kernel_size_z", attr.weights.shape.d); + op.args_.AddInt("stride_z", attr.strides.d); + op.args_.AddInt("padding_z", -attr.padding.prepended.d); + op.args_.AddInt("dilation_z", attr.dilations.d); + if (!IsSpecializedCase(attr.weights.shape.o)) { + op.args_.AddInt("ch_multiplier", attr.weights.shape.o); + } + const bool stride_correction = + definition.IsBatchSupported() && attr.strides.w != 1; + op.code_ = GenerateDepthwiseConvolutionCode(definition, stride_correction, + attr.weights.shape.o, + weights_are_buffer, false, &op); + UploadWeightsForDWConv3D(attr.weights, weights_are_buffer, + definition.precision, &op); + op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_SToZ; TensorLinearDescriptor desc; desc.storage_type = weights_are_buffer ? 
LinearStorageType::BUFFER : LinearStorageType::TEXTURE_2D; desc.element_type = definition.GetDataType(); desc.UploadLinearData(attr.bias); - result.args_.AddObject( + op.args_.AddObject( "biases", absl::make_unique(std::move(desc))); - return result; + return op; } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h index afa6375eb83..3bb034849bc 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h @@ -35,102 +35,9 @@ namespace tflite { namespace gpu { namespace cl { -class DepthwiseConvolution : public GPUOperation { - public: - DepthwiseConvolution() = default; - absl::Status BindArguments() override; - int3 GetGridSize() const override; - - // Move only - DepthwiseConvolution(DepthwiseConvolution&& operation); - DepthwiseConvolution& operator=(DepthwiseConvolution&& operation); - DepthwiseConvolution(const DepthwiseConvolution&) = delete; - DepthwiseConvolution& operator=(const DepthwiseConvolution&) = delete; - - private: - friend DepthwiseConvolution CreateDepthwiseConvolution( - const DeviceInfo& device_info, const OperationDef& definition, - const DepthwiseConvolution2DAttributes& attr); - friend DepthwiseConvolution CreateDepthwiseConvolution( - const DeviceInfo& device_info, const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr); - DepthwiseConvolution(const OperationDef& definition, - const DepthwiseConvolution2DAttributes& attr, - bool weights_are_buffer); - DepthwiseConvolution(const OperationDef& definition, - const DepthwiseConvolution3DAttributes& attr, - bool weights_are_buffer); - - template - void UploadWeights(const tflite::gpu::Tensor& weights); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst); - - template - void UploadWeights(const tflite::gpu::Tensor& weights); - - template - void RearrangeWeightsData(const tflite::gpu::Tensor& weights, - absl::Span dst); - - std::string GenerateDepthwiseConvolutionCode(const OperationDef& op_def, - bool stride_correction, - int channel_multiplier, - bool weights_are_buffer); - - bool weights_are_buffer_; - - int4 kernel_size_; - int4 stride_; - int4 padding_; - int4 dilation_; - int channel_multiplier_; -}; - -template -void DepthwiseConvolution::UploadWeights( - const tflite::gpu::Tensor& weights) { - const int dst_channels = weights.shape.i * weights.shape.o; - const int dst_slices = DivideRoundUp(dst_channels, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - - const int elements_count = kernel_x * kernel_y * dst_slices; - - const bool fp32_weights = definition_.precision == CalculationsPrecision::F32; - const int float4_size = fp32_weights ? 16 : 8; - - std::vector data(float4_size * elements_count); - - if (fp32_weights) { - float4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count)); - } else { - half4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count)); - } - - if (weights_are_buffer_) { - BufferDescriptor desc; - desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 4; - desc.size = float4_size * elements_count; - desc.data = std::move(data); - args_.AddObject("weights", absl::make_unique(desc)); - } else { - Texture2DDescriptor desc; - desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.size = int2(kernel_x * kernel_y, dst_slices); - desc.data = std::move(data); - args_.AddObject("weights", absl::make_unique(desc)); - } -} - template -void DepthwiseConvolution::RearrangeWeightsData( - const tflite::gpu::Tensor& weights, absl::Span dst) { +void RearrangeWeightsForDWConv2D(const tflite::gpu::Tensor& weights, + absl::Span dst) { const int dst_channels = weights.shape.i * weights.shape.o; const int dst_depth = DivideRoundUp(dst_channels, 4); const int kernel_x = weights.shape.w; @@ -158,50 +65,50 @@ void DepthwiseConvolution::RearrangeWeightsData( } template -void DepthwiseConvolution::UploadWeights( - const tflite::gpu::Tensor& weights) { +void UploadWeightsForDWConv2D(const tflite::gpu::Tensor& weights, + bool weights_are_buffer, + CalculationsPrecision precision, + GPUOperation* op) { const int dst_channels = weights.shape.i * weights.shape.o; const int dst_slices = DivideRoundUp(dst_channels, 4); const int kernel_x = weights.shape.w; const int kernel_y = weights.shape.h; - const int kernel_z = weights.shape.d; - const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices; + const int elements_count = kernel_x * kernel_y * dst_slices; - const bool fp32_weights = definition_.precision == CalculationsPrecision::F32; + const bool fp32_weights = precision == CalculationsPrecision::F32; const int float4_size = fp32_weights ? 16 : 8; std::vector data(float4_size * elements_count); if (fp32_weights) { float4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count)); + RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count)); } else { half4* ptr = reinterpret_cast(data.data()); - RearrangeWeightsData(weights, absl::MakeSpan(ptr, elements_count)); + RearrangeWeightsForDWConv2D(weights, absl::MakeSpan(ptr, elements_count)); } - if (weights_are_buffer_) { + if (weights_are_buffer) { BufferDescriptor desc; desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; desc.element_size = 4; desc.size = float4_size * elements_count; desc.data = std::move(data); - args_.AddObject("weights", - absl::make_unique(std::move(desc))); + op->args_.AddObject("weights", absl::make_unique(desc)); } else { Texture2DDescriptor desc; desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; - desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices); + desc.size = int2(kernel_x * kernel_y, dst_slices); desc.data = std::move(data); - args_.AddObject("weights", - absl::make_unique(std::move(desc))); + op->args_.AddObject("weights", + absl::make_unique(desc)); } } template -void DepthwiseConvolution::RearrangeWeightsData( - const tflite::gpu::Tensor& weights, absl::Span dst) { +void RearrangeWeightsForDWConv3D(const tflite::gpu::Tensor& weights, + absl::Span dst) { const int dst_channels = weights.shape.i * weights.shape.o; const int dst_slices = DivideRoundUp(dst_channels, 4); const int kernel_x = weights.shape.w; @@ -231,11 +138,59 @@ void DepthwiseConvolution::RearrangeWeightsData( } } -DepthwiseConvolution CreateDepthwiseConvolution( +template +void UploadWeightsForDWConv3D(const tflite::gpu::Tensor& weights, + bool weights_are_buffer, + CalculationsPrecision precision, + GPUOperation* op) { + const int dst_channels = weights.shape.i * weights.shape.o; + const int dst_slices = DivideRoundUp(dst_channels, 4); + const int kernel_x = weights.shape.w; + const int kernel_y = weights.shape.h; + const int kernel_z = weights.shape.d; + + const int elements_count = kernel_x * kernel_y * kernel_z * dst_slices; + + const bool fp32_weights = precision == CalculationsPrecision::F32; + const int float4_size = fp32_weights ? 16 : 8; + + std::vector data(float4_size * elements_count); + + if (fp32_weights) { + float4* ptr = reinterpret_cast(data.data()); + RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count)); + } else { + half4* ptr = reinterpret_cast(data.data()); + RearrangeWeightsForDWConv3D(weights, absl::MakeSpan(ptr, elements_count)); + } + + if (weights_are_buffer) { + BufferDescriptor desc; + desc.element_type = fp32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 4; + desc.size = float4_size * elements_count; + desc.data = std::move(data); + op->args_.AddObject("weights", + absl::make_unique(std::move(desc))); + } else { + Texture2DDescriptor desc; + desc.element_type = fp32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; + desc.size = int2(kernel_x * kernel_y * kernel_z, dst_slices); + desc.data = std::move(data); + op->args_.AddObject( + "weights", absl::make_unique(std::move(desc))); + } +} + +GPUOperation CreateDepthwiseConvolution2D( const DeviceInfo& device_info, const OperationDef& definition, const DepthwiseConvolution2DAttributes& attr); -DepthwiseConvolution CreateDepthwiseConvolution( +GPUOperation CreateDepthwiseConvolution2DDynamicWeights( + const DeviceInfo& device_info, const OperationDef& definition, + const DepthwiseConvolution2DAttributes& attr); + +GPUOperation CreateDepthwiseConvolution3D( const DeviceInfo& device_info, const OperationDef& definition, const DepthwiseConvolution3DAttributes& attr); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_test.cc index 5c3e596a2e5..eb43c0c30e3 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv_test.cc @@ -55,7 +55,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvSimpleWeights) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - DepthwiseConvolution operation = CreateDepthwiseConvolution( + GPUOperation operation = CreateDepthwiseConvolution2D( creation_context_.GetDeviceInfo(), op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 2, 2), &dst_tensor)); @@ -90,7 +90,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvNoMultiplier) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - DepthwiseConvolution operation = CreateDepthwiseConvolution( + GPUOperation operation = CreateDepthwiseConvolution2D( creation_context_.GetDeviceInfo(), op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 2, 2), &dst_tensor)); @@ -126,7 +126,7 @@ TEST_F(OpenCLOperationTest, DepthwiseConvMultiplier2) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - DepthwiseConvolution operation = CreateDepthwiseConvolution( + GPUOperation operation = CreateDepthwiseConvolution2D( creation_context_.GetDeviceInfo(), op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 2, 2, 4), &dst_tensor)); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc index afec0ab8a56..f50045131c2 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc @@ -42,10 +42,10 @@ std::string GetOneInputCode(const OperationType& op_type, result = "\n"; break; case OperationType::ELU: - result = "$0.x = $0.x < (FLT)(0.0f) ? exp($0.x) - (FLT)(1.0f) : $0.x;\n"; - result += "$0.y = $0.y < (FLT)(0.0f) ? exp($0.y) - (FLT)(1.0f) : $0.y;\n"; - result += "$0.z = $0.z < (FLT)(0.0f) ? exp($0.z) - (FLT)(1.0f) : $0.z;\n"; - result += "$0.w = $0.w < (FLT)(0.0f) ? exp($0.w) - (FLT)(1.0f) : $0.w;\n"; + result = "$0.x = $0.x < (FLT)(0.0f) ? expm1($0.x) : $0.x;\n"; + result += "$0.y = $0.y < (FLT)(0.0f) ? expm1($0.y) : $0.y;\n"; + result += "$0.z = $0.z < (FLT)(0.0f) ? expm1($0.z) : $0.z;\n"; + result += "$0.w = $0.w < (FLT)(0.0f) ? 
expm1($0.w) : $0.w;\n"; break; case OperationType::EXP: result = "$0 = exp($0);\n"; @@ -58,23 +58,17 @@ std::string GetOneInputCode(const OperationType& op_type, case OperationType::LOG: result = "$0 = log($0);\n"; break; + case OperationType::NEG: + result = "$0 = -($0);\n"; + break; case OperationType::RSQRT: result = "$0 = rsqrt($0);\n"; break; case OperationType::SIGMOID: if (precision != CalculationsPrecision::F32) { result = - "$0.x = convert_half(native_recip(1.0f + " - "native_exp(convert_float(-$0.x))));\n"; - result += - "$0.y = convert_half(native_recip(1.0f + " - "native_exp(convert_float(-$0.y))));\n"; - result += - "$0.z = convert_half(native_recip(1.0f + " - "native_exp(convert_float(-$0.z))));\n"; - result += - "$0.w = convert_half(native_recip(1.0f + " - "native_exp(convert_float(-$0.w))));\n"; + "$0 = convert_half4(native_recip(1.0f + " + "native_exp(convert_float4(-$0))));\n"; } else { result = "$0 = (FLT4)(1.0f) / ((FLT4)(1.0f) + exp(-($0)));\n"; } @@ -89,7 +83,12 @@ std::string GetOneInputCode(const OperationType& op_type, result = "$0 *= $0;\n"; break; case OperationType::TANH: - result = "$0 = tanh($0);\n"; + if (precision != CalculationsPrecision::F32) { + result = "float4 t = native_exp(convert_float4($0 * 2.0h));\n"; + result += "$0 = convert_half4(native_divide(t - 1.0f, t + 1.0f));\n"; + } else { + result = "$0 = tanh($0);\n"; + } break; default: return "Unknown operation type;\n"; @@ -128,6 +127,43 @@ std::string GetTwoInputCode(const OperationType& op_type, case OperationType::SUB: result += "$0 = $1 - $2;\n"; break; + // Comparison operators + case OperationType::LESS: + result = "$0.x = $1.x < $2.x ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.y = $1.y < $2.y ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.z = $1.z < $2.z ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.w = $1.w < $2.w ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + break; + case OperationType::LESS_EQUAL: + result = "$0.x = $1.x <= $2.x ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.y = $1.y <= $2.y ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.z = $1.z <= $2.z ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.w = $1.w <= $2.w ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + break; + case OperationType::GREATER: + result = "$0.x = $1.x > $2.x ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.y = $1.y > $2.y ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.z = $1.z > $2.z ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.w = $1.w > $2.w ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + break; + case OperationType::GREATER_EQUAL: + result = "$0.x = $1.x >= $2.x ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.y = $1.y >= $2.y ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.z = $1.z >= $2.z ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.w = $1.w >= $2.w ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + break; + case OperationType::EQUAL: + result = "$0.x = $1.x == $2.x ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.y = $1.y == $2.y ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.z = $1.z == $2.z ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.w = $1.w == $2.w ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + break; + case OperationType::NOT_EQUAL: + result = "$0.x = $1.x != $2.x ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.y = $1.y != $2.y ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.z = $1.z != $2.z ? (FLT)(1.0f) : (FLT)(0.0f);\n"; + result += "$0.w = $1.w != $2.w ? 
(FLT)(1.0f) : (FLT)(0.0f);\n"; + break; default: return "Unknown operation type;\n"; } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc index d883a734214..b48f66ce600 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc @@ -208,6 +208,30 @@ TEST_F(OpenCLOperationTest, Log) { } } +TEST_F(OpenCLOperationTest, Neg) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 1, 2); + src_tensor.data = {1.0f, -2.0f, 0.0f, 4.0f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateElementwiseOneInput(op_def, OperationType::NEG); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {-1.0f, 2.0f, 0.0f, -4.0f})); + } + } +} + TEST_F(OpenCLOperationTest, Rsqrt) { TensorFloat32 src_tensor; src_tensor.shape = BHWC(1, 2, 1, 2); @@ -817,6 +841,174 @@ TEST_F(OpenCLOperationTest, SubWithScalarAtFirstPosition) { } } +TEST_F(OpenCLOperationTest, Less) { + TensorFloat32 src_tensor_0, src_tensor_1; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_1.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, 1.0f, 2.0f, 3.0f}; + src_tensor_1.data = {1.0f, 0.0f, 2.0f, -4.0f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = CreateElementwiseTwoInput( + op_def, OperationType::LESS, src_tensor_1.shape); + ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, + creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 0.0f, 0.0f, 0.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, LessEqual) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, 1.0f, 2.0f, 3.0f}; + + ElementwiseAttributes attr; + attr.param = 2.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateElementwise(creation_context_.GetDeviceInfo(), op_def, + OperationType::LESS_EQUAL, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 1.0f, 1.0f, 0.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, Greater) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, 1.0f, 2.0f, 3.0f}; + + ElementwiseAttributes attr; + attr.param = 2.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateElementwise(creation_context_.GetDeviceInfo(), op_def, + OperationType::GREATER, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {0.0f, 0.0f, 0.0f, 1.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, GreaterEqual) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, 1.0f, 2.0f, 3.0f}; + + ElementwiseAttributes attr; + attr.param = 2.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateElementwise(creation_context_.GetDeviceInfo(), op_def, + OperationType::GREATER_EQUAL, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {0.0f, 0.0f, 1.0f, 1.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, Equal) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, 1.0f, 2.0f, 3.0f}; + + ElementwiseAttributes attr; + attr.param = 2.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateElementwise(creation_context_.GetDeviceInfo(), op_def, + OperationType::EQUAL, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {0.0f, 0.0f, 1.0f, 0.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, NotEqual) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {0.0f, 1.0f, 2.0f, 3.0f}; + + ElementwiseAttributes attr; + attr.param = 2.0f; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateElementwise(creation_context_.GetDeviceInfo(), op_def, + OperationType::NOT_EQUAL, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 1.0f, 0.0f, 1.0f})); + } + } +} + } // namespace } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc index 999344384aa..1940a1a020c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.cc @@ -17,28 +17,50 @@ limitations under the License. 
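// --- Illustrative sketch (not part of the patch) ---------------------------
// The element-wise delegate changes above (expm1-based ELU, comparison ops
// that emit 1.0f / 0.0f per lane) can be summarized by a minimal host-side
// reference of the scalar semantics. The function names below are
// hypothetical and exist only to document intent.
#include <cmath>

static float EluRef(float x) {
  // expm1(x) == exp(x) - 1, but without the cancellation error of the naive
  // form for small |x|, which is why the generated OpenCL now uses it.
  return x < 0.0f ? std::expm1(x) : x;
}

static float LessRef(float a, float b) {
  // Comparisons produce float 1.0/0.0 so the result can flow through the
  // same FLT4 pipeline as every other element-wise operation.
  return a < b ? 1.0f : 0.0f;
}
// ---------------------------------------------------------------------------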
#include #include +#include +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" +#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" namespace tflite { namespace gpu { namespace cl { +namespace { +bool UseBufferForWeights(const DeviceInfo& device_info) { + return device_info.IsAdreno() || device_info.IsAMD() || device_info.IsMali(); +} +} // namespace FullyConnected::FullyConnected(const OperationDef& definition, const DeviceInfo& device_info) : GPUOperation(definition) { if (device_info.IsAdreno()) { if (device_info.IsAdreno3xx()) { - work_group_size_ = int3(8, 4, 1); - } else if (device_info.IsAdreno4xx()) { work_group_size_ = int3(16, 4, 1); + } else if (device_info.IsAdreno4xx()) { + work_group_size_ = int3(32, 4, 1); } else { work_group_size_ = int3(32, 4, 1); } + } else if (device_info.IsIntel()) { + work_group_size_ = int3(8, 4, 1); + } else if (device_info.IsNvidia()) { + work_group_size_ = int3(8, 4, 1); + } else if (device_info.IsPowerVR()) { + work_group_size_ = int3(8, 4, 1); } else { work_group_size_ = int3(16, 4, 1); } - code_ = GetFullyConnectedKernelCode(definition_, work_group_size_); + code_ = GetFullyConnectedKernelCode(definition_, device_info); } FullyConnected::FullyConnected(FullyConnected&& kernel) @@ -58,10 +80,12 @@ FullyConnected& FullyConnected::operator=(FullyConnected&& kernel) { // optimized shaders std::string FullyConnected::GetFullyConnectedKernelCode( - const OperationDef& op_def, const int3& work_group_size) { + const OperationDef& op_def, const DeviceInfo& device_info) { AddSrcTensor("src_tensor", op_def.src_tensors[0]); AddDstTensor("dst_tensor", op_def.dst_tensors[0]); + const bool weights_are_buffer = UseBufferForWeights(device_info); + std::string c = GetCommonDefines(op_def.precision); switch (op_def.precision) { case CalculationsPrecision::F32: @@ -73,35 +97,54 @@ std::string FullyConnected::GetFullyConnectedKernelCode( break; } - const std::string wg_x = std::to_string(work_group_size.x); - const std::string wg_y = std::to_string(work_group_size.y); - c += "__kernel void main_function(\n"; - c += "$0) {\n"; - c += " int gid = get_global_id(0);\n"; - c += " bool inside = gid < args.dst_tensor.Slices();\n"; - c += " gid = min(gid, args.dst_tensor.Slices() - 1);\n"; - c += " int2 tid = (int2)(get_local_id(0), get_local_id(1));\n"; - c += " ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f);\n"; - c += " for (uint c = tid.y; c < args.src_tensor.Slices(); c += " + wg_y + - ") {\n"; - c += " FLT4 v = args.src_tensor.Read(0, 0, c);\n"; - c += " FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid);\n"; - c += " s.x += dot(v, w.s0123);\n"; - c += " s.y += dot(v, w.s4567);\n"; - c += " s.z += dot(v, w.s89ab);\n"; - c += " s.w += dot(v, w.scdef);\n"; - c += " }\n"; - c += " __local ACCUM_FLT4 temp[" + wg_x + "][" + wg_y + "];\n"; - c += " temp[tid.x][tid.y] = s;\n"; - c += " barrier(CLK_LOCAL_MEM_FENCE);\n"; - c += " if (tid.y == 0 && inside) {\n"; - for (int i = 1; i < work_group_size.y; ++i) { + c += "#define WG_X " + 
std::to_string(work_group_size_.x) + "\n"; + c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n"; + + c += R"(__kernel void main_function($0) { + int gid = get_global_id(0); + int2 tid = (int2)(get_local_id(0), get_local_id(1)); + ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f); + if (gid < args.dst_tensor.Slices()) { + for (int c = tid.y; c < args.src_tensor.Slices(); c += WG_Y) { + FLT4 v = args.src_tensor.Read(0, 0, c); +)"; + if (weights_are_buffer) { + c += R"(FLT16 w = args.weights.Read(c * args.dst_tensor.Slices() + gid); + FLT4 partial = v.s0 * w.s0123; + partial = mad(v.s1, w.s4567, partial); + partial = mad(v.s2, w.s89ab, partial); + partial = mad(v.s3, w.scdef, partial); + s += TO_ACCUM_TYPE(partial); +)"; + } else { + c += R"(FLT4 w0 = args.weights.Read(c * 4 + 0, gid); + FLT4 w1 = args.weights.Read(c * 4 + 1, gid); + FLT4 w2 = args.weights.Read(c * 4 + 2, gid); + FLT4 w3 = args.weights.Read(c * 4 + 3, gid); + FLT4 partial = v.s0 * w0; + partial = mad(v.s1, w1, partial); + partial = mad(v.s2, w2, partial); + partial = mad(v.s3, w3, partial); + s += TO_ACCUM_TYPE(partial); +)"; + } + c += R"( } + } + __local ACCUM_FLT4 temp[WG_X][WG_Y]; + temp[tid.x][tid.y] = s; + barrier(CLK_LOCAL_MEM_FENCE); + if (gid >= args.dst_tensor.Slices()) { + return; + } + if (tid.y == 0) { +)"; + for (int i = 1; i < work_group_size_.y; ++i) { c += " s += temp[tid.x][" + std::to_string(i) + "];\n"; } - c += " FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid);\n"; - c += " args.dst_tensor.Write(r0, 0, 0, gid);\n"; - c += " }\n"; - c += "}\n"; + c += R"( FLT4 r0 = TO_FLT4(s) + args.biases.Read(gid); + args.dst_tensor.Write(r0, 0, 0, gid); + } +})"; return c; } @@ -114,7 +157,7 @@ FullyConnected CreateFullyConnected(const DeviceInfo& device_info, const OperationDef& definition, const FullyConnectedAttributes& attr) { FullyConnected result(definition, device_info); - result.UploadWeights(attr.weights); + result.UploadWeights(attr.weights, UseBufferForWeights(device_info)); TensorLinearDescriptor desc; desc.storage_type = LinearStorageType::TEXTURE_2D; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h index f1fc7dc199f..ec572b24fb5 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h @@ -16,19 +16,27 @@ limitations under the License. 
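// --- Illustrative sketch (not part of the patch) ---------------------------
// CPU model of the reduction used by the rewritten fully connected kernel
// above: each of the WG_Y lanes accumulates a partial sum over the source
// slices it owns (c = tid.y, tid.y + WG_Y, ...), and lane 0 then adds the
// partials, mirroring the __local temp[WG_X][WG_Y] buffer plus barrier.
// All names below are hypothetical.
#include <vector>

static float ReduceLikeFullyConnected(const std::vector<float>& slice_products,
                                      int wg_y) {
  std::vector<float> partial(wg_y, 0.0f);
  for (int c = 0; c < static_cast<int>(slice_products.size()); ++c) {
    partial[c % wg_y] += slice_products[c];  // work split across WG_Y lanes
  }
  float s = partial[0];                      // lane tid.y == 0 finishes the sum
  for (int i = 1; i < wg_y; ++i) {
    s += partial[i];
  }
  return s;
}
// ---------------------------------------------------------------------------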
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_H_
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_FULLY_CONNECTED_H_
+#include
+
+#include
+#include
#include
+#include "absl/memory/memory.h"
+#include "tensorflow/lite/delegates/gpu/cl/arguments.h"
#include "tensorflow/lite/delegates/gpu/cl/buffer.h"
+#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h"
+#include "tensorflow/lite/delegates/gpu/cl/device_info.h"
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h"
-#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h"
-#include "tensorflow/lite/delegates/gpu/cl/tensor.h"
-#include "tensorflow/lite/delegates/gpu/cl/util.h"
+#include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h"
+#include "tensorflow/lite/delegates/gpu/cl/precision.h"
+#include "tensorflow/lite/delegates/gpu/cl/texture2d.h"
#include "tensorflow/lite/delegates/gpu/common/data_type.h"
#include "tensorflow/lite/delegates/gpu/common/operations.h"
#include "tensorflow/lite/delegates/gpu/common/shape.h"
-#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/tensor.h"
#include "tensorflow/lite/delegates/gpu/common/types.h"
+#include "tensorflow/lite/delegates/gpu/common/util.h"
namespace tflite {
namespace gpu {
@@ -36,52 +44,77 @@ namespace cl {
template
void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor& weights,
- absl::Span dst) {
+ S* dst) {
const int src_channels = weights.shape.i;
const int padded_src_channels = AlignByN(src_channels, 4);
const int dst_channels = weights.shape.o;
const int padded_dst_channels = AlignByN(dst_channels, 4);
- // The weights are to be rearranged in such a way that the first 4 elements of
- // each row, starting from row_0, are copied onto the destination buffer. The
- // next set of 4 elements are then copied and so on. As an example, an 8x8
- // matrix would be rearranged as below.
+ // Change the traversal order of the weight matrix in the following way:
+ // The matrix is segmented into blocks of 4x4. If (any) dimension of the matrix
+ // size is not divisible by 4, then pad with zeros. Each block is stored
+ // contiguously. The 16 elements within a block are ordered as 4 elements of
+ // the first column, 4 elements of the second, etc. Blocks are then traversed
+ // columns first, rows last. As an example, an 8x8 matrix would be traversed
+ // as below.
//
- // | a0 a1 a2 a3 a4 a5 a6 a7 | | a0 a1 a2 a3 b0 b1 b2 b3 |
- // | b0 b1 b2 b3 b4 b5 b6 b7 | | c0 c1 c2 c3 d0 d1 d2 d3 |
- // | c0 c1 c2 c3 c4 c5 c6 c7 | | e0 e1 e2 e3 f0 f1 f2 f3 |
- // | d0 d1 d2 d3 d4 d5 d6 d7 | ---------> | g0 g1 g2 g3 h0 h1 h2 h3 |
- // | e0 e1 e2 e3 e4 e5 e6 e7 | | a4 a5 a6 a7 b4 b5 b6 b7 |
- // | f0 f1 f2 f3 f4 f5 f6 f7 | | c4 c5 c6 c7 d4 d5 d6 d7 |
- // | g0 g1 g2 g3 g4 g5 g6 g7 | | e4 e5 e6 e7 f4 f5 f6 f7 |
- // | h0 h1 h2 h3 h4 h5 h6 h7 | | g4 g5 g6 g7 h4 h5 h6 h7 |
+ // | 0 4 8 12 32 36 40 44 |
+ // | 1 5 9 13 33 37 41 45 |
+ // | 2 6 10 14 34 38 42 46 |
+ // | 3 7 11 15 35 39 43 47 |
+ // | 16 20 24 28 48 52 56 60 |
+ // | 17 21 25 29 49 53 57 61 |
+ // | 18 22 26 30 50 54 58 62 |
+ // | 19 23 27 31 51 55 59 63 |
+ //
+ // The benefit of doing this is that reading 16 contiguous elements gives a 4x4
+ // block of the matrix, where the first 4 elements are the first column of the
+ // block, the second 4 elements are the second column of the block, etc.
+ // Subsequent blocks contain elements of the same 4 columns.
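// --- Illustrative sketch (not part of the patch) ---------------------------
// Standalone helper showing the destination index implied by the block layout
// described in the comment above; it mirrors the dst_index computation in the
// loop that follows. The helper name is hypothetical and the function is only
// documentation.
static int BlockLayoutIndex(int y, int x, int padded_dst_channels) {
  const int block_x = x / 4, x_in_block = x % 4;  // x = input channel
  const int block_y = y / 4, y_in_block = y % 4;  // y = output channel
  return block_x * padded_dst_channels * 4        // skip whole block-columns
         + block_y * 16                           // skip blocks above, 16 elements each
         + x_in_block * 4 + y_in_block;           // column-major inside the block
}
// For the 8x8 example above (padded_dst_channels == 8), BlockLayoutIndex(1, 5, 8)
// evaluates to 32 + 0 + 4 + 1 = 37, matching row 1, column 5 of the diagram.
// ---------------------------------------------------------------------------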
- for (int y = 0; y < dst_channels; y++) { - int x = 0; - for (; x + 4 <= src_channels; x += 4) { - const int idx_data_0 = src_channels * y + x; - S filter = S(weights.data[idx_data_0], weights.data[idx_data_0 + 1], - weights.data[idx_data_0 + 2], weights.data[idx_data_0 + 3]); - dst[y + padded_dst_channels * x / 4] = filter; - } - - // If the width is not a multiple of 4, padding is required and the padded - // region is filled with zeros. - if (src_channels != padded_src_channels) { - const int idx_data_0 = src_channels * y + x; - - S filter = S(x < src_channels ? weights.data[idx_data_0] : 0.0, - x + 1 < src_channels ? weights.data[idx_data_0 + 1] : 0.0, - x + 2 < src_channels ? weights.data[idx_data_0 + 2] : 0.0, - x + 3 < src_channels ? weights.data[idx_data_0 + 3] : 0.0); - dst[y + padded_dst_channels * x / 4] = filter; + for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) { + for (int y_in_block = 0; y_in_block < 4; y_in_block++) { + for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) { + for (int x_in_block = 0; x_in_block < 4; x_in_block++) { + int y = 4 * block_y + y_in_block; + int x = 4 * block_x + x_in_block; + // Consider destination as an array with extents + // [padded_src_channels/4][padded_dst_channels/4][4][4] + int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 + + x_in_block * 4 + y_in_block; + if (x < src_channels && y < dst_channels) { + dst[dst_index] = weights.data[src_channels * y + x]; + } else { + dst[dst_index] = 0.0f; + } + } + } } } +} - // Fill the padded columns with zeros. - for (int y = dst_channels; y < padded_dst_channels; y++) { - for (int x = 0; x < padded_src_channels; x += 4) { - dst[y + padded_dst_channels * x / 4] = S(0.0); +template +void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor& weights, + S* dst) { + const int src_channels = weights.shape.i; + const int src_depth = DivideRoundUp(src_channels, 4); + const int dst_channels = weights.shape.o; + const int dst_depth = DivideRoundUp(dst_channels, 4); + + int counter = 0; + for (int d = 0; d < dst_depth; ++d) { + for (int s = 0; s < src_depth; ++s) { + for (int i = 0; i < 4; ++i) { + const int src_ch = s * 4 + i; + for (int j = 0; j < 4; ++j) { + const int dst_ch = d * 4 + j; + if (src_ch < src_channels && dst_ch < dst_channels) { + dst[counter++] = weights.data[dst_ch * src_channels + src_ch]; + } else { + dst[counter++] = 0.0f; + } + } + } } } } @@ -110,15 +143,16 @@ class FullyConnected : public GPUOperation { const FullyConnectedAttributes& attr); template - void UploadWeights(const tflite::gpu::Tensor& weights); + void UploadWeights(const tflite::gpu::Tensor& weights, + bool weights_are_buffer); std::string GetFullyConnectedKernelCode(const OperationDef& op_def, - const int3& work_group_size); + const DeviceInfo& device_info); }; template -void FullyConnected::UploadWeights( - const tflite::gpu::Tensor& weights) { +void FullyConnected::UploadWeights(const tflite::gpu::Tensor& weights, + bool weights_are_buffer) { const int src_depth = DivideRoundUp(weights.shape.i, 4); const int dst_depth = DivideRoundUp(weights.shape.o, 4); @@ -127,22 +161,40 @@ void FullyConnected::UploadWeights( const int float4_size = f32_weights ? 16 : 8; - BufferDescriptor desc; - desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; - desc.element_size = 16; - desc.size = float4_size * elements_count; - desc.data.resize(desc.size); + if (weights_are_buffer) { + BufferDescriptor desc; + desc.element_type = f32_weights ? 
DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 16; + desc.size = float4_size * elements_count; + desc.data.resize(desc.size); - if (f32_weights) { - float4* ptr = reinterpret_cast(desc.data.data()); - RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(ptr, elements_count)); + if (f32_weights) { + float* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToIOO4I4(weights, ptr); + } else { + half* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToIOO4I4(weights, ptr); + } + + args_.AddObject("weights", + absl::make_unique(std::move(desc))); } else { - half4* ptr = reinterpret_cast(desc.data.data()); - RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(ptr, elements_count)); - } + Texture2DDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.size = int2(src_depth * 4, dst_depth); + desc.data.resize(float4_size * elements_count); - args_.AddObject("weights", - absl::make_unique(std::move(desc))); + if (f32_weights) { + float* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToOIO4I4(weights, ptr); + } else { + half* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToOIO4I4(weights, ptr); + } + + args_.AddObject("weights", + absl::make_unique(std::move(desc))); + } } FullyConnected CreateFullyConnected(const DeviceInfo& device_info, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc index f58487c1941..c9853187b3c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/fully_connected_test.cc @@ -19,9 +19,15 @@ limitations under the License. #include #include +#include "tensorflow/lite/delegates/gpu/cl/environment.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" using ::testing::ElementsAreArray; using ::testing::FloatNear; @@ -39,7 +45,8 @@ TEST_F(OpenCLOperationTest, FullyConnected) { FullyConnectedAttributes attr; attr.weights.shape = OHWI(2, 1, 1, 4); - attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + attr.weights.data = {0.0f, 1.0f, 2.0f, 3.0f, // + 4.0f, 5.0f, 6.0f, 7.0f}; attr.bias.shape = Linear(2); attr.bias.data = {0.5f, -0.5f}; @@ -56,7 +63,101 @@ TEST_F(OpenCLOperationTest, FullyConnected) { CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr); ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, BHWC(1, 1, 1, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f})); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {14.5f, 37.5f})) + << "Failed using precision " << ToString(precision); + } + } +} + +TEST_F(OpenCLOperationTest, FullyConnectedLarge) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 1, 1, 8); + src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}; + + FullyConnectedAttributes attr; + attr.weights.shape = OHWI(12, 1, 1, 8); + attr.weights.data = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 
7.0f, // + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, // + 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, // + 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, // + 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, 38.0f, 39.0f, // + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, // + 48.0f, 49.0f, 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, // + 56.0f, 57.0f, 58.0f, 59.0f, 60.0f, 61.0f, 62.0f, 63.0f, // + 64.0f, 65.0f, 66.0f, 67.0f, 68.0f, 69.0f, 70.0f, 71.0f, // + 72.0f, 73.0f, 74.0f, 75.0f, 76.0f, 77.0f, 78.0f, 79.0f, // + 80.0f, 81.0f, 82.0f, 83.0f, 84.0f, 85.0f, 86.0f, 87.0f, // + 88.0f, 89.0f, 90.0f, 91.0f, 92.0f, 93.0f, 94.0f, 95.0f, // + }; + attr.bias.shape = Linear(12); + attr.bias.data = {-0.6f, -0.5f, -0.4f, -0.3f, -0.2f, -0.1f, + 0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 0.0f : 0.601f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + FullyConnected operation = + CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 1, 1, 12), &dst_tensor)); + EXPECT_THAT( + dst_tensor.data, + Pointwise(FloatNear(eps), + {139.4f, 363.5f, 587.6f, 811.7f, 1035.8f, 1259.9f, 1484.1f, + 1708.2f, 1932.3f, 2156.4f, 2380.5f, 2604.6f})) + << "Failed using precision " << ToString(precision); + } + } +} + +TEST_F(OpenCLOperationTest, FullyConnectedExtraLarge) { + static const int kInputSize = 1024; + static const int kOutputSize = 1024; + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 1, 1, kInputSize); + src_tensor.data.assign(kInputSize, 1.1f); + + FullyConnectedAttributes attr; + attr.weights.shape = OHWI(1024, 1, 1, kInputSize); + attr.weights.data.assign(kOutputSize * kInputSize, 2.2f); + attr.bias.shape = Linear(kOutputSize); + attr.bias.data.assign(kOutputSize, 3.3f); + + std::vector expected(kOutputSize, 2481.38f); + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + float eps; + switch (precision) { + case CalculationsPrecision::F32: + eps = 2.45e-3f; + break; + case CalculationsPrecision::F32_F16: + eps = 1.38f; + break; + case CalculationsPrecision::F16: + eps = 38.7f; + break; + } + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + FullyConnected operation = + CreateFullyConnected(creation_context_.GetDeviceInfo(), op_def, attr); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 1, 1, kOutputSize), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected)) + << "Failed using precision " << ToString(precision); } } } @@ -64,53 +165,74 @@ TEST_F(OpenCLOperationTest, FullyConnected) { TEST_F(OpenCLOperationTest, RearrageWeights) { tflite::gpu::Tensor weights; weights.shape = OHWI(8, 1, 1, 8); - weights.data = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 10.0, 11.0, - 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 20.0, 21.0, 22.0, 23.0, - 24.0, 
25.0, 26.0, 27.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, - 36.0, 37.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, - 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 60.0, 61.0, - 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 70.0, 71.0, 72.0, 73.0, - 74.0, 75.0, 76.0, 77.0}; - - std::vector expected_rearranged_data = { - 0.0, 1.0, 2.0, 3.0, 10.0, 11.0, 12.0, 13.0, 20.0, 21.0, 22.0, - 23.0, 30.0, 31.0, 32.0, 33.0, 40.0, 41.0, 42.0, 43.0, 50.0, 51.0, - 52.0, 53.0, 60.0, 61.0, 62.0, 63.0, 70.0, 71.0, 72.0, 73.0, 4.0, - 5.0, 6.0, 7.0, 14.0, 15.0, 16.0, 17.0, 24.0, 25.0, 26.0, 27.0, - 34.0, 35.0, 36.0, 37.0, 44.0, 45.0, 46.0, 47.0, 54.0, 55.0, 56.0, - 57.0, 64.0, 65.0, 66.0, 67.0, 74.0, 75.0, 76.0, 77.0, + weights.data = { + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, // + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, // + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, // + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, 37.0f, // + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, 47.0f, // + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, 57.0f, // + 60.0f, 61.0f, 62.0f, 63.0f, 64.0f, 65.0f, 66.0f, 67.0f, // + 70.0f, 71.0f, 72.0f, 73.0f, 74.0f, 75.0f, 76.0f, 77.0f // }; + std::vector expected_rearranged_data = { + // Top-left block + 0.0f, 10.0f, 20.0f, 30.0f, 1.0f, 11.0f, 21.0f, 31.0f, 2.0f, 12.0f, 22.0f, + 32.0f, 3.0f, 13.0f, 23.0f, 33.0f, + // Bottom-left block + 40.0f, 50.0f, 60.0f, 70.0f, 41.0f, 51.0f, 61.0f, 71.0f, 42.0f, 52.0f, + 62.0f, 72.0f, 43.0f, 53.0f, 63.0f, 73.0f, + // Top-right block + 4.0f, 14.0f, 24.0f, 34.0f, 5.0f, 15.0f, 25.0f, 35.0f, 6.0f, 16.0f, 26.0f, + 36.0f, 7.0f, 17.0f, 27.0f, 37.0f, + // Bottom-right block + 44.0f, 54.0f, 64.0f, 74.0f, 45.0f, 55.0f, 65.0f, 75.0f, 46.0f, 56.0f, + 66.0f, 76.0f, 47.0f, 57.0f, 67.0f, 77.0f}; + std::vector data(8 * 8); - float4* data_ptr = static_cast(static_cast(data.data())); - RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(data_ptr, 8 * 8 / 4)); + RearrangeFCWeightsToIOO4I4(weights, data.data()); EXPECT_THAT(data, ElementsAreArray(expected_rearranged_data)); } TEST_F(OpenCLOperationTest, RearrageWeightsWhenPaddingIsRequired) { tflite::gpu::Tensor weights; - weights.shape = OHWI(7, 1, 1, 7); + weights.shape = OHWI(9, 1, 1, 7); weights.data = { - 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 11.0, 12.0, - 13.0, 14.0, 15.0, 16.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, - 26.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 40.0, 41.0, - 42.0, 43.0, 44.0, 45.0, 46.0, 50.0, 51.0, 52.0, 53.0, 54.0, - 55.0, 56.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, + 0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, // + 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, // + 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, // + 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, 36.0f, // + 40.0f, 41.0f, 42.0f, 43.0f, 44.0f, 45.0f, 46.0f, // + 50.0f, 51.0f, 52.0f, 53.0f, 54.0f, 55.0f, 56.0f, // + 60.0f, 61.0f, 62.0f, 63.0f, 64.0f, 65.0f, 66.0f, // + 70.0f, 71.0f, 72.0f, 73.0f, 74.0f, 75.0f, 76.0f, // + 80.0f, 81.0f, 82.0f, 83.0f, 84.0f, 85.0f, 86.0f, // }; std::vector expected_rearranged_data = { - 0.0, 1.0, 2.0, 3.0, 10.0, 11.0, 12.0, 13.0, 20.0, 21.0, 22.0, - 23.0, 30.0, 31.0, 32.0, 33.0, 40.0, 41.0, 42.0, 43.0, 50.0, 51.0, - 52.0, 53.0, 60.0, 61.0, 62.0, 63.0, 0.0, 0.0, 0.0, 0.0, 4.0, - 5.0, 6.0, 0.0, 14.0, 15.0, 16.0, 0.0, 24.0, 25.0, 26.0, 0.0, - 34.0, 35.0, 36.0, 0.0, 44.0, 45.0, 46.0, 0.0, 54.0, 55.0, 56.0, - 0.0, 64.0, 65.0, 66.0, 0.0, 0.0, 0.0, 0.0, 0.0, - }; + // Top-left block + 0.0f, 10.0f, 20.0f, 30.0f, 1.0f, 11.0f, 21.0f, 31.0f, 2.0f, 12.0f, 22.0f, + 
32.0f, 3.0f, 13.0f, 23.0f, 33.0f, + // Mid-left block + 40.0f, 50.0f, 60.0f, 70.0f, 41.0f, 51.0f, 61.0f, 71.0f, 42.0f, 52.0f, + 62.0f, 72.0f, 43.0f, 53.0f, 63.0f, 73.0f, + // Bottom-left block + 80.0f, 0.0f, 0.0f, 0.0f, 81.0f, 0.0f, 0.0f, 0.0f, 82.0f, 0.0f, 0.0f, 0.0f, + 83.0f, 0.0f, 0.0f, 0.0f, + // Top-right block + 4.0f, 14.0f, 24.0f, 34.0f, 5.0f, 15.0f, 25.0f, 35.0f, 6.0f, 16.0f, 26.0f, + 36.0f, 0.0f, 0.0f, 0.0f, 0.0f, + // Mid-left block + 44.0f, 54.0f, 64.0f, 74.0f, 45.0f, 55.0f, 65.0f, 75.0f, 46.0f, 56.0f, + 66.0f, 76.0f, 0.0f, 0.0f, 0.0f, 0.0f, + // Bottom-right block + 84.0f, 0.0f, 0.0f, 0.0f, 85.0f, 0.0f, 0.0f, 0.0f, 86.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f, 0.0f}; - std::vector data(8 * 8); - float4* data_ptr = static_cast(static_cast(data.data())); - RearrangeFCWeightsToIOO4I4(weights, absl::MakeSpan(data_ptr, 8 * 8 / 4)); + std::vector data(12 * 8); + RearrangeFCWeightsToIOO4I4(weights, data.data()); EXPECT_THAT(data, ElementsAreArray(expected_rearranged_data)); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc index f9d6ec762ec..b39f03af846 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc @@ -49,6 +49,33 @@ std::string GetElementWiseCode(const OperationDef& op_def, return c; } +int3 GetWorkGroupsCount(int grid_dimension, const int3& grid_size, + const int3& work_group_size, + const int3& work_group_launch_order) { + int3 work_groups_count; + if (grid_dimension == 1) { + work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x); + work_groups_count.y = 1; + work_groups_count.z = 1; + } else if (grid_dimension == 2) { + int3 wgs; + wgs.x = DivideRoundUp(grid_size.x, work_group_size.x); + wgs.y = DivideRoundUp(grid_size.y, work_group_size.y); + work_groups_count.x = wgs[work_group_launch_order[0]]; + work_groups_count.y = wgs[work_group_launch_order[1]]; + work_groups_count.z = 1; + } else { // grid_dimension == 3 + int3 wgs; + wgs.x = DivideRoundUp(grid_size.x, work_group_size.x); + wgs.y = DivideRoundUp(grid_size.y, work_group_size.y); + wgs.z = DivideRoundUp(grid_size.z, work_group_size.z); + work_groups_count.x = wgs[work_group_launch_order[0]]; + work_groups_count.y = wgs[work_group_launch_order[1]]; + work_groups_count.z = wgs[work_group_launch_order[2]]; + } + return work_groups_count; +} + } // namespace DataType OperationDef::GetDataType() const { @@ -106,9 +133,12 @@ GPUOperation::GPUOperation(GPUOperation&& operation) src_(std::move(operation.src_)), dst_(std::move(operation.dst_)), kernel_(std::move(operation.kernel_)), + grid_dimension_(operation.grid_dimension_), + work_group_launch_order_(operation.work_group_launch_order_), grid_size_(operation.grid_size_), src_tensors_names_(std::move(operation.src_tensors_names_)), dst_tensors_names_(std::move(operation.dst_tensors_names_)), + work_groups_count_(operation.work_groups_count_), linkable_count_(operation.linkable_count_), elementwise_code_(std::move(operation.elementwise_code_)) {} @@ -126,9 +156,12 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) { src_ = std::move(operation.src_); dst_ = std::move(operation.dst_); kernel_ = std::move(operation.kernel_); + std::swap(grid_dimension_, operation.grid_dimension_); + std::swap(work_group_launch_order_, operation.work_group_launch_order_); std::swap(grid_size_, operation.grid_size_); src_tensors_names_ = std::move(operation.src_tensors_names_); dst_tensors_names_ = 
std::move(operation.dst_tensors_names_); + std::swap(work_groups_count_, operation.work_groups_count_); std::swap(linkable_count_, operation.linkable_count_); elementwise_code_ = std::move(operation.elementwise_code_); } @@ -183,12 +216,15 @@ absl::Status GPUOperation::UpdateParams() { for (int i = 0; i < dst_tensors_names_.size(); ++i) { RETURN_IF_ERROR(args_.SetObjectRef(dst_tensors_names_[i], dst_[i])); } - RETURN_IF_ERROR(BindArguments()); + RETURN_IF_ERROR(BindArguments(&args_)); grid_size_ = GetGridSize(); + work_groups_count_ = GetWorkGroupsCount( + grid_dimension_, grid_size_, work_group_size_, work_group_launch_order_); return absl::OkStatus(); } -absl::Status GPUOperation::Compile(const CreationContext& creation_context) { +absl::Status GPUOperation::AssembleCode(const DeviceInfo& device_info, + CLContext* context) { if (elementwise_) { auto src_desc = absl::make_unique(definition_.src_tensors[0]); @@ -206,29 +242,35 @@ absl::Status GPUOperation::Compile(const CreationContext& creation_context) { dst_tensors_names_.insert(dst_tensors_names_.begin(), "dst_tensor"); args_.AddObjectRef("dst_tensor", AccessType::WRITE, std::move(dst_desc)); - std::string code = - GetElementWiseCode(definition_, check_src_channels_size_); elementwise_code_ = "{\n" + code_ + "\n}\n" + elementwise_code_; - RETURN_IF_ERROR(args_.AllocateObjects(creation_context.context)); + code_ = GetElementWiseCode(definition_, check_src_channels_size_); + RETURN_IF_ERROR(args_.AllocateObjects(context)); RETURN_IF_ERROR(args_.TransformToCLCode( - creation_context.device->info_, - {{dst_tensors_names_[0], elementwise_code_}}, &code)); - code = absl::Substitute(code, args_.GetListOfArgs()); - RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( - code, "main_function", *creation_context.context, - *creation_context.device, &kernel_)); + device_info, {{dst_tensors_names_[0], elementwise_code_}}, &code_)); } else { - RETURN_IF_ERROR(args_.AllocateObjects(creation_context.context)); + RETURN_IF_ERROR(args_.AllocateObjects(context)); RETURN_IF_ERROR(args_.TransformToCLCode( - creation_context.device->info_, - {{dst_tensors_names_[0], elementwise_code_}}, &code_)); - RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( - code_, "main_function", compiler_options_, *creation_context.context, - *creation_context.device, &kernel_)); + device_info, {{dst_tensors_names_[0], elementwise_code_}}, &code_)); } + return absl::OkStatus(); +} + +absl::Status GPUOperation::Compile(const CreationContext& creation_context) { + RETURN_IF_ERROR( + AssembleCode(creation_context.GetDeviceInfo(), creation_context.context)); + RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( + code_, "main_function", compiler_options_, *creation_context.context, + *creation_context.device, &kernel_)); return PostCompileCheck(creation_context.device->info_, kernel_.info_); } +absl::Status GPUOperation::CompileDeserialized( + const CreationContext& creation_context) { + return creation_context.cache->GetOrCreateCLKernel( + code_, "main_function", compiler_options_, *creation_context.context, + *creation_context.device, &kernel_); +} + void GPUOperation::GetPossibleKernelWorkGroups( TuningType tuning_type, const DeviceInfo& device_info, const KernelInfo& kernel_info, std::vector* work_groups) const { @@ -246,14 +288,26 @@ absl::Status GPUOperation::Tune(const TuningParameters& params) { } if (possible_work_groups.size() == 1) { work_group_size_ = possible_work_groups[0]; + work_groups_count_ = + GetWorkGroupsCount(grid_dimension_, 
grid_size_, work_group_size_, + work_group_launch_order_); return absl::OkStatus(); } else { + std::vector work_groups_count(possible_work_groups.size()); + for (int i = 0; i < work_groups_count.size(); ++i) { + work_groups_count[i] = + GetWorkGroupsCount(grid_dimension_, grid_size_, + possible_work_groups[i], work_group_launch_order_); + } RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); int best_work_group_index; RETURN_IF_ERROR(params.queue->GetBestWorkGroupIndex( - kernel_, *params.info, grid_size_, possible_work_groups, + kernel_, *params.info, work_groups_count, possible_work_groups, &best_work_group_index)); work_group_size_ = possible_work_groups[best_work_group_index]; + work_groups_count_ = + GetWorkGroupsCount(grid_dimension_, grid_size_, work_group_size_, + work_group_launch_order_); return absl::OkStatus(); } } @@ -283,7 +337,7 @@ int3 GPUOperation::GetGridSize() const { const int grid_z = 1; return int3(grid_x, grid_y, grid_z); } - return int3(0, 0, 0); + return grid_size_; } void GPUOperation::AddUniquePostfix(const std::string& unique_postfix) { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h index 2fa8c90c1da..57d8690c54e 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h @@ -16,20 +16,24 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_GPU_OPERATION_H_ -#include #include #include #include "tensorflow/lite/delegates/gpu/cl/arguments.h" #include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_program.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/program_cache.h" +#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" #include "tensorflow/lite/delegates/gpu/cl/tensor.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/types.h" @@ -117,7 +121,7 @@ class GPUOperation { absl::Status AddToQueue(CLCommandQueue* queue) { RETURN_IF_ERROR(args_.Bind(kernel_.kernel())); - return queue->DispatchImplicit(kernel_, grid_size_, work_group_size_); + return queue->Dispatch(kernel_, work_groups_count_, work_group_size_); } virtual void GetPossibleKernelWorkGroups( @@ -126,8 +130,12 @@ class GPUOperation { absl::Status Tune(const TuningParameters& params); + absl::Status AssembleCode(const DeviceInfo& device_info, CLContext* context); + absl::Status Compile(const CreationContext& creation_context); + absl::Status CompileDeserialized(const CreationContext& creation_context); + virtual absl::Status PostCompileCheck(const DeviceInfo& device_info, const KernelInfo& kernel_info) { return absl::OkStatus(); @@ -161,7 +169,14 @@ class GPUOperation { bool check_src_channels_size_ = false; protected: - virtual absl::Status 
BindArguments() { return absl::OkStatus(); } + friend flatbuffers::Offset Encode( + const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder); + friend absl::Status Decode(CLContext* context, + const data::GPUOperation* fb_op, GPUOperation* op); + + virtual absl::Status BindArguments(ArgumentsBinder* args) { + return absl::OkStatus(); + } virtual int3 GetGridSize() const; // Defines operation calculation precision and format of src/dst tensors. @@ -169,11 +184,14 @@ class GPUOperation { std::vector src_; std::vector dst_; CLKernel kernel_; + int grid_dimension_ = 3; // can be 1, 2 or 3 + int3 work_group_launch_order_ = int3(0, 1, 2); int3 grid_size_ = int3(0, 0, 0); std::vector src_tensors_names_; std::vector dst_tensors_names_; private: + int3 work_groups_count_ = int3(0, 0, 0); int linkable_count_ = 0; std::string elementwise_code_; // temporary, used during op construction }; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/lstm_full_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/lstm_full_test.cc new file mode 100644 index 00000000000..08cb622ff91 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/lstm_full_test.cc @@ -0,0 +1,1181 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Unit test for TFLite LSTM op. 
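+//
+// LSTMOpModel below wraps SingleOpModel: it registers the LSTM's 20 (legacy)
+// or 24 input tensors as constants, attaches the GPU delegate in OpenCL-only
+// mode, and its destructor verifies that no op fell back to a CPU kernel.
+// LstmOpTest::VerifyGoldens then drives the model step by step and compares
+// the outputs against precomputed goldens within a tolerance.
+//
+// A minimal usage sketch (see the tests below for the full constructor
+// argument list; `input` here stands for one step of n_input floats and
+// `expected`/`tolerance` are as in VerifyGoldens):
+//
+//   LSTMOpModel lstm(n_batch, n_input, n_cell, n_output, ...);
+//   lstm.SetInput(/*offset=*/0, input.data(), input.data() + n_input);
+//   lstm.Invoke();
+//   EXPECT_THAT(lstm.GetOutput(),
+//               ElementsAreArray(ArrayFloatNear(expected, tolerance)));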
+ +#include +#include + +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/delegates/gpu/delegate.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +class LSTMOpModel : public SingleOpModel { + public: + LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg, + bool use_peephole, bool use_projection_weights, + bool use_projection_bias, const TensorType weight_type, + bool model_has_legacy_20_inputs, bool is_layer_norm, + bool asymmetric_quantize_inputs, + std::initializer_list input_to_input_weights, + std::initializer_list input_to_forget_weights, + std::initializer_list input_to_cell_weights, + std::initializer_list input_to_output_weights, + std::initializer_list recurrent_to_input_weights, + std::initializer_list recurrent_to_forget_weights, + std::initializer_list recurrent_to_cell_weights, + std::initializer_list recurrent_to_output_weights, + std::initializer_list cell_to_input_weights, + std::initializer_list cell_to_forget_weights, + std::initializer_list cell_to_output_weights, + std::initializer_list input_gate_bias, + std::initializer_list forget_gate_bias, + std::initializer_list cell_gate_bias, + std::initializer_list output_gate_bias, + std::initializer_list projection_weights, + std::initializer_list projection_bias, + std::initializer_list input_layer_norm_coefficients, + std::initializer_list forget_layer_norm_coefficients, + std::initializer_list cell_layer_norm_coefficients, + std::initializer_list output_layer_norm_coefficients) + : n_input_(n_input), + n_output_(n_output), + n_batch_(n_batch), + weight_type_(weight_type) { + input_ = AddInput({TensorType_FLOAT32, {n_batch, n_input}}); + + if (use_cifg) { + AddNullInput(); + } else { + AddConstInput({weight_type, {n_cell, n_input}}, input_to_input_weights); + } + AddConstInput({weight_type, {n_cell, n_input}}, input_to_forget_weights); + AddConstInput({weight_type, {n_cell, n_input}}, input_to_cell_weights); + AddConstInput({weight_type, {n_cell, n_input}}, input_to_output_weights); + + if (use_cifg) { + AddNullInput(); + } else { + AddConstInput({weight_type, {n_cell, n_output}}, + recurrent_to_input_weights); + } + AddConstInput({weight_type, {n_cell, n_output}}, + recurrent_to_forget_weights); + AddConstInput({weight_type, {n_cell, n_output}}, recurrent_to_cell_weights); + AddConstInput({weight_type, {n_cell, n_output}}, + recurrent_to_output_weights); + + if (use_peephole) { + if (use_cifg) { + AddNullInput(); + } else { + AddConstInput({weight_type, {n_cell}}, cell_to_input_weights); + } + AddConstInput({weight_type, {n_cell}}, cell_to_forget_weights); + AddConstInput({weight_type, {n_cell}}, cell_to_output_weights); + } else { + AddNullInput(); + AddNullInput(); + AddNullInput(); + } + + if (use_cifg) { + AddNullInput(); + } else { + AddConstInput({TensorType_FLOAT32, {n_cell}}, input_gate_bias); + } + AddConstInput({TensorType_FLOAT32, {n_cell}}, forget_gate_bias); + AddConstInput({TensorType_FLOAT32, {n_cell}}, cell_gate_bias); + AddConstInput({TensorType_FLOAT32, {n_cell}}, output_gate_bias); + + if (use_projection_weights) { + AddConstInput({weight_type, {n_output, n_cell}}, projection_weights); + } else { + AddNullInput(); + } + if (use_projection_bias) { + CHECK(use_projection_weights); + AddConstInput({TensorType_FLOAT32, {n_output}}, projection_bias); + } else 
{ + AddNullInput(); + } + + // Adding the 2 state tensors. + AddVariableInput({TensorType_FLOAT32, {n_batch, n_output}}); + AddVariableInput({TensorType_FLOAT32, {n_batch, n_cell}}); + + // Layer norm weights. + if (!model_has_legacy_20_inputs) { + if (is_layer_norm) { + if (use_cifg) { + AddNullInput(); + } else { + AddConstInput({TensorType_FLOAT32, {n_cell}}, + input_layer_norm_coefficients); + } + AddConstInput({TensorType_FLOAT32, {n_cell}}, + forget_layer_norm_coefficients); + AddConstInput({TensorType_FLOAT32, {n_cell}}, + cell_layer_norm_coefficients); + AddConstInput({TensorType_FLOAT32, {n_cell}}, + output_layer_norm_coefficients); + } else { + AddNullInput(); + AddNullInput(); + AddNullInput(); + AddNullInput(); + } + } + + output_ = AddOutput({TensorType_FLOAT32, {n_batch, n_output}}); + + // TODO(b/161825581): Add tests where cell_clip and/or proj_clip is not the + // default 0. + SetBuiltinOp( + BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, + CreateLSTMOptions(builder_, ActivationFunctionType_TANH, + /*cell_clip=*/0.0f, /*proj_clip=*/0.0f, + LSTMKernelType_FULL, asymmetric_quantize_inputs) + .Union()); + + // Input shapes are already set up, no need to pass them again. + BuildInterpreter(/*input_shapes=*/{}, /*num_threads=*/-1, + /*allow_fp32_relax_to_fp16=*/false, + /*apply_delegate=*/false); + + auto options = TfLiteGpuDelegateOptionsV2Default(); + // MeanStddevNormalization is only implemented in OpenCL now. + options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY; + SetDelegate(TfLiteGpuDelegateV2Create(&options)); + } + + ~LSTMOpModel() { EXPECT_EQ(CountOpsExecutedByCpuKernel(), 0); } + + void SetInput(int offset, const float* begin, const float* end) { + SingleOpModel::PopulateTensor(input_, offset, const_cast(begin), + const_cast(end)); + } + + std::vector GetOutput() { return ExtractVector(output_); } + + int num_inputs() { return n_input_; } + int num_outputs() { return n_output_; } + int num_batches() { return n_batch_; } + + protected: + int input_; + int output_; + + int n_input_; + int n_output_; + int n_batch_; + + private: + const TensorType weight_type_; +}; + +// GetParam() => model_has_legacy_20_inputs +class LstmOpTest : public ::testing::TestWithParam { + protected: + // Weights of the LSTM model. Some are optional. + std::initializer_list input_to_input_weights_; + std::initializer_list input_to_forget_weights_; + std::initializer_list input_to_cell_weights_; + std::initializer_list input_to_output_weights_; + std::initializer_list recurrent_to_input_weights_; + std::initializer_list recurrent_to_forget_weights_; + std::initializer_list recurrent_to_cell_weights_; + std::initializer_list recurrent_to_output_weights_; + std::initializer_list cell_to_input_weights_; + std::initializer_list cell_to_forget_weights_; + std::initializer_list cell_to_output_weights_; + std::initializer_list input_gate_bias_; + std::initializer_list forget_gate_bias_; + std::initializer_list cell_gate_bias_; + std::initializer_list output_gate_bias_; + std::initializer_list projection_weights_; + std::initializer_list input_layer_norm_coefficients_; + std::initializer_list forget_layer_norm_coefficients_; + std::initializer_list cell_layer_norm_coefficients_; + std::initializer_list output_layer_norm_coefficients_; + + // LSTM input is stored as num_steps * num_batch * num_inputs vector. + std::vector>> lstm_input_; + // LSTM output is stored as num_steps * num_batch * num_outputs vector. 
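+  // Both containers are indexed as [step][batch][element]; VerifyGoldens
+  // below walks them in lockstep.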
+ std::vector>> lstm_golden_output_; + + // Compares output up to tolerance to the result of the lstm given the input. + void VerifyGoldens(LSTMOpModel* lstm, float tolerance) { + EXPECT_EQ(lstm->ApplyDelegate(), kTfLiteOk); + + const int num_inputs = lstm->num_inputs(); + const int num_outputs = lstm->num_outputs(); + const int num_batches = lstm->num_batches(); + + ASSERT_EQ(lstm_input_.size(), lstm_golden_output_.size()); + const int num_steps = lstm_input_.size(); + + for (int i = 0; i < num_steps; ++i) { + ASSERT_EQ(num_batches, lstm_input_[i].size()); + for (int b = 0; b < num_batches; ++b) { + ASSERT_EQ(num_inputs, lstm_input_[i][b].size()); + const float* batch_start = lstm_input_[i][b].data(); + const float* batch_end = batch_start + num_inputs; + lstm->SetInput(b * num_inputs, batch_start, batch_end); + } + + lstm->Invoke(); + + std::vector expected; + ASSERT_EQ(num_batches, lstm_golden_output_[i].size()); + for (int b = 0; b < num_batches; ++b) { + ASSERT_EQ(num_outputs, lstm_golden_output_[i][b].size()); + const float* batch_start = lstm_golden_output_[i][b].data(); + const float* batch_end = batch_start + num_outputs; + expected.insert(expected.end(), batch_start, batch_end); + } + + EXPECT_THAT(lstm->GetOutput(), + ElementsAreArray(ArrayFloatNear(expected, tolerance))); + } + } +}; + +TEST_P(LstmOpTest, NoCifg_NoPeephole_NoProjection_NoLayerNorm) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + + bool model_has_legacy_20_inputs = GetParam(); + + input_to_input_weights_ = {-0.45018822, -0.02338299, -0.0870589, -0.34550029, + 0.04266912, -0.15680569, -0.34856534, 0.43890524}; + input_to_cell_weights_ = {-0.50013041, 0.1370284, 0.11810488, 0.2013163, + -0.20583314, 0.44344562, 0.22077113, -0.29909778}; + input_to_forget_weights_ = {0.09701663, 0.20334584, -0.50592935, + -0.31343272, -0.40032279, 0.44781327, + 0.01387155, -0.35593212}; + input_to_output_weights_ = {-0.25065863, -0.28290087, 0.04613829, 0.40525138, + 0.44272184, 0.03897077, -0.1556896, 0.19487578}; + input_gate_bias_ = {0., 0., 0., 0.}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_input_weights_ = { + -0.0063535, -0.2042388, 0.31454784, -0.35746509, + 0.28902304, 0.08183324, -0.16555229, 0.02286911, + -0.13566875, 0.03034258, 0.48091322, -0.12528998, + 0.24077177, -0.51332325, -0.33502164, 0.10629296}; + + recurrent_to_cell_weights_ = { + -0.3407414, 0.24443203, -0.2078532, 0.26320225, + 0.05695659, -0.00123841, -0.4744786, -0.35869038, + -0.06418842, -0.13502428, -0.501764, 0.22830659, + -0.46367589, 0.26016325, -0.03894562, -0.16368064}; + + recurrent_to_forget_weights_ = { + -0.48684245, -0.06655136, 0.42224967, 0.2112639, + 0.27654213, 0.20864892, -0.07646349, 0.45877004, + 0.00141793, -0.14609534, 0.36447752, 0.09196436, + 0.28053468, 0.01560611, -0.20127171, -0.01140004}; + + recurrent_to_output_weights_ = { + 0.43385774, -0.17194885, 0.2718237, 0.09215671, + 0.24107647, -0.39835793, 0.18212086, 0.01301402, + 0.48572797, -0.50656658, 0.20047462, -0.20607421, + -0.51818722, -0.15390486, 0.0468148, 0.39922136}; + + // num_steps * num_batch * num_inputs + lstm_input_ = {{{2., 3.}}, {{3., 4.}}, {{1., 1.}}}; + // num_steps * num_batch * num_outputs + lstm_golden_output_ = {{{-0.02973187, 0.1229473, 0.20885126, -0.15358765}}, + {{-0.03716109, 0.12507336, 0.41193449, -0.20860538}}, + {{-0.15053082, 
0.09120187, 0.24278517, -0.12222792}}}; + + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*weight_type=*/TensorType_FLOAT32, + model_has_legacy_20_inputs, + /*is_layer_norm=*/false, /*asymmetric_quantize_inputs=*/false, + input_to_input_weights_, input_to_forget_weights_, input_to_cell_weights_, + input_to_output_weights_, recurrent_to_input_weights_, + recurrent_to_forget_weights_, recurrent_to_cell_weights_, + recurrent_to_output_weights_, cell_to_input_weights_, + cell_to_forget_weights_, cell_to_output_weights_, input_gate_bias_, + forget_gate_bias_, cell_gate_bias_, output_gate_bias_, + projection_weights_, {}, input_layer_norm_coefficients_, + forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, + output_layer_norm_coefficients_); + + VerifyGoldens(&lstm, 0.00001f); +} + +TEST_P(LstmOpTest, Cifg_Peephole_NoProjection_NoLayerNorm) { + const int n_batch = 1; + const int n_input = 2; + // n_cell and n_output have the same size when there is no projection. + const int n_cell = 4; + const int n_output = 4; + + bool model_has_legacy_20_inputs = GetParam(); + + input_to_cell_weights_ = {-0.49770179, -0.27711356, -0.09624726, 0.05100781, + 0.04717243, 0.48944736, -0.38535351, -0.17212132}; + + input_to_forget_weights_ = {-0.55291498, -0.42866567, 0.13056988, -0.3633365, + -0.22755712, 0.28253698, 0.24407166, 0.33826375}; + + input_to_output_weights_ = {0.10725588, -0.02335852, -0.55932593, + -0.09426838, -0.44257352, 0.54939759, + 0.01533556, 0.42751634}; + cell_gate_bias_ = {0., 0., 0., 0.}; + forget_gate_bias_ = {1., 1., 1., 1.}; + output_gate_bias_ = {0., 0., 0., 0.}; + + recurrent_to_cell_weights_ = { + 0.54066205, -0.32668582, -0.43562764, -0.56094903, + 0.42957711, 0.01841056, -0.32764608, -0.33027974, + -0.10826075, 0.20675004, 0.19069612, -0.03026325, + -0.54532051, 0.33003211, 0.44901288, 0.21193194}; + + recurrent_to_forget_weights_ = { + -0.13832897, -0.0515101, -0.2359007, -0.16661474, + -0.14340827, 0.36986142, 0.23414481, 0.55899, + 0.10798943, -0.41174671, 0.17751795, -0.34484994, + -0.35874045, -0.11352962, 0.27268326, 0.54058349}; + + recurrent_to_output_weights_ = { + 0.41613156, 0.42610586, -0.16495961, -0.5663873, + 0.30579174, -0.05115908, -0.33941799, 0.23364776, + 0.11178309, 0.09481031, -0.26424935, 0.46261835, + 0.50248802, 0.26114327, -0.43736315, 0.33149987}; + + cell_to_forget_weights_ = {0.47485286, -0.51955009, -0.24458408, 0.31544167}; + cell_to_output_weights_ = {-0.17135078, 0.82760304, 0.85573703, -0.77109635}; + + lstm_input_ = {{{2., 3.}}, {{3., 4.}}, {{1., 1.}}}; + lstm_golden_output_ = {{{-0.36444446, -0.00352185, 0.12886585, -0.05163646}}, + {{-0.42312205, -0.01218222, 0.24201041, -0.08124574}}, + {{-0.358325, -0.04621704, 0.21641694, -0.06471302}}}; + + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/true, /*use_peephole=*/true, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, /*weight_type=*/TensorType_FLOAT32, + model_has_legacy_20_inputs, + /*is_layer_norm=*/false, /*asymmetric_quantize_inputs=*/false, + input_to_input_weights_, input_to_forget_weights_, input_to_cell_weights_, + input_to_output_weights_, recurrent_to_input_weights_, + recurrent_to_forget_weights_, recurrent_to_cell_weights_, + recurrent_to_output_weights_, cell_to_input_weights_, + cell_to_forget_weights_, cell_to_output_weights_, input_gate_bias_, + forget_gate_bias_, cell_gate_bias_, output_gate_bias_, 
+ projection_weights_, {}, input_layer_norm_coefficients_, + forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, + output_layer_norm_coefficients_); + + VerifyGoldens(&lstm, 0.00001f); +} + +TEST_P(LstmOpTest, NoCifg_Peephole_Projection_NoLayerNorm) { + const int n_batch = 1; + const int n_input = 5; + const int n_cell = 20; + const int n_output = 16; + + bool model_has_legacy_20_inputs = GetParam(); + + input_to_input_weights_ = { + 0.021393683, 0.06124551, 0.046905167, -0.014657677, -0.03149463, + 0.09171803, 0.14647801, 0.10797193, -0.0057968358, 0.0019193048, + -0.2726754, 0.10154029, -0.018539885, 0.080349885, -0.10262385, + -0.022599787, -0.09121155, -0.008675967, -0.045206103, -0.0821282, + -0.008045952, 0.015478081, 0.055217247, 0.038719587, 0.044153627, + -0.06453243, 0.05031825, -0.046935108, -0.008164439, 0.014574226, + -0.1671009, -0.15519552, -0.16819797, -0.13971269, -0.11953059, + 0.25005487, -0.22790983, 0.009855087, -0.028140958, -0.11200698, + 0.11295408, -0.0035217577, 0.054485075, 0.05184695, 0.064711206, + 0.10989193, 0.11674786, 0.03490607, 0.07727357, 0.11390585, + -0.1863375, -0.1034451, -0.13945189, -0.049401227, -0.18767063, + 0.042483903, 0.14233552, 0.13832581, 0.18350165, 0.14545603, + -0.028545704, 0.024939531, 0.050929718, 0.0076203286, -0.0029723682, + -0.042484224, -0.11827596, -0.09171104, -0.10808628, -0.16327988, + -0.2273378, -0.0993647, -0.017155107, 0.0023917493, 0.049272764, + 0.0038534778, 0.054764505, 0.089753784, 0.06947234, 0.08014476, + -0.04544234, -0.0497073, -0.07135631, -0.048929106, -0.004042012, + -0.009284026, 0.018042054, 0.0036860977, -0.07427302, -0.11434604, + -0.018995456, 0.031487543, 0.012834908, 0.019977754, 0.044256654, + -0.39292613, -0.18519334, -0.11651281, -0.06809892, 0.011373677}; + + input_to_forget_weights_ = { + -0.0018401089, -0.004852237, 0.03698424, 0.014181704, 0.028273236, + -0.016726194, -0.05249759, -0.10204261, 0.00861066, -0.040979505, + -0.009899187, 0.01923892, -0.028177269, -0.08535103, -0.14585495, + 0.10662567, -0.01909731, -0.017883534, -0.0047269356, -0.045103323, + 0.0030784295, 0.076784775, 0.07463696, 0.094531395, 0.0814421, + -0.12257899, -0.033945758, -0.031303465, 0.045630626, 0.06843887, + -0.13492945, -0.012480007, -0.0811829, -0.07224499, -0.09628791, + 0.045100946, 0.0012300825, 0.013964662, 0.099372394, 0.02543059, + 0.06958324, 0.034257296, 0.0482646, 0.06267997, 0.052625068, + 0.12784666, 0.07077897, 0.025725935, 0.04165009, 0.07241905, + 0.018668644, -0.037377294, -0.06277783, -0.08833636, -0.040120605, + -0.011405586, -0.007808335, -0.010301386, -0.005102167, 0.027717464, + 0.05483423, 0.11449111, 0.11289652, 0.10939839, 0.13396506, + -0.08402166, -0.01901462, -0.044678304, -0.07720565, 0.014350063, + -0.11757958, -0.0652038, -0.08185733, -0.076754324, -0.092614375, + 0.10405491, 0.052960336, 0.035755895, 0.035839386, -0.012540553, + 0.036881298, 0.02913376, 0.03420159, 0.05448447, -0.054523353, + 0.02582715, 0.02327355, -0.011857179, -0.0011980024, -0.034641717, + -0.026125094, -0.17582615, -0.15923657, -0.27486774, -0.0006143371, + 0.0001771948, -8.470171e-05, 0.02651807, 0.045790765, 0.06956496}; + + input_to_cell_weights_ = { + -0.04580283, -0.09549462, -0.032418985, -0.06454633, -0.043528453, + 0.043018587, -0.049152344, -0.12418144, -0.078985475, -0.07596889, + 0.019484362, -0.11434962, -0.0074034138, -0.06314844, -0.092981495, + 0.0062155537, -0.025034338, -0.0028890965, 0.048929527, 0.06235075, + 0.10665918, -0.032036792, -0.08505916, -0.10843358, -0.13002433, + 
-0.036816437, -0.02130134, -0.016518239, 0.0047691227, -0.0025825808, + 0.066017866, 0.029991534, -0.10652836, -0.1037554, -0.13056071, + -0.03266643, -0.033702414, -0.006473424, -0.04611692, 0.014419339, + -0.025174323, 0.0396852, 0.081777506, 0.06157468, 0.10210095, + -0.009658194, 0.046511717, 0.03603906, 0.0069369148, 0.015960095, + -0.06507666, 0.09551598, 0.053568836, 0.06408714, 0.12835667, + -0.008714329, -0.20211966, -0.12093674, 0.029450472, 0.2849013, + -0.029227901, 0.1164364, -0.08560263, 0.09941786, -0.036999565, + -0.028842626, -0.0033637602, -0.017012902, -0.09720865, -0.11193351, + -0.029155117, -0.017936034, -0.009768936, -0.04223324, -0.036159635, + 0.06505112, -0.021742892, -0.023377212, -0.07221364, -0.06430552, + 0.05453865, 0.091149814, 0.06387331, 0.007518393, 0.055960953, + 0.069779344, 0.046411168, 0.10509911, 0.07463894, 0.0075130584, + 0.012850982, 0.04555431, 0.056955688, 0.06555285, 0.050801456, + -0.009862683, 0.00826772, -0.026555609, -0.0073611983, -0.0014897042}; + + input_to_output_weights_ = { + -0.0998932, -0.07201956, -0.052803773, -0.15629593, -0.15001918, + -0.07650751, 0.02359855, -0.075155355, -0.08037709, -0.15093534, + 0.029517552, -0.04751393, 0.010350531, -0.02664851, -0.016839722, + -0.023121163, 0.0077019283, 0.012851257, -0.05040649, -0.0129761, + -0.021737747, -0.038305793, -0.06870586, -0.01481247, -0.001285394, + 0.10124236, 0.083122835, 0.053313006, -0.062235646, -0.075637154, + -0.027833903, 0.029774971, 0.1130802, 0.09218906, 0.09506135, + -0.086665764, -0.037162706, -0.038880914, -0.035832845, -0.014481564, + -0.09825003, -0.12048569, -0.097665586, -0.05287633, -0.0964047, + -0.11366429, 0.035777505, 0.13568819, 0.052451383, 0.050649304, + 0.05798951, -0.021852335, -0.099848844, 0.014740475, -0.078897946, + 0.04974699, 0.014160473, 0.06973932, 0.04964942, 0.033364646, + 0.08190124, 0.025535367, 0.050893165, 0.048514254, 0.06945813, + -0.078907564, -0.06707616, -0.11844508, -0.09986688, -0.07509403, + 0.06263226, 0.14925587, 0.20188436, 0.12098451, 0.14639415, + 0.0015017595, -0.014267382, -0.03417257, 0.012711468, 0.0028300495, + -0.024758482, -0.05098548, -0.0821182, 0.014225672, 0.021544158, + 0.08949725, 0.07505268, -0.0020780868, 0.04908258, 0.06476295, + -0.022907063, 0.027562456, 0.040185735, 0.019567577, -0.015598739, + -0.049097303, -0.017121866, -0.083368234, -0.02332002, -0.0840956}; + + input_gate_bias_ = {0.02234832, 0.14757581, 0.18176508, 0.10380666, + 0.053110216, -0.06928846, -0.13942584, -0.11816189, + 0.19483899, 0.03652339, -0.10250295, 0.036714908, + -0.18426876, 0.036065217, 0.21810818, 0.02383196, + -0.043370757, 0.08690144, -0.04444982, 0.00030581196}; + + forget_gate_bias_ = {0.035185695, -0.042891346, -0.03032477, 0.23027696, + 0.11098921, 0.15378423, 0.09263801, 0.09790885, + 0.09508917, 0.061199076, 0.07665568, -0.015443159, + -0.03499149, 0.046190713, 0.08895977, 0.10899629, + 0.40694186, 0.06030037, 0.012413437, -0.06108739}; + + cell_gate_bias_ = {-0.024379363, 0.0055531194, 0.23377132, 0.033463873, + -0.1483596, -0.10639995, -0.091433935, 0.058573797, + -0.06809782, -0.07889636, -0.043246906, -0.09829136, + -0.4279842, 0.034901652, 0.18797937, 0.0075234566, + 0.016178843, 0.1749513, 0.13975595, 0.92058027}; + + output_gate_bias_ = {0.046159424, -0.0012809046, 0.03563469, 0.12648113, + 0.027195795, 0.35373217, -0.018957434, 0.008907322, + -0.0762701, 0.12018895, 0.04216877, 0.0022856654, + 0.040952638, 0.3147856, 0.08225149, -0.057416286, + -0.14995944, -0.008040261, 0.13208859, 0.029760877}; + + 
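+  // The recurrent weights below are flattened {n_cell, n_output} = {20, 16}
+  // matrices (320 values each), matching the shapes LSTMOpModel registers via
+  // AddConstInput.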
recurrent_to_input_weights_ = { + -0.001374326, -0.078856036, 0.10672688, 0.029162422, + -0.11585556, 0.02557986, -0.13446963, -0.035785314, + -0.01244275, 0.025961924, -0.02337298, -0.044228926, + -0.055839065, -0.046598054, -0.010546039, -0.06900766, + 0.027239809, 0.022582639, -0.013296484, -0.05459212, + 0.08981, -0.045407712, 0.08682226, -0.06867011, + -0.14390695, -0.02916037, 0.000996957, 0.091420636, + 0.14283475, -0.07390571, -0.06402044, 0.062524505, + -0.093129106, 0.04860203, -0.08364217, -0.08119002, + 0.009352075, 0.22920375, 0.0016303885, 0.11583097, + -0.13732095, 0.012405723, -0.07551853, 0.06343048, + 0.12162708, -0.031923793, -0.014335606, 0.01790974, + -0.10650317, -0.0724401, 0.08554849, -0.05727212, + 0.06556731, -0.042729504, -0.043227166, 0.011683251, + -0.013082158, -0.029302018, -0.010899579, -0.062036745, + -0.022509435, -0.00964907, -0.01567329, 0.04260106, + -0.07787477, -0.11576462, 0.017356863, 0.048673786, + -0.017577527, -0.05527947, -0.082487635, -0.040137455, + -0.10820036, -0.04666372, 0.022746278, -0.07851417, + 0.01068115, 0.032956902, 0.022433773, 0.0026891115, + 0.08944216, -0.0685835, 0.010513544, 0.07228705, + 0.02032331, -0.059686817, -0.0005566496, -0.086984694, + 0.040414046, -0.1380399, 0.094208956, -0.05722982, + 0.012092817, -0.04989123, -0.086576, -0.003399834, + -0.04696032, -0.045747425, 0.10091314, 0.048676282, + -0.029037097, 0.031399418, -0.0040285117, 0.047237843, + 0.09504992, 0.041799378, -0.049185462, -0.031518843, + -0.10516937, 0.026374253, 0.10058866, -0.0033195973, + -0.041975245, 0.0073591834, 0.0033782164, -0.004325073, + -0.10167381, 0.042500053, -0.01447153, 0.06464186, + -0.017142897, 0.03312627, 0.009205989, 0.024138335, + -0.011337001, 0.035530265, -0.010912711, 0.0706555, + -0.005894094, 0.051841937, -0.1401738, -0.02351249, + 0.0365468, 0.07590991, 0.08838724, 0.021681072, + -0.10086113, 0.019608743, -0.06195883, 0.077335775, + 0.023646897, -0.095322326, 0.02233014, 0.09756986, + -0.048691444, -0.009579111, 0.07595467, 0.11480546, + -0.09801813, 0.019894179, 0.08502348, 0.004032281, + 0.037211012, 0.068537936, -0.048005626, -0.091520436, + -0.028379958, -0.01556313, 0.06554592, -0.045599163, + -0.01672207, -0.020169014, -0.011877351, -0.20212261, + 0.010889619, 0.0047078193, 0.038385306, 0.08540671, + -0.017140968, -0.0035865551, 0.016678626, 0.005633034, + 0.015963363, 0.00871737, 0.060130805, 0.028611384, + 0.10109069, -0.015060172, -0.07894427, 0.06401885, + 0.011584063, -0.024466386, 0.0047652307, -0.09041358, + 0.030737216, -0.0046374933, 0.14215417, -0.11823516, + 0.019899689, 0.006106124, -0.027092824, 0.0786356, + 0.05052217, -0.058925, -0.011402121, -0.024987547, + -0.0013661642, -0.06832946, -0.015667673, -0.1083353, + -0.00096863037, -0.06988685, -0.053350925, -0.027275559, + -0.033664223, -0.07978348, -0.025200296, -0.017207067, + -0.058403496, -0.055697463, 0.005798788, 0.12965427, + -0.062582195, 0.0013350133, -0.10482091, 0.0379771, + 0.072521195, -0.0029455067, -0.13797039, -0.03628521, + 0.013806405, -0.017858358, -0.01008298, -0.07700066, + -0.017081132, 0.019358726, 0.0027079724, 0.004635139, + 0.062634714, -0.02338735, -0.039547626, -0.02050681, + 0.03385117, -0.083611414, 0.002862572, -0.09421313, + 0.058618143, -0.08598433, 0.00972939, 0.023867095, + -0.053934585, -0.023203006, 0.07452513, -0.048767887, + -0.07314807, -0.056307215, -0.10433547, -0.06440842, + 0.04328182, 0.04389765, -0.020006588, -0.09076438, + -0.11652589, -0.021705797, 0.03345259, -0.010329105, + -0.025767034, 0.013057034, 
-0.07316461, -0.10145612, + 0.06358255, 0.18531723, 0.07759293, 0.12006465, + 0.1305557, 0.058638252, -0.03393652, 0.09622831, + -0.16253184, -2.4580743e-06, 0.079869635, -0.070196845, + -0.005644518, 0.06857898, -0.12598175, -0.035084512, + 0.03156317, -0.12794146, -0.031963028, 0.04692781, + 0.030070418, 0.0071660685, -0.095516115, -0.004643372, + 0.040170413, -0.062104587, -0.0037324072, 0.0554317, + 0.08184801, -0.019164372, 0.06791302, 0.034257166, + -0.10307039, 0.021943003, 0.046745934, 0.0790918, + -0.0265588, -0.007824208, 0.042546265, -0.00977924, + -0.0002440307, -0.017384544, -0.017990116, 0.12252321, + -0.014512694, -0.08251313, 0.08861942, 0.13589665, + 0.026351685, 0.012641483, 0.07466548, 0.044301085, + -0.045414884, -0.051112458, 0.03444247, -0.08502782, + -0.04106223, -0.028126027, 0.028473156, 0.10467447}; + + recurrent_to_cell_weights_ = { + -0.037322544, 0.018592842, 0.0056175636, -0.06253426, + 0.055647098, -0.05713207, -0.05626563, 0.005559383, + 0.03375411, -0.025757805, -0.088049285, 0.06017052, + -0.06570978, 0.007384076, 0.035123326, -0.07920549, + 0.053676967, 0.044480428, -0.07663568, 0.0071805613, + 0.08089997, 0.05143358, 0.038261272, 0.03339287, + -0.027673481, 0.044746667, 0.028349208, 0.020090483, + -0.019443132, -0.030755889, -0.0040000007, 0.04465846, + -0.021585021, 0.0031670958, 0.0053199246, -0.056117613, + -0.10893326, 0.076739706, -0.08509834, -0.027997585, + 0.037871376, 0.01449768, -0.09002357, -0.06111149, + -0.046195522, 0.0422062, -0.005683705, -0.1253618, + -0.012925729, -0.04890792, 0.06985068, 0.037654128, + 0.03398274, -0.004781977, 0.007032333, -0.031787455, + 0.010868644, -0.031489216, 0.09525667, 0.013939797, + 0.0058680447, 0.0167067, 0.02668468, -0.04797466, + -0.048885044, -0.12722108, 0.035304096, 0.06554885, + 0.00972396, -0.039238118, -0.05159735, -0.11329045, + 0.1613692, -0.03750952, 0.06529313, -0.071974665, + -0.11769596, 0.015524369, -0.0013754242, -0.12446318, + 0.02786344, -0.014179351, 0.005264273, 0.14376344, + 0.015983658, 0.03406988, -0.06939408, 0.040699873, + 0.02111075, 0.09669095, 0.041345075, -0.08316494, + -0.07684199, -0.045768797, 0.032298047, -0.041805092, + 0.0119405, 0.0061010392, 0.12652606, 0.0064572375, + -0.024950314, 0.11574242, 0.04508852, -0.04335324, + 0.06760663, -0.027437469, 0.07216407, 0.06977076, + -0.05438599, 0.034033038, -0.028602652, 0.05346137, + 0.043184172, -0.037189785, 0.10420091, 0.00882477, + -0.054019816, -0.074273005, -0.030617684, -0.0028467078, + 0.024302477, -0.0038869337, 0.005332455, 0.0013399826, + 0.04361412, -0.007001822, 0.09631092, -0.06702025, + -0.042049985, -0.035070654, -0.04103342, -0.10273396, + 0.0544271, 0.037184782, -0.13150354, -0.0058036847, + -0.008264958, 0.042035464, 0.05891794, 0.029673764, + 0.0063542654, 0.044788733, 0.054816857, 0.062257513, + -0.00093483756, 0.048938446, -0.004952862, -0.007730018, + -0.04043371, -0.017094059, 0.07229206, -0.023670016, + -0.052195564, -0.025616996, -0.01520939, 0.045104615, + -0.007376126, 0.003533447, 0.006570588, 0.056037236, + 0.12436656, 0.051817212, 0.028532185, -0.08686856, + 0.11868599, 0.07663395, -0.07323171, 0.03463402, + -0.050708205, -0.04458982, -0.11590894, 0.021273347, + 0.1251325, -0.15313013, -0.12224372, 0.17228661, + 0.023029093, 0.086124025, 0.006445803, -0.03496501, + 0.028332196, 0.04449512, -0.042436164, -0.026587414, + -0.006041347, -0.09292539, -0.05678812, 0.03897832, + 0.09465633, 0.008115513, -0.02171956, 0.08304309, + 0.071401566, 0.019622514, 0.032163795, -0.004167056, + 0.02295182, 
0.030739572, 0.056506045, 0.004612461, + 0.06524936, 0.059999723, 0.046395954, -0.0045512207, + -0.1335546, -0.030136576, 0.11584653, -0.014678886, + 0.0020118146, -0.09688814, -0.0790206, 0.039770417, + -0.0329582, 0.07922767, 0.029322514, 0.026405897, + 0.04207835, -0.07073373, 0.063781224, 0.0859677, + -0.10925287, -0.07011058, 0.048005477, 0.03438226, + -0.09606514, -0.006669445, -0.043381985, 0.04240257, + -0.06955775, -0.06769346, 0.043903265, -0.026784198, + -0.017840602, 0.024307009, -0.040079936, -0.019946516, + 0.045318738, -0.12233574, 0.026170589, 0.0074471775, + 0.15978073, 0.10185836, 0.10298046, -0.015476589, + -0.039390966, -0.072174534, 0.0739445, -0.1211869, + -0.0347889, -0.07943156, 0.014809798, -0.12412325, + -0.0030663363, 0.039695457, 0.0647603, -0.08291318, + -0.018529687, -0.004423833, 0.0037507233, 0.084633216, + -0.01514876, -0.056505352, -0.012800942, -0.06994386, + 0.012962922, -0.031234352, 0.07029052, 0.016418684, + 0.03618972, 0.055686004, -0.08663945, -0.017404709, + -0.054761406, 0.029065743, 0.052404847, 0.020238016, + 0.0048197987, -0.0214882, 0.07078733, 0.013016777, + 0.06262858, 0.009184685, 0.020785125, -0.043904778, + -0.0270329, -0.03299152, -0.060088247, -0.015162964, + -0.001828936, 0.12642565, -0.056757294, 0.013586685, + 0.09232601, -0.035886683, 0.06000002, 0.05229691, + -0.052580316, -0.082029596, -0.010794592, 0.012947712, + -0.036429964, -0.085508935, -0.13127148, -0.017744139, + 0.031502828, 0.036232427, -0.031581745, 0.023051167, + -0.05325106, -0.03421577, 0.028793324, -0.034633752, + -0.009881397, -0.043551125, -0.018609839, 0.0019097115, + -0.008799762, 0.056595087, 0.0022273948, 0.055752404}; + + recurrent_to_forget_weights_ = { + -0.057784554, -0.026057621, -0.068447545, -0.022581743, + 0.14811787, 0.10826372, 0.09471067, 0.03987225, + -0.0039523416, 0.00030638507, 0.053185795, 0.10572994, + 0.08414449, -0.022036452, -0.00066928595, -0.09203576, + 0.032950465, -0.10985798, -0.023809856, 0.0021431844, + -0.02196096, -0.00326074, 0.00058621005, -0.074678116, + -0.06193199, 0.055729095, 0.03736828, 0.020123724, + 0.061878487, -0.04729229, 0.034919553, -0.07585433, + -0.04421272, -0.044019096, 0.085488975, 0.04058006, + -0.06890133, -0.030951202, -0.024628663, -0.07672815, + 0.034293607, 0.08556707, -0.05293577, -0.033561368, + -0.04899627, 0.0241671, 0.015736353, -0.095442444, + -0.029564252, 0.016493602, -0.035026584, 0.022337519, + -0.026871363, 0.004780428, 0.0077918363, -0.03601621, + 0.016435321, -0.03263031, -0.09543275, -0.047392778, + 0.013454138, 0.028934088, 0.01685226, -0.086110644, + -0.046250615, -0.01847454, 0.047608484, 0.07339695, + 0.034546845, -0.04881143, 0.009128804, -0.08802852, + 0.03761666, 0.008096139, -0.014454086, 0.014361001, + -0.023502491, -0.0011840804, -0.07607001, 0.001856849, + -0.06509276, -0.006021153, -0.08570962, -0.1451793, + 0.060212336, 0.055259194, 0.06974018, 0.049454916, + -0.027794661, -0.08077226, -0.016179763, 0.1169753, + 0.17213494, -0.0056326236, -0.053934924, -0.0124349, + -0.11520337, 0.05409887, 0.088759385, 0.0019655675, + 0.0042065294, 0.03881498, 0.019844765, 0.041858196, + -0.05695512, 0.047233116, 0.038937137, -0.06542224, + 0.014429736, -0.09719407, 0.13908425, -0.05379757, + 0.012321099, 0.082840554, -0.029899208, 0.044217527, + 0.059855383, 0.07711018, -0.045319796, 0.0948846, + -0.011724666, -0.0033288454, -0.033542685, -0.04764985, + -0.13873616, 0.040668588, 0.034832682, -0.015319203, + -0.018715994, 0.046002675, 0.0599172, -0.043107376, + 0.0294216, -0.002314414, 
-0.022424703, 0.0030315618, + 0.0014641669, 0.0029166266, -0.11878115, 0.013738511, + 0.12375372, -0.0006038222, 0.029104086, 0.087442465, + 0.052958444, 0.07558703, 0.04817258, 0.044462286, + -0.015213451, -0.08783778, -0.0561384, -0.003008196, + 0.047060397, -0.002058388, 0.03429439, -0.018839769, + 0.024734668, 0.024614193, -0.042046934, 0.09597743, + -0.0043254104, 0.04320769, 0.0064070094, -0.0019131786, + -0.02558259, -0.022822596, -0.023273505, -0.02464396, + -0.10991725, -0.006240552, 0.0074488563, 0.024044557, + 0.04383914, -0.046476185, 0.028658995, 0.060410924, + 0.050786525, 0.009452605, -0.0073054377, -0.024810238, + 0.0052906186, 0.0066939713, -0.0020913032, 0.014515517, + 0.015898481, 0.021362653, -0.030262267, 0.016587038, + -0.011442813, 0.041154444, -0.007631438, -0.03423484, + -0.010977775, 0.036152758, 0.0066366293, 0.11915515, + 0.02318443, -0.041350313, 0.021485701, -0.10906167, + -0.028218046, -0.00954771, 0.020531068, -0.11995105, + -0.03672871, 0.024019798, 0.014255957, -0.05221243, + -0.00661567, -0.04630967, 0.033188973, 0.10107534, + -0.014027541, 0.030796422, -0.10270911, -0.035999842, + 0.15443139, 0.07684145, 0.036571592, -0.035900835, + -0.0034699554, 0.06209149, 0.015920248, -0.031122351, + -0.03858649, 0.01849943, 0.13872518, 0.01503974, + 0.069941424, -0.06948533, -0.0088794185, 0.061282158, + -0.047401894, 0.03100163, -0.041533746, -0.10430945, + 0.044574402, -0.01425562, -0.024290353, 0.034563623, + 0.05866852, 0.023947537, -0.09445152, 0.035450947, + 0.02247216, -0.0042998926, 0.061146557, -0.10250651, + 0.020881841, -0.06747029, 0.10062043, -0.0023941975, + 0.03532124, -0.016341697, 0.09685456, -0.016764693, + 0.051808182, 0.05875331, -0.04536488, 0.001626336, + -0.028892258, -0.01048663, -0.009793449, -0.017093895, + 0.010987891, 0.02357273, -0.00010856845, 0.0099760275, + -0.001845119, -0.03551521, 0.0018358806, 0.05763657, + -0.01769146, 0.040995963, 0.02235177, -0.060430344, + 0.11475477, -0.023854522, 0.10071741, 0.0686208, + -0.014250481, 0.034261297, 0.047418304, 0.08562733, + -0.030519066, 0.0060542435, 0.014653856, -0.038836084, + 0.04096551, 0.032249358, -0.08355519, -0.026823482, + 0.056386515, -0.010401743, -0.028396193, 0.08507674, + 0.014410365, 0.020995233, 0.17040324, 0.11511526, + 0.02459721, 0.0066619175, 0.025853224, -0.023133837, + -0.081302024, 0.017264642, -0.009585969, 0.09491168, + -0.051313367, 0.054532815, -0.014298593, 0.10657464, + 0.007076659, 0.10964551, 0.0409152, 0.008275321, + -0.07283536, 0.07937492, 0.04192024, -0.1075027}; + + recurrent_to_output_weights_ = { + 0.025825322, -0.05813119, 0.09495884, -0.045984812, -0.01255415, + -0.0026479573, -0.08196161, -0.054914974, -0.0046604523, -0.029587349, + -0.044576716, -0.07480124, -0.082868785, 0.023254942, 0.027502948, + -0.0039728214, -0.08683098, -0.08116779, -0.014675607, -0.037924774, + -0.023314456, -0.007401714, -0.09255757, 0.029460307, -0.08829125, + -0.005139627, -0.08989442, -0.0555066, 0.13596267, -0.025062224, + -0.048351806, -0.03850004, 0.07266485, -0.022414139, 0.05940088, + 0.075114764, 0.09597592, -0.010211725, -0.0049794707, -0.011523867, + -0.025980417, 0.072999895, 0.11091378, -0.081685916, 0.014416728, + 0.043229222, 0.034178585, -0.07530371, 0.035837382, -0.085607, + -0.007721233, -0.03287832, -0.043848954, -0.06404588, -0.06632928, + -0.073643476, 0.008214239, -0.045984086, 0.039764922, 0.03474462, + 0.060612556, -0.080590084, 0.049127717, 0.04151091, -0.030063879, + 0.008801774, -0.023021035, -0.019558564, 0.05158114, -0.010947698, + -0.011825728, 
0.0075720972, 0.0699727, -0.0039981045, 0.069350146, + 0.08799282, 0.016156472, 0.035502106, 0.11695009, 0.006217345, + 0.13392477, -0.037875112, 0.025745004, 0.08940699, -0.00924166, + 0.0046702605, -0.036598757, -0.08811812, 0.10522024, -0.032441203, + 0.008176899, -0.04454919, 0.07058152, 0.0067963637, 0.039206743, + 0.03259838, 0.03725492, -0.09515802, 0.013326398, -0.052055415, + -0.025676316, 0.03198509, -0.015951829, -0.058556724, 0.036879618, + 0.043357447, 0.028362012, -0.05908629, 0.0059240665, -0.04995891, + -0.019187413, 0.0276265, -0.01628143, 0.0025863599, 0.08800015, + 0.035250366, -0.022165963, -0.07328642, -0.009415526, -0.07455109, + 0.11690406, 0.0363299, 0.07411125, 0.042103454, -0.009660886, + 0.019076364, 0.018299393, -0.046004917, 0.08891175, 0.0431396, + -0.026327137, -0.051502608, 0.08979574, -0.051670972, 0.04940282, + -0.07491107, -0.021240504, 0.022596184, -0.034280192, 0.060163025, + -0.058211457, -0.051837247, -0.01349775, -0.04639988, -0.035936575, + -0.011681591, 0.064818054, 0.0073146066, -0.021745546, -0.043124277, + -0.06471268, -0.07053354, -0.029321948, -0.05330136, 0.016933719, + -0.053782392, 0.13747959, -0.1361751, -0.11569455, 0.0033329215, + 0.05693899, -0.053219706, 0.063698, 0.07977434, -0.07924483, + 0.06936997, 0.0034815092, -0.007305279, -0.037325785, -0.07251102, + -0.033633437, -0.08677009, 0.091591336, -0.14165086, 0.021752775, + 0.019683983, 0.0011612234, -0.058154266, 0.049996935, 0.0288841, + -0.0024567875, -0.14345716, 0.010955264, -0.10234828, 0.1183656, + -0.0010731248, -0.023590032, -0.072285876, -0.0724771, -0.026382286, + -0.0014920527, 0.042667855, 0.0018776858, 0.02986552, 0.009814309, + 0.0733756, 0.12289186, 0.018043943, -0.0458958, 0.049412545, + 0.033632483, 0.05495232, 0.036686596, -0.013781798, -0.010036754, + 0.02576849, -0.08307328, 0.010112348, 0.042521734, -0.05869831, + -0.071689695, 0.03876447, -0.13275425, -0.0352966, -0.023077697, + 0.10285965, 0.084736146, 0.15568255, -0.00040734606, 0.027835453, + -0.10292561, -0.032401145, 0.10053256, -0.026142767, -0.08271222, + -0.0030240538, -0.016368777, 0.1070414, 0.042672627, 0.013456989, + -0.0437609, -0.022309763, 0.11576483, 0.04108048, 0.061026827, + -0.0190714, -0.0869359, 0.037901703, 0.0610107, 0.07202949, + 0.01675338, 0.086139716, -0.08795751, -0.014898893, -0.023771819, + -0.01965048, 0.007955471, -0.043740474, 0.03346837, -0.10549954, + 0.090567775, 0.042013682, -0.03176985, 0.12569028, -0.02421228, + -0.029526481, 0.023851605, 0.031539805, 0.05292009, -0.02344001, + -0.07811758, -0.08834428, 0.10094801, 0.16594367, -0.06861939, + -0.021256343, -0.041093912, -0.06669611, 0.035498552, 0.021757556, + -0.09302526, -0.015403468, -0.06614931, -0.051798206, -0.013874718, + 0.03630673, 0.010412845, -0.08077351, 0.046185967, 0.0035662893, + 0.03541868, -0.094149634, -0.034814864, 0.003128424, -0.020674974, + -0.03944324, -0.008110165, -0.11113267, 0.08484226, 0.043586485, + 0.040582247, 0.0968012, -0.065249965, -0.028036479, 0.0050708856, + 0.0017462453, 0.0326779, 0.041296225, 0.09164146, -0.047743853, + -0.015952192, -0.034451712, 0.084197424, -0.05347844, -0.11768019, + 0.085926116, -0.08251791, -0.045081906, 0.0948852, 0.068401024, + 0.024856757, 0.06978981, -0.057309967, -0.012775832, -0.0032452994, + 0.01977615, -0.041040014, -0.024264973, 0.063464895, 0.05431621, + }; + + cell_to_input_weights_ = { + 0.040369894, 0.030746894, 0.24704495, 0.018586371, -0.037586458, + -0.15312155, -0.11812848, -0.11465643, 0.20259799, 0.11418174, + -0.10116027, -0.011334949, 
0.12411352, -0.076769054, -0.052169047, + 0.21198851, -0.38871562, -0.09061183, -0.09683246, -0.21929175}; + + cell_to_forget_weights_ = { + -0.01998659, -0.15568835, -0.24248174, -0.012770197, 0.041331276, + -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766, + -0.047248036, 0.021479502, 0.033189066, 0.11952997, -0.020432774, + 0.64658105, -0.06650122, -0.03467612, 0.095340036, 0.23647355}; + + cell_to_output_weights_ = {0.08286371, -0.08261836, -0.51210177, 0.002913762, + 0.17764764, -0.5495371, -0.08460716, -0.24552552, + 0.030037103, 0.04123544, -0.11940523, 0.007358328, + 0.1890978, 0.4833202, -0.34441817, 0.36312827, + -0.26375428, 0.1457655, -0.19724406, 0.15548733}; + + projection_weights_ = { + -0.009802181, 0.09401916, 0.0717386, -0.13895074, 0.09641832, + 0.060420845, 0.08539281, 0.054285463, 0.061395317, 0.034448683, + -0.042991187, 0.019801661, -0.16840284, -0.015726732, -0.23041931, + -0.024478018, -0.10959692, -0.013875541, 0.18600968, -0.061274476, + 0.0138165, -0.08160894, -0.07661644, 0.032372914, 0.16169067, + 0.22465782, -0.03993472, -0.004017731, 0.08633481, -0.28869787, + 0.08682067, 0.17240396, 0.014975425, 0.056431185, 0.031037588, + 0.16702051, 0.0077946745, 0.15140012, 0.29405436, 0.120285, + -0.188994, -0.027265169, 0.043389652, -0.022061434, 0.014777949, + -0.20203483, 0.094781205, 0.19100232, 0.13987629, -0.036132768, + -0.06426278, -0.05108664, 0.13221376, 0.009441198, -0.16715929, + 0.15859416, -0.040437475, 0.050779544, -0.022187516, 0.012166504, + 0.027685808, -0.07675938, -0.0055694645, -0.09444123, 0.0046453946, + 0.050794356, 0.10770313, -0.20790008, -0.07149004, -0.11425117, + 0.008225835, -0.035802525, 0.14374903, 0.15262283, 0.048710253, + 0.1847461, -0.007487823, 0.11000021, -0.09542012, 0.22619456, + -0.029149994, 0.08527916, 0.009043713, 0.0042746216, 0.016261552, + 0.022461696, 0.12689082, -0.043589946, -0.12035478, -0.08361797, + -0.050666027, -0.1248618, -0.1275799, -0.071875185, 0.07377272, + 0.09944291, -0.18897448, -0.1593054, -0.06526116, -0.040107165, + -0.004618631, -0.067624845, -0.007576253, 0.10727444, 0.041546922, + -0.20424393, 0.06907816, 0.050412357, 0.00724631, 0.039827548, + 0.12449835, 0.10747581, 0.13708383, 0.09134148, -0.12617786, + -0.06428341, 0.09956831, 0.1208086, -0.14676677, -0.0727722, + 0.1126304, 0.010139365, 0.015571211, -0.038128063, 0.022913318, + -0.042050496, 0.16842307, -0.060597885, 0.10531834, -0.06411776, + -0.07451711, -0.03410368, -0.13393489, 0.06534304, 0.003620307, + 0.04490757, 0.05970546, 0.05197996, 0.02839995, 0.10434969, + -0.013699693, -0.028353551, -0.07260381, 0.047201227, -0.024575593, + -0.036445823, 0.07155557, 0.009672501, -0.02328883, 0.009533515, + -0.03606021, -0.07421458, -0.028082801, -0.2678904, -0.13221288, + 0.18419984, -0.13012612, -0.014588381, -0.035059117, -0.04824723, + 0.07830115, -0.056184657, 0.03277091, 0.025466874, 0.14494097, + -0.12522776, -0.098633975, -0.10766018, -0.08317623, 0.08594209, + 0.07749552, 0.039474737, 0.1776665, -0.07409566, -0.0477268, + 0.29323658, 0.10801441, 0.1154011, 0.013952499, 0.10739139, + 0.10708251, -0.051456142, 0.0074137426, -0.10430189, 0.10034707, + 0.045594677, 0.0635285, -0.0715442, -0.089667566, -0.10811871, + 0.00026344223, 0.08298446, -0.009525053, 0.006585689, -0.24567553, + -0.09450807, 0.09648481, 0.026996298, -0.06419476, -0.04752702, + -0.11063944, -0.23441927, -0.17608605, -0.052156363, 0.067035615, + 0.19271925, -0.0032889997, -0.043264326, 0.09663576, -0.057112187, + -0.10100678, 0.0628376, 0.04447668, 
0.017961001, -0.10094388, + -0.10190601, 0.18335468, 0.10494553, -0.052095775, -0.0026118709, + 0.10539724, -0.04383912, -0.042349473, 0.08438151, -0.1947263, + 0.02251204, 0.11216432, -0.10307853, 0.17351969, -0.039091777, + 0.08066188, -0.00561982, 0.12633002, 0.11335965, -0.0088127935, + -0.019777594, 0.06864014, -0.059751723, 0.016233567, -0.06894641, + -0.28651384, -0.004228674, 0.019708522, -0.16305895, -0.07468996, + -0.0855457, 0.099339016, -0.07580735, -0.13775392, 0.08434318, + 0.08330512, -0.12131499, 0.031935584, 0.09180414, -0.08876437, + -0.08049874, 0.008753825, 0.03498998, 0.030215185, 0.03907079, + 0.089751154, 0.029194152, -0.03337423, -0.019092513, 0.04331237, + 0.04299654, -0.036394123, -0.12915532, 0.09793732, 0.07512415, + -0.11319543, -0.032502122, 0.15661901, 0.07671967, -0.005491124, + -0.19379048, -0.218606, 0.21448623, 0.017840758, 0.1416943, + -0.07051762, 0.19488361, 0.02664691, -0.18104725, -0.09334311, + 0.15026465, -0.15493552, -0.057762887, -0.11604192, -0.262013, + -0.01391798, 0.012185008, 0.11156489, -0.07483202, 0.06693364, + -0.26151478, 0.046425626, 0.036540434, -0.16435726, 0.17338543, + -0.21401681, -0.11385144, -0.08283257, -0.069031075, 0.030635102, + 0.010969227, 0.11109743, 0.010919218, 0.027526086, 0.13519906, + 0.01891392, -0.046839405, -0.040167913, 0.017953383, -0.09700955, + 0.0061885654, -0.07000971, 0.026893595, -0.038844477, 0.14543656}; + + lstm_input_ = {// Step 1 + {{0.787926, 0.151646, 0.071352, 0.118426, 0.458058}}, + // Step 2 + {{0.596268, 0.998386, 0.568695, 0.864524, 0.571277}}, + // Step 3 + {{0.073204, 0.296072, 0.743333, 0.069199, 0.045348}}, + // Step 4 + {{0.867394, 0.291279, 0.013714, 0.482521, 0.626339}}}; + + lstm_golden_output_ = { + {{-0.00396806, 0.029352, -0.00279226, 0.0159977, -0.00835576, -0.0211779, + 0.0283512, -0.0114597, 0.00907307, -0.0244004, -0.0152191, -0.0259063, + 0.00914318, 0.00415118, 0.017147, 0.0134203}}, + + {{-0.0166936, 0.0381209, 0.000889694, 0.0143363, -0.0328911, -0.0234288, + 0.0333051, -0.012229, 0.0110322, -0.0457725, -0.000832209, -0.0202817, + 0.0327257, 0.0121308, 0.0155969, 0.0312091}}, + + {{-0.0213783, 0.0350169, 0.000324794, 0.0276012, -0.0263374, -0.0371449, + 0.0446149, -0.0205474, 0.0103729, -0.0576349, -0.0150052, -0.0292043, + 0.0376827, 0.0136115, 0.0243435, 0.0354492}}, + + {{-0.0189322, 0.0464512, -0.00251373, 0.0225745, -0.0308346, -0.0317124, + 0.0460407, -0.0189395, 0.0149363, -0.0530162, -0.0150767, -0.0340193, + 0.0286833, 0.00824207, 0.0264887, 0.0305169}}}; + + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, /*weight_type=*/TensorType_FLOAT32, + model_has_legacy_20_inputs, + /*is_layer_norm=*/false, /*asymmetric_quantize_inputs=*/false, + input_to_input_weights_, input_to_forget_weights_, input_to_cell_weights_, + input_to_output_weights_, recurrent_to_input_weights_, + recurrent_to_forget_weights_, recurrent_to_cell_weights_, + recurrent_to_output_weights_, cell_to_input_weights_, + cell_to_forget_weights_, cell_to_output_weights_, input_gate_bias_, + forget_gate_bias_, cell_gate_bias_, output_gate_bias_, + projection_weights_, {}, input_layer_norm_coefficients_, + forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, + output_layer_norm_coefficients_); + + VerifyGoldens(&lstm, 0.00001f); +} + +TEST_F(LstmOpTest, NoCifg_Peephole_Projection_LayerNorm) { + const int n_batch = 1; + const int n_input = 5; + const int n_cell = 4; + const int 
n_output = 3; + + input_to_input_weights_ = {0.5, 0.6, 0.7, -0.8, -0.9, 0.1, 0.2, + 0.3, -0.4, 0.5, -0.8, 0.7, -0.6, 0.5, + -0.4, -0.5, -0.4, -0.3, -0.2, -0.1}; + + input_to_forget_weights_ = {-0.6, -0.1, 0.3, 0.2, 0.9, -0.5, -0.2, + -0.4, 0.3, -0.8, -0.4, 0.3, -0.5, -0.4, + -0.6, 0.3, -0.4, -0.6, -0.5, -0.5}; + + input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5, -0.2, + -0.3, -0.2, -0.6, 0.6, -0.1, -0.4, -0.3, + -0.7, 0.7, -0.9, -0.5, 0.8, 0.6}; + + input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3, + -0.3, -0.8, -0.2, 0.6, -0.2, 0.4, -0.7, + -0.3, -0.5, 0.1, 0.5, -0.6, -0.4}; + + input_gate_bias_ = {0.03, 0.15, 0.22, 0.38}; + + forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1}; + + cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08}; + + output_gate_bias_ = {0.05, -0.01, 0.2, 0.1}; + + recurrent_to_input_weights_ = {-0.2, -0.3, 0.4, 0.1, -0.5, 0.9, + -0.2, -0.3, -0.7, 0.05, -0.2, -0.6}; + + recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8, -0.08, + -0.2, 0.3, 0.8, -0.6, -0.1, 0.2}; + + recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4, + 0.9, 0.3, -0.1, 0.2, 0.5, 0.2}; + + recurrent_to_output_weights_ = {0.3, -0.1, 0.1, -0.2, -0.5, -0.7, + -0.2, -0.6, -0.1, -0.4, -0.7, -0.2}; + + cell_to_input_weights_ = {0.05, 0.1, 0.25, 0.15}; + + cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03}; + + cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05}; + + input_layer_norm_coefficients_ = {0.1, 0.2, 0.3, 0.5}; + forget_layer_norm_coefficients_ = {0.2, 0.2, 0.4, 0.3}; + cell_layer_norm_coefficients_ = {0.7, 0.2, 0.3, 0.8}; + output_layer_norm_coefficients_ = {0.6, 0.2, 0.2, 0.5}; + + projection_weights_ = {-0.1, 0.2, 0.01, -0.2, 0.1, 0.5, + 0.3, 0.08, 0.07, 0.2, -0.4, 0.2}; + + lstm_input_ = { + {{0.7, 0.8, 0.1, 0.2, 0.3}}, + {{0.8, 0.1, 0.2, 0.4, 0.5}}, + {{0.2, 0.7, 0.7, 0.1, 0.7}}, + }; + + lstm_golden_output_ = {{{0.0244077, 0.128027, -0.00170918}}, + {{0.0137642, 0.140751, 0.0395835}}, + {{-0.00459231, 0.155278, 0.0837377}}}; + + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, /*weight_type=*/TensorType_FLOAT32, + /*model_has_legacy_20_inputs=*/false, + /*is_layer_norm=*/true, /*asymmetric_quantize_inputs=*/false, + input_to_input_weights_, input_to_forget_weights_, input_to_cell_weights_, + input_to_output_weights_, recurrent_to_input_weights_, + recurrent_to_forget_weights_, recurrent_to_cell_weights_, + recurrent_to_output_weights_, cell_to_input_weights_, + cell_to_forget_weights_, cell_to_output_weights_, input_gate_bias_, + forget_gate_bias_, cell_gate_bias_, output_gate_bias_, + projection_weights_, {}, input_layer_norm_coefficients_, + forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, + output_layer_norm_coefficients_); + + VerifyGoldens(&lstm, 0.00001f); +} + +TEST_F(LstmOpTest, Cifg_Peephole_Projection_LayerNorm) { + const int n_batch = 1; + const int n_input = 5; + const int n_cell = 4; + const int n_output = 3; + + input_to_forget_weights_ = {-0.6, -0.1, 0.3, 0.2, 0.9, -0.5, -0.2, + -0.4, 0.3, -0.8, -0.4, 0.3, -0.5, -0.4, + -0.6, 0.3, -0.4, -0.6, -0.5, -0.5}; + input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5, -0.2, + -0.3, -0.2, -0.6, 0.6, -0.1, -0.4, -0.3, + -0.7, 0.7, -0.9, -0.5, 0.8, 0.6}; + input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3, + -0.3, -0.8, -0.2, 0.6, -0.2, 0.4, -0.7, + -0.3, -0.5, 0.1, 0.5, -0.6, -0.4}; + + forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1}; + 
cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08}; + output_gate_bias_ = {0.05, -0.01, 0.2, 0.1}; + + recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8, -0.08, + -0.2, 0.3, 0.8, -0.6, -0.1, 0.2}; + recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4, + 0.9, 0.3, -0.1, 0.2, 0.5, 0.2}; + recurrent_to_output_weights_ = {0.3, -0.1, 0.1, -0.2, -0.5, -0.7, + -0.2, -0.6, -0.1, -0.4, -0.7, -0.2}; + + cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03}; + cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05}; + + forget_layer_norm_coefficients_ = {0.2, 0.2, 0.4, 0.3}; + cell_layer_norm_coefficients_ = {0.7, 0.2, 0.3, 0.8}; + output_layer_norm_coefficients_ = {0.6, 0.2, 0.2, 0.5}; + projection_weights_ = {-0.1, 0.2, 0.01, -0.2, 0.1, 0.5, + 0.3, 0.08, 0.07, 0.2, -0.4, 0.2}; + + lstm_input_ = {{{0.7, 0.8, 0.1, 0.2, 0.3}}, + {{0.8, 0.1, 0.2, 0.4, 0.5}}, + {{0.2, 0.7, 0.7, 0.1, 0.7}}}; + lstm_golden_output_ = {{{0.02129706, 0.140816242, 0.0112733059}}, + {{0.0132302344, 0.152308047, 0.0346313119}}, + {{-0.0123688057, 0.165790111, 0.0893077999}}}; + + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/true, /*use_peephole=*/true, + /*use_projection_weights=*/true, + /*use_projection_bias=*/false, /*weight_type=*/TensorType_FLOAT32, + /*model_has_legacy_20_inputs=*/false, + /*is_layer_norm=*/true, /*asymmetric_quantize_inputs=*/false, + input_to_input_weights_, input_to_forget_weights_, input_to_cell_weights_, + input_to_output_weights_, recurrent_to_input_weights_, + recurrent_to_forget_weights_, recurrent_to_cell_weights_, + recurrent_to_output_weights_, cell_to_input_weights_, + cell_to_forget_weights_, cell_to_output_weights_, input_gate_bias_, + forget_gate_bias_, cell_gate_bias_, output_gate_bias_, + projection_weights_, {}, input_layer_norm_coefficients_, + forget_layer_norm_coefficients_, cell_layer_norm_coefficients_, + output_layer_norm_coefficients_); + + VerifyGoldens(&lstm, 0.00001f); +} + +#ifdef GTEST_HAS_DEATH_TEST +TEST_F(LstmOpTest, InvalidTypes) { + const int n_batch = 1; + const int n_input = 2; + const int n_cell = 4; + const int n_output = 4; + + EXPECT_DEATH( + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*weight_type=*/TensorType_INT32, + /*model_has_legacy_20_inputs=*/true, + /*is_layer_norm=*/false, + /*asymmetric_quantize_inputs=*/false, + /*input_to_input_weights=*/{}, /*input_to_forget_weights=*/{}, + /*input_to_cell_weights=*/{}, /*input_to_output_weights=*/{}, + /*recurrent_to_input_weights=*/{}, + /*recurrent_to_forget_weights=*/{}, /*recurrent_to_cell_weights=*/{}, + /*recurrent_to_output_weights=*/{}, /*cell_to_input_weights=*/{}, + /*cell_to_forget_weights=*/{}, /*cell_to_output_weights=*/{}, + /*input_gate_bias=*/{}, /*forget_gate_bias=*/{}, + /*cell_gate_bias=*/{}, /*output_gate_bias=*/{}, + /*projection_weights=*/{}, /*projection_bias=*/{}, + /*input_layer_norm_coefficients=*/{}, + /*forget_layer_norm_coefficients=*/{}, + /*cell_layer_norm_coefficients=*/{}, + /*output_layer_norm_coefficients=*/{}), + ""); + + EXPECT_DEATH( + LSTMOpModel lstm( + n_batch, n_input, n_cell, n_output, + /*use_cifg=*/false, /*use_peephole=*/false, + /*use_projection_weights=*/false, + /*use_projection_bias=*/false, + /*weight_type=*/TensorType_COMPLEX64, + /*model_has_legacy_20_inputs=*/true, + /*is_layer_norm=*/false, + /*asymmetric_quantize_inputs=*/false, + /*input_to_input_weights=*/{}, /*input_to_forget_weights=*/{}, + 
/*input_to_cell_weights=*/{}, /*input_to_output_weights=*/{}, + /*recurrent_to_input_weights=*/{}, + /*recurrent_to_forget_weights=*/{}, /*recurrent_to_cell_weights=*/{}, + /*recurrent_to_output_weights=*/{}, /*cell_to_input_weights=*/{}, + /*cell_to_forget_weights=*/{}, /*cell_to_output_weights=*/{}, + /*input_gate_bias=*/{}, /*forget_gate_bias=*/{}, + /*cell_gate_bias=*/{}, /*output_gate_bias=*/{}, + /*projection_weights=*/{}, /*projection_bias=*/{}, + /*input_layer_norm_coefficients=*/{}, + /*forget_layer_norm_coefficients=*/{}, + /*cell_layer_norm_coefficients=*/{}, + /*output_layer_norm_coefficients=*/{}), + ""); +} +#endif + +// Test parameter controls model_has_legacy_20_inputs in LSTMOpModel. +INSTANTIATE_TEST_SUITE_P(Parameterized, LstmOpTest, ::testing::Bool()); + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc index e1628a7e9a7..c5b659463ea 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean.cc @@ -35,6 +35,15 @@ Mean::Mean(const OperationDef& definition, const DeviceInfo& device_info) if (device_info.IsAdreno3xx()) { work_group_size_ = int3(16, 8, 1); } + if (device_info.IsMali()) { + const MaliInfo& mali_info = device_info.mali_info; + if (mali_info.IsMaliT6xx() || mali_info.IsMaliT7xx() || + mali_info.IsMaliT8xx()) { + work_group_size_ = int3(8, 4, 1); + } else { + work_group_size_ = int3(8, 8, 1); + } + } code_ = GetMeanKernelCode(definition_, work_group_size_); } @@ -108,12 +117,12 @@ std::string Mean::GetMeanKernelCode(const OperationDef& op_def, return c; } -absl::Status Mean::BindArguments() { +absl::Status Mean::BindArguments(ArgumentsBinder* args) { const double total_size = src_[0]->Width() * src_[0]->Height(); const double size_0 = work_group_size_.x * work_group_size_.y; const double size_1 = total_size / size_0; - RETURN_IF_ERROR(args_.SetFloat("inv_multiplier_1", 1.0 / size_1)); - RETURN_IF_ERROR(args_.SetFloat("inv_multiplier_2", 1.0 / size_0)); + RETURN_IF_ERROR(args->SetFloat("inv_multiplier_1", 1.0 / size_1)); + RETURN_IF_ERROR(args->SetFloat("inv_multiplier_2", 1.0 / size_0)); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean.h b/tensorflow/lite/delegates/gpu/cl/kernels/mean.h index 12735c0b916..3bf2061d329 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/mean.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean.h @@ -37,7 +37,7 @@ class Mean : public GPUOperation { std::vector* work_groups) const override { work_groups->push_back(work_group_size_); } - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc index c36dacdaafc..dabf71066f6 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.cc @@ -29,15 +29,23 @@ namespace cl { namespace { std::string GetVectorReduceCode() { - return R"(static inline float reduce_vector(float4 v) { + return R"(float reduce_vector(float4 v) { return dot(v, (float4)(1.0f)); })"; } std::string GetReduceCode() { // If it is supported, use the built-in work_group_reduce_add function. - // Otherwise, implement a reduction using __local memory. 
Note this only works - // with power-of-two work group sizes. + // Otherwise, implement a reduction using __local memory. + + // In the reduction step add upper half of the still-to-be-summed vector to + // the lower half, while taking care of odd sizes and rounding. E.g.: + // Number of items still to be summed before: 5 + // Local memory before: [a, b, c, d, e]; + // Local memory after: [a+d, b+e, c, d, e]; + // Threads doing work: id < 2 = floor(5/2) + // Offset to the added items: 3 = ceil(5/2) + // Number of items still to be summed after: 3 = ceil(5/2) return R"( #if (__OPENCL_C_VERSION__ >= 200) && (__OPENCL_C_VERSION__ < 300) && \ !defined(__opencl_c_work_group_collective_functions) @@ -45,35 +53,85 @@ std::string GetReduceCode() { #endif #ifdef __opencl_c_work_group_collective_functions -#define local_reduce(input, tmp) work_group_reduce_add(input) +#define local_reduce(item, tmp) work_group_reduce_add(item) #else // !defined(__opencl_c_work_group_collective_functions) -static inline float local_reduce(float input, __local float* tmp) { +float local_reduce(float item, __local float* tmp) { const int local_id = get_local_id(0); - tmp[local_id] = input; + tmp[local_id] = item; barrier(CLK_LOCAL_MEM_FENCE); - int reduction_size = get_local_size(0) / 2; - while (reduction_size > 0) { - if (local_id < reduction_size) { - tmp[local_id] += tmp[local_id + reduction_size]; + // The number of items still need to be summed + int reduction_size = get_local_size(0); + while (reduction_size > 1) { + const int active_thread_limit = reduction_size / 2; + const int offset = (reduction_size + 1) / 2; + if (local_id < active_thread_limit) { + item += tmp[local_id + offset]; + tmp[local_id] = item; } barrier(CLK_LOCAL_MEM_FENCE); - reduction_size /= 2; + reduction_size = offset; } return tmp[0]; } #endif // defined(__opencl_c_work_group_collective_functions) )"; } + +std::string GetFilterCode() { + return R"( +float4 filter_outside_tensor(float4 x, int num_channels, int slice) { + return select(x, (float4)(0.0f), slice * 4 + (int4)(0, 1, 2, 3) >= num_channels); +} +)"; +} } // namespace MeanStdDevNormalization::MeanStdDevNormalization(const OperationDef& definition, - const DeviceInfo& device_info) + const DeviceInfo& device_info, + const int tensor_slices) : GPUOperation(definition) { // The kernel code does not inherently need a fixed size, but in order to not // hardcode the __local array's size for the reductions, we would need to pass // that size to the kernel at runtime, and that is currently not supported. - // For now, fix workgroup size to 128 threads. - work_group_size_.x = 128; + // For now, fix workgroup size to the biggest supported by the device, but not + // larger than the number of tensor slices. + int desired_work_group_size = + std::min(tensor_slices, device_info.max_work_group_size_x); + if (device_info.IsMali()) { + // Don't use more than 64 work items per work group on ARM Mali. They + // implement local memory using the global memory, larger workgroups have + // severe performance penalty. 
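
For illustration, the odd-size tree reduction that local_reduce performs across work items can be sketched on the host; this is a minimal C++ stand-in (a std::vector plays the role of the __local buffer) under that assumption, not code from this patch:

// Host-side sketch of the odd-size tree reduction used by local_reduce.
#include <vector>

float ReduceOddSizeTree(std::vector<float> tmp) {
  int reduction_size = static_cast<int>(tmp.size());
  while (reduction_size > 1) {
    const int active_thread_limit = reduction_size / 2;  // floor(n/2) lanes add
    const int offset = (reduction_size + 1) / 2;         // ceil(n/2) stride
    for (int i = 0; i < active_thread_limit; ++i) {
      tmp[i] += tmp[i + offset];
    }
    reduction_size = offset;  // ceil(n/2) items remain to be summed
  }
  return tmp[0];
}
// For {a, b, c, d, e}: the first pass yields {a+d, b+e, c}, the next
// {a+d+c, b+e}, and finally {a+b+c+d+e}, matching the comment above.
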
+ desired_work_group_size = 64; + } + if (device_info.IsAdreno()) { + AdrenoInfo info = device_info.adreno_info; + if (device_info.IsAdreno3xx()) { + if (info.gpu_version < 320) { + desired_work_group_size = 64; + } else { + desired_work_group_size = 128; + } + } else if (device_info.IsAdreno4xx()) { + if (info.gpu_version < 430) { + desired_work_group_size = 128; + } else { + desired_work_group_size = 256; + } + } else if (device_info.IsAdreno5xx()) { + if (info.gpu_version < 530) { + desired_work_group_size = 128; + } else { + desired_work_group_size = 256; + } + } + } + if (device_info.IsPowerVR()) { + desired_work_group_size = 64; + } + while (desired_work_group_size >= tensor_slices * 2) { + desired_work_group_size /= 2; + } + work_group_size_.x = desired_work_group_size; work_group_size_.y = 1; // Required work_group_size_.z = 1; // Required code_ = GetNormalizationCode(); @@ -91,6 +149,7 @@ std::string MeanStdDevNormalization::GetNormalizationCode() { std::string c = GetCommonDefines(definition_.precision); c += GetVectorReduceCode(); c += GetReduceCode(); + c += GetFilterCode(); c += "__attribute__((reqd_work_group_size(" + std::to_string(work_group_size_.x) + ", 1, 1)))\n"; c += R"(__kernel void main_function($0) { @@ -99,17 +158,12 @@ std::string MeanStdDevNormalization::GetNormalizationCode() { std::to_string(work_group_size_.x) + R"(]; #endif const int B = get_global_id(1); - if (get_global_id(2) > 0) { return; } - if (B >= args.src_tensor.Batch()) { return; } // Calculate the total sum of the input tensor. // First, get a local sum of input[local_id_x + N*local_size_x] for all N. float4 private_sum4 = (float4)(0.0f); for (int S = get_local_id(0); S < args.src_tensor.Slices(); S += get_local_size(0)) { const float4 t = args.src_tensor.Read(0, 0, S, B); - // Filter out reads beyond the end of the tensor. - const int4 is_after_end_of_tensor = (int4)(0, 1, 2, 3) >= (args.src_tensor.Channels() - S * 4); - const float4 filtered_t = select(t, (float4)(0.0f), is_after_end_of_tensor); - private_sum4 += filtered_t; + private_sum4 += filter_outside_tensor(t, args.src_tensor.Channels(), S); } // Reduce the vector to a single float and do a workgroup reduce. const float private_sum = reduce_vector(private_sum4); @@ -120,19 +174,16 @@ std::string MeanStdDevNormalization::GetNormalizationCode() { float4 private_sum_diff_sq4 = (float4)(0.0f); for (int S = get_local_id(0); S < args.src_tensor.Slices(); S += get_local_size(0)) { const float4 t = args.src_tensor.Read(0, 0, S, B); - const float4 diff = t - mean; - // Filter out reads beyond the end of the tensor. 
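
As a worked illustration of the masking that filter_outside_tensor performs, a hypothetical scalar helper (an assumption for illustration, not part of the patch) makes the boundary case explicit: with 7 channels, slice 1 covers channel indices 4..7, so only its last lane lies past the end of the tensor and is zeroed.

// Hypothetical scalar equivalent of filter_outside_tensor: zero out lanes
// whose channel index lies past the end of the tensor.
float FilterLane(float x, int num_channels, int slice, int lane) {
  return (slice * 4 + lane >= num_channels) ? 0.0f : x;
}
// num_channels = 7, slice = 1: lanes 0..2 (channels 4..6) pass through,
// lane 3 (channel 7) is replaced by 0.0f before it enters the sums.
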
- const int4 is_after_end_of_tensor = (int4)(0, 1, 2, 3) >= (args.src_tensor.Channels() - S * 4); - const float4 filtered_diff = select(diff, (float4)(0.0f), is_after_end_of_tensor); + const float4 diff = filter_outside_tensor(t - mean, args.src_tensor.Channels(), S); // sum_diff_sq += diff² - private_sum_diff_sq4 = mad(filtered_diff, filtered_diff, private_sum_diff_sq4); + private_sum_diff_sq4 = mad(diff, diff, private_sum_diff_sq4); } // Reduce const float private_sum_diff_sq = reduce_vector(private_sum_diff_sq4); const float sum_diff_sq = local_reduce(private_sum_diff_sq, tmp); // Calculate 1/stddev (with the 'regulazing constant' as in tensor_utils.cc) const float variance = sum_diff_sq / args.src_tensor.Channels(); - const float stddev_inv = rsqrt(variance + 1.0e-8f); + const float stddev_inv = native_rsqrt(variance + 1.0e-8f); // Calculate (t-mean)/stddev for each element for (int S = get_local_id(0); S < args.src_tensor.Slices(); S += get_local_size(0)) { const float4 t = args.src_tensor.Read(0, 0, S, B); @@ -153,8 +204,9 @@ int3 MeanStdDevNormalization::GetGridSize() const { } MeanStdDevNormalization CreateMeanStdDevNormalization( - const OperationDef& definition, const DeviceInfo& device_info) { - return MeanStdDevNormalization(definition, device_info); + const OperationDef& definition, const DeviceInfo& device_info, + const int tensor_slices) { + return MeanStdDevNormalization(definition, device_info, tensor_slices); } } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h index e898803e377..3312d23122f 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h @@ -30,7 +30,8 @@ namespace cl { class MeanStdDevNormalization : public GPUOperation { public: explicit MeanStdDevNormalization(const OperationDef& definition, - const DeviceInfo& device_info); + const DeviceInfo& device_info, + const int tensor_slices); void GetPossibleKernelWorkGroups( TuningType tuning_type, const DeviceInfo& device_info, @@ -52,7 +53,8 @@ class MeanStdDevNormalization : public GPUOperation { }; MeanStdDevNormalization CreateMeanStdDevNormalization( - const OperationDef& definition, const DeviceInfo& device_info); + const OperationDef& definition, const DeviceInfo& device_info, + const int tensor_slices); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc index 8ff34be17d8..7ceaf964edd 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization_test.cc @@ -55,7 +55,7 @@ TEST_P(MeanStddevNormalizationTest, SeparateBatches) { op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC}); TensorFloat32 dst_tensor; auto operation = - CreateMeanStdDevNormalization(op_def, env_.GetDevicePtr()->info_); + CreateMeanStdDevNormalization(op_def, env_.GetDevicePtr()->info_, 1); ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation, BHWC(1, 1, 1, 4), &dst_tensor)); @@ -88,8 +88,6 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(100.0f, 100.0f, 2.63e-4f) // large mean, large variance )); -GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MeanStddevNormalizationTest); - TEST_F(OpenCLOperationTest, MeanStddevNormalizationAllBatches) { 
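
For reference, the per-batch computation these tests verify can be written as a short host-side sketch; this is illustrative C++ following the kernel's formula (including the 1.0e-8f regularizing constant), not code from the patch:

// Host-side sketch of mean/stddev normalization over the channel axis.
#include <cmath>
#include <vector>

void ReferenceMeanStdDevNormalization(std::vector<float>* v) {
  float mean = 0.0f;
  for (float x : *v) mean += x;
  mean /= v->size();
  float sum_diff_sq = 0.0f;
  for (float x : *v) sum_diff_sq += (x - mean) * (x - mean);
  const float variance = sum_diff_sq / v->size();
  const float stddev_inv = 1.0f / std::sqrt(variance + 1.0e-8f);
  for (float& x : *v) x = (x - mean) * stddev_inv;
}

Applied to the large-vector input further below (one element at the mean, the rest alternating mean ± diff), this gives 0 for the first element and ±sqrt(N/(N-1)) for the others, which is where that test's expected_output values come from.
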
TensorFloat32 src_tensor; src_tensor.shape = BHWC(9, 1, 1, 4); @@ -106,6 +104,8 @@ TEST_F(OpenCLOperationTest, MeanStddevNormalizationAllBatches) { }; for (auto storage : env_.GetSupportedStorages()) { for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = + precision == CalculationsPrecision::F32 ? 2.53e-05f : 3.57e-4f; OperationDef op_def; op_def.precision = precision; auto data_type = DeduceDataTypeFromPrecision(precision); @@ -113,7 +113,7 @@ TEST_F(OpenCLOperationTest, MeanStddevNormalizationAllBatches) { op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC}); TensorFloat32 dst_tensor; auto operation = - CreateMeanStdDevNormalization(op_def, env_.GetDevicePtr()->info_); + CreateMeanStdDevNormalization(op_def, env_.GetDevicePtr()->info_, 1); ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation, BHWC(9, 1, 1, 4), &dst_tensor)); @@ -130,8 +130,57 @@ TEST_F(OpenCLOperationTest, MeanStddevNormalizationAllBatches) { -ksqrt16, -ksqrt04, ksqrt04, ksqrt16, // large mean, small variance -ksqrt16, -ksqrt04, ksqrt04, ksqrt16, // large mean, large variance }; - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(3.57e-4f), expected_output)); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected_output)) + << "Failed using precision " << ToString(precision); + } + } +} + +TEST_F(OpenCLOperationTest, MeanStddevNormalizationLargeVector) { + const float mean = 100.0f; + const float diff = 1.0f; + // Some large vector that is not a round multiple of any SIMD vector sizes. + constexpr int kVectorSize = 16 * 16 + 16 + 1; + + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 1, 1, kVectorSize); + src_tensor.data.resize(kVectorSize); + // First input is mean. + src_tensor.data[0] = mean; + // Rest is alternating between mean + diff and mean - diff. + for (int i = 1; i < kVectorSize - 1; i += 2) { + src_tensor.data[i + 0] = mean + diff; + src_tensor.data[i + 1] = mean - diff; + } + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = + precision == CalculationsPrecision::F32 ? 0.0f : 8.60e-4f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::BHWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::BHWC}); + TensorFloat32 dst_tensor; + auto operation = CreateMeanStdDevNormalization( + op_def, env_.GetDevicePtr()->info_, (kVectorSize + 3) / 4); + ASSERT_OK(ExecuteGPUOperation({src_tensor}, creation_context_, &operation, + BHWC(1, 1, 1, kVectorSize), &dst_tensor)); + + float expected_output[kVectorSize]; + // First output should be 0. + expected_output[0] = 0.0; + // Rest should be alternating between ±√(N/(N-1)). + const float expected_elem = + std::sqrt(static_cast(kVectorSize) / + static_cast(kVectorSize - 1)); + for (int i = 1; i < kVectorSize - 1; i += 2) { + expected_output[i + 0] = +expected_elem; + expected_output[i + 1] = -expected_elem; + } + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), expected_output)) + << "Failed using precision " << ToString(precision); } } } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reduce.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reduce.cc new file mode 100644 index 00000000000..b24d54abbfc --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reduce.cc @@ -0,0 +1,103 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/kernels/reduce.h" + +#include + +#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { +std::string GetReduceChannelsKernelCode(const OperationDef& op_def, + const OperationType& op_type) { + std::string c = GetCommonDefines(op_def.precision); + if (op_type == OperationType::REDUCE_SUM) { + c += "#define OP(a, b) ((a) + (b))\n"; + } else if (op_type == OperationType::REDUCE_PRODUCT) { + c += "#define OP(a, b) ((a) * (b))\n"; + } else if (op_type == OperationType::REDUCE_MAXIMUM) { + c += "#define OP(a, b) max(a, b)\n"; + } else if (op_type == OperationType::REDUCE_MINIMUM) { + c += "#define OP(a, b) min(a, b)\n"; + } + c += "__kernel void main_function($0) {\n"; + c += " int X = get_global_id(0);\n"; + c += " int Y = get_global_id(1);\n"; + c += " if (X >= args.dst_tensor.Width() || Y >= args.dst_tensor.Height()) " + "return;\n"; + if (op_type == OperationType::REDUCE_SUM) { + c += " FLT4 reduced = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"; + } else if (op_type == OperationType::REDUCE_PRODUCT) { + c += " FLT4 reduced = (FLT4)(1.0f, 1.0f, 1.0f, 1.0f);\n"; + } else { + c += " FLT4 V0 = args.src_tensor.Read(X, Y, 0);\n"; + c += " FLT4 reduced = (FLT4)(V0.x, V0.x, V0.x, V0.x);\n"; + } + c += " int s = 0;\n"; + c += " for (; s < args.src_tensor.Slices() - 1; ++s) {\n"; + c += " FLT4 V = args.src_tensor.Read(X, Y, s);\n"; + c += " reduced = OP(reduced, V);\n"; + c += " }\n"; + c += " FLT reduced_final = OP(OP(reduced.x, reduced.y), OP(reduced.z, " + "reduced.w));\n"; + c += " FLT last_reduce;\n"; + c += " FLT4 last_val = args.src_tensor.Read(X, Y, s);\n"; + c += " int ch_rem = args.src_tensor.Channels() % 4;\n"; + c += " if (ch_rem == 0) {\n"; + c += " last_reduce = OP(OP(last_val.x, last_val.y), OP(last_val.z, " + "last_val.w));\n"; + c += " } else if (ch_rem == 1) {\n"; + c += " last_reduce = OP(OP(last_val.x, last_val.y), last_val.z);\n"; + c += " } else if (ch_rem == 2) {\n"; + c += " last_reduce = OP(last_val.x, last_val.y);\n"; + c += " } else {\n"; + c += " last_reduce = last_val.x;\n"; + c += " }\n"; + c += " reduced_final = OP(reduced_final, last_reduce);\n"; + c += " FLT4 result = (FLT4)(reduced_final, 0.0f, 0.0f, 0.0f);\n"; + c += " args.dst_tensor.Write(result, X, Y, 0);\n"; + c += "}\n"; + return c; +} +} // namespace + +GPUOperation CreateReduce(const OperationDef& definition, + const ReduceAttributes& attr, + const OperationType& op_type) { + GPUOperation op(definition); + auto src_desc = definition.src_tensors[0]; + if (definition.IsBatchSupported()) { + src_desc.SetStateVar("BatchedWidth", "true"); + } + op.AddSrcTensor("src_tensor", src_desc); + auto dst_desc = definition.dst_tensors[0]; + 
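
The channel reductions that the new tests below verify can be mirrored by a small host-side reference; this sketch (illustrative C++ under that assumption, not part of the patch) shows the REDUCE_SUM case:

// Host-side reference for reducing the channel axis with OP(a, b) = a + b.
#include <vector>

std::vector<float> ReferenceReduceSumChannels(const std::vector<float>& src,
                                              int positions, int channels) {
  std::vector<float> dst(positions, 0.0f);
  for (int p = 0; p < positions; ++p) {
    for (int c = 0; c < channels; ++c) {
      dst[p] += src[p * channels + c];
    }
  }
  return dst;
}
// E.g. src = {1.1, 2.1, 0.7, 0.3, 1.2, 3.1, 4.1, 0.0, 1.0, 4.4} with
// positions = 2 and channels = 5 gives {5.4, 12.6}, the values expected by
// the ReduceSumChannels test.
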
if (definition.IsBatchSupported()) { + dst_desc.SetStateVar("BatchedWidth", "true"); + } + op.AddDstTensor("dst_tensor", dst_desc); + op.code_ = GetReduceChannelsKernelCode(definition, op_type); + op.tensor_to_grid_ = TensorToGrid::kWBToX_HDToY_ZIs1; + return op; +} + +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reduce.h b/tensorflow/lite/delegates/gpu/cl/kernels/reduce.h new file mode 100644 index 00000000000..def7ced4871 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reduce.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_REDUCE_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_REDUCE_H_ + +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" + +namespace tflite { +namespace gpu { +namespace cl { + +GPUOperation CreateReduce(const OperationDef& definition, + const ReduceAttributes& attr, + const OperationType& op_type); + +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_REDUCE_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/reduce_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/reduce_test.cc new file mode 100644 index 00000000000..7f100410d3c --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/reduce_test.cc @@ -0,0 +1,141 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/kernels/reduce.h" + +#include +#include +#include + +#include +#include +#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +using ::testing::FloatNear; +using ::testing::Pointwise; + +namespace tflite { +namespace gpu { +namespace cl { +namespace { + +TEST_F(OpenCLOperationTest, ReduceSumChannels) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 1, 5); + src_tensor.data = {1.1, 2.1, 0.7, 0.3, 1.2, 3.1, 4.1, 0.0, 1.0, 4.4}; + ReduceAttributes attr; + attr.axis = Axis::CHANNELS; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateReduce(op_def, attr, OperationType::REDUCE_SUM); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 2, 1, 1), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {5.4f, 12.6f})); + } + } +} + +TEST_F(OpenCLOperationTest, ReduceProductChannels) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 1, 2); + src_tensor.data = {1.1, 2.0, 3.1, 4.0}; + ReduceAttributes attr; + attr.axis = Axis::CHANNELS; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateReduce(op_def, attr, OperationType::REDUCE_PRODUCT); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 2, 1, 1), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {2.2f, 12.4f})); + } + } +} + +TEST_F(OpenCLOperationTest, ReduceMaxChannels) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 1, 6); + src_tensor.data = {1.1, 2.0, -0.3, -100.0, 32.6, 1.1, + -3.1, -4.0, -5.0, -7.0, -2.0, -100.0}; + ReduceAttributes attr; + attr.axis = Axis::CHANNELS; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-3f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateReduce(op_def, attr, OperationType::REDUCE_MAXIMUM); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 2, 1, 1), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {32.6f, -2.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, ReduceMinChannels) { + TensorFloat32 src_tensor; + src_tensor.shape = BHWC(1, 2, 1, 6); + src_tensor.data = {1.1, 2.0, -0.3, -100.0, 32.6, 1.1, + -3.1, -4.0, -5.0, -7.0, -2.0, 100.0}; + ReduceAttributes attr; + attr.axis = Axis::CHANNELS; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + GPUOperation operation = + CreateReduce(op_def, attr, OperationType::REDUCE_MINIMUM); + ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, + BHWC(1, 2, 1, 1), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, Pointwise(FloatNear(eps), {-100.0f, -7.0f})); + } + } +} + +} // namespace +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc b/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc index a0fd699062c..91266ef29a6 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/resize.cc @@ -132,13 +132,13 @@ std::string Resize::GetResizeCode(const OperationDef& op_def, return c; } -absl::Status Resize::BindArguments() { - RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1)); - RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1)); - RETURN_IF_ERROR(args_.SetFloat( +absl::Status Resize::BindArguments(ArgumentsBinder* args) { + RETURN_IF_ERROR(args->SetInt("border_x", src_[0]->Width() - 1)); + RETURN_IF_ERROR(args->SetInt("border_y", src_[0]->Height() - 1)); + RETURN_IF_ERROR(args->SetFloat( "scale_factor_x", CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_))); - RETURN_IF_ERROR(args_.SetFloat( + RETURN_IF_ERROR(args->SetFloat( "scale_factor_y", CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_))); return absl::OkStatus(); @@ -286,17 +286,17 @@ std::string Resize3D::GetResize3DCode(const OperationDef& op_def, return c; } -absl::Status Resize3D::BindArguments() { - RETURN_IF_ERROR(args_.SetInt("border_x", src_[0]->Width() - 1)); - RETURN_IF_ERROR(args_.SetInt("border_y", src_[0]->Height() - 1)); - RETURN_IF_ERROR(args_.SetInt("border_z", src_[0]->Depth() - 1)); - RETURN_IF_ERROR(args_.SetFloat( +absl::Status Resize3D::BindArguments(ArgumentsBinder* args) { + RETURN_IF_ERROR(args->SetInt("border_x", src_[0]->Width() - 1)); + RETURN_IF_ERROR(args->SetInt("border_y", src_[0]->Height() - 1)); + RETURN_IF_ERROR(args->SetInt("border_z", src_[0]->Depth() - 1)); + RETURN_IF_ERROR(args->SetFloat( "scale_factor_x", CalculateResizeScale(src_[0]->Width(), dst_[0]->Width(), attr_))); - RETURN_IF_ERROR(args_.SetFloat( + 
RETURN_IF_ERROR(args->SetFloat( "scale_factor_y", CalculateResizeScale(src_[0]->Height(), dst_[0]->Height(), attr_))); - RETURN_IF_ERROR(args_.SetFloat( + RETURN_IF_ERROR(args->SetFloat( "scale_factor_z", CalculateResizeScale(src_[0]->Depth(), dst_[0]->Depth(), attr_))); return absl::OkStatus(); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/resize.h b/tensorflow/lite/delegates/gpu/cl/kernels/resize.h index 0349afe5664..859d750b7e0 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/resize.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/resize.h @@ -27,7 +27,7 @@ namespace cl { class Resize : public GPUOperation { public: - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only @@ -53,7 +53,7 @@ Resize CreateResize(const OperationDef& definition, class Resize3D : public GPUOperation { public: - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc index e7cf72aa72a..d4d0442e61d 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.cc @@ -109,14 +109,14 @@ std::string Softmax1x1::GetSoftmaxKernelCode(const OperationDef& op_def) { return c; } -absl::Status Softmax1x1::BindArguments() { +absl::Status Softmax1x1::BindArguments(ArgumentsBinder* args) { float4 mask = GetMaskForLastPlane(src_[0]->Channels()); - RETURN_IF_ERROR(args_.SetFloat("mask_x", mask.x)); - RETURN_IF_ERROR(args_.SetFloat("mask_y", mask.y)); - RETURN_IF_ERROR(args_.SetFloat("mask_z", mask.z)); - RETURN_IF_ERROR(args_.SetFloat("mask_w", mask.w)); + RETURN_IF_ERROR(args->SetFloat("mask_x", mask.x)); + RETURN_IF_ERROR(args->SetFloat("mask_y", mask.y)); + RETURN_IF_ERROR(args->SetFloat("mask_z", mask.z)); + RETURN_IF_ERROR(args->SetFloat("mask_w", mask.w)); RETURN_IF_ERROR( - args_.SetInt("slices_x32", DivideRoundUp(src_[0]->Slices(), 32))); + args->SetInt("slices_x32", DivideRoundUp(src_[0]->Slices(), 32))); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h index 5bc9278d612..202f46d2a51 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/softmax1x1.h @@ -35,7 +35,7 @@ class Softmax1x1 : public GPUOperation { std::vector* work_groups) const override { work_groups->push_back(work_group_size_); } - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/special/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/special/BUILD index d5ff93e6845..f601556900c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/special/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/special/BUILD @@ -23,3 +23,30 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:types", ], ) + +cc_library( + name = "fc_fc_add", + srcs = ["fc_fc_add.cc"], + hdrs = ["fc_fc_add.h"], + deps = [ + "//tensorflow/lite/delegates/gpu/cl:arguments", + "//tensorflow/lite/delegates/gpu/cl:buffer", + "//tensorflow/lite/delegates/gpu/cl:cl_kernel", + "//tensorflow/lite/delegates/gpu/cl:device_info", + "//tensorflow/lite/delegates/gpu/cl:linear_storage", + 
"//tensorflow/lite/delegates/gpu/cl:precision", + "//tensorflow/lite/delegates/gpu/cl:tensor", + "//tensorflow/lite/delegates/gpu/cl:tensor_type", + "//tensorflow/lite/delegates/gpu/cl:texture2d", + "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", + "//tensorflow/lite/delegates/gpu/cl/kernels:tuning_parameters", + "//tensorflow/lite/delegates/gpu/cl/kernels:util", + "//tensorflow/lite/delegates/gpu/common:data_type", + "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "//tensorflow/lite/delegates/gpu/common:types", + "//tensorflow/lite/delegates/gpu/common:util", + "@com_google_absl//absl/memory", + ], +) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.cc b/tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.cc new file mode 100644 index 00000000000..a8d3d434bd9 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.cc @@ -0,0 +1,207 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" +#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { +bool UseBufferForWeights(const DeviceInfo& device_info) { + return device_info.IsAdreno() || device_info.IsAMD() || device_info.IsMali(); +} +} // namespace + +FCFCAdd::FCFCAdd(const OperationDef& definition, const DeviceInfo& device_info) + : GPUOperation(definition) { + if (device_info.IsAdreno()) { + if (device_info.IsAdreno3xx()) { + work_group_size_ = int3(16, 4, 1); + } else if (device_info.IsAdreno4xx()) { + work_group_size_ = int3(32, 4, 1); + } else { + work_group_size_ = int3(32, 4, 1); + } + } else if (device_info.IsIntel()) { + work_group_size_ = int3(8, 4, 1); + } else if (device_info.IsNvidia()) { + work_group_size_ = int3(8, 4, 1); + } else if (device_info.IsPowerVR()) { + work_group_size_ = int3(8, 4, 1); + } else { + work_group_size_ = int3(16, 4, 1); + } + code_ = GetFCFCAddKernelCode(definition_, device_info); +} + +FCFCAdd::FCFCAdd(FCFCAdd&& kernel) : GPUOperation(std::move(kernel)) {} + +FCFCAdd& FCFCAdd::operator=(FCFCAdd&& kernel) { + if (this != &kernel) { + GPUOperation::operator=(std::move(kernel)); + } + 
return *this; +} + +// We split vec vec dot (every thread do vec vec dot product in basic +// vec mat mult) on 4 parts to create more threads +// tid.y thread process every 4-th element in vec vec dot +// Good results for ~1024 x 1024 sizes, for other can be written more +// optimized shaders + +std::string FCFCAdd::GetFCFCAddKernelCode(const OperationDef& op_def, + const DeviceInfo& device_info) { + AddSrcTensor("src_tensor_0", op_def.src_tensors[0]); + AddSrcTensor("src_tensor_1", op_def.src_tensors[1]); + AddDstTensor("dst_tensor", op_def.dst_tensors[0]); + + const bool weights_are_buffer = UseBufferForWeights(device_info); + + std::string c = GetCommonDefines(op_def.precision); + switch (op_def.precision) { + case CalculationsPrecision::F32: + c += "#define FLT16 float16\n"; + break; + case CalculationsPrecision::F32_F16: + case CalculationsPrecision::F16: + c += "#define FLT16 half16\n"; + break; + } + + c += "#define WG_X " + std::to_string(work_group_size_.x) + "\n"; + c += "#define WG_Y " + std::to_string(work_group_size_.y) + "\n"; + + c += R"(__kernel void main_function($0) { + int gid = get_global_id(0); + int2 tid = (int2)(get_local_id(0), get_local_id(1)); + ACCUM_FLT4 s = (ACCUM_FLT4)(0.0f); + if (gid < args.dst_tensor.Slices()) { + for (int c = tid.y; c < args.src_tensor_0.Slices(); c += WG_Y) { + FLT4 v = args.src_tensor_0.Read(0, 0, c); +)"; + if (weights_are_buffer) { + c += R"(FLT16 w = args.weights0.Read(c * args.dst_tensor.Slices() + gid); + FLT4 partial = v.s0 * w.s0123; + partial = mad(v.s1, w.s4567, partial); + partial = mad(v.s2, w.s89ab, partial); + partial = mad(v.s3, w.scdef, partial); + s += TO_ACCUM_TYPE(partial); +)"; + } else { + c += R"(FLT4 w0 = args.weights0.Read(c * 4 + 0, gid); + FLT4 w1 = args.weights0.Read(c * 4 + 1, gid); + FLT4 w2 = args.weights0.Read(c * 4 + 2, gid); + FLT4 w3 = args.weights0.Read(c * 4 + 3, gid); + FLT4 partial = v.s0 * w0; + partial = mad(v.s1, w1, partial); + partial = mad(v.s2, w2, partial); + partial = mad(v.s3, w3, partial); + s += TO_ACCUM_TYPE(partial); +)"; + } + c += R"( } + for (int c = tid.y; c < args.src_tensor_1.Slices(); c += WG_Y) { + FLT4 v = args.src_tensor_1.Read(0, 0, c); + )"; + if (weights_are_buffer) { + c += R"(FLT16 w = args.weights1.Read(c * args.dst_tensor.Slices() + gid); + FLT4 partial = v.s0 * w.s0123; + partial = mad(v.s1, w.s4567, partial); + partial = mad(v.s2, w.s89ab, partial); + partial = mad(v.s3, w.scdef, partial); + s += TO_ACCUM_TYPE(partial); +)"; + } else { + c += R"(FLT4 w0 = args.weights1.Read(c * 4 + 0, gid); + FLT4 w1 = args.weights1.Read(c * 4 + 1, gid); + FLT4 w2 = args.weights1.Read(c * 4 + 2, gid); + FLT4 w3 = args.weights1.Read(c * 4 + 3, gid); + FLT4 partial = v.s0 * w0; + partial = mad(v.s1, w1, partial); + partial = mad(v.s2, w2, partial); + partial = mad(v.s3, w3, partial); + s += TO_ACCUM_TYPE(partial); +)"; + } + c += R"( } + } + __local ACCUM_FLT4 temp[WG_X][WG_Y]; + temp[tid.x][tid.y] = s; + barrier(CLK_LOCAL_MEM_FENCE); + if (gid >= args.dst_tensor.Slices()) { + return; + } + if (tid.y == 0) { +)"; + for (int i = 1; i < work_group_size_.y; ++i) { + c += " s += temp[tid.x][" + std::to_string(i) + "];\n"; + } + c += + R"( FLT4 r0 = TO_FLT4(s) + args.biases0.Read(gid) + args.biases1.Read(gid); + args.dst_tensor.Write(r0, 0, 0, gid); + } +})"; + + return c; +} + +int3 FCFCAdd::GetGridSize() const { return int3(dst_[0]->Slices(), 1, 1); } + +FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info, + const OperationDef& definition, + const FullyConnectedAttributes& attr0, + const 
FullyConnectedAttributes& attr1) { + FCFCAdd result(definition, device_info); + result.UploadWeights(attr0.weights, "weights0", + UseBufferForWeights(device_info)); + result.UploadWeights(attr1.weights, "weights1", + UseBufferForWeights(device_info)); + + TensorLinearDescriptor desc0; + desc0.storage_type = LinearStorageType::TEXTURE_2D; + desc0.element_type = definition.GetDataType(); + desc0.UploadLinearData(attr0.bias); + result.args_.AddObject( + "biases0", absl::make_unique(std::move(desc0))); + + TensorLinearDescriptor desc1; + desc1.storage_type = LinearStorageType::TEXTURE_2D; + desc1.element_type = definition.GetDataType(); + desc1.UploadLinearData(attr1.bias); + result.args_.AddObject( + "biases1", absl::make_unique(std::move(desc1))); + + return result; +} + +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.h b/tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.h new file mode 100644 index 00000000000..fea9d1a4990 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.h @@ -0,0 +1,189 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_SPECIAL_FC_FC_ADD_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_SPECIAL_FC_FC_ADD_H_ + +#include + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" +#include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" + +namespace tflite { +namespace gpu { +namespace cl { + +template +void RearrangeFCWeightsToIOO4I4(const tflite::gpu::Tensor& weights, + S* dst) { + const int src_channels = weights.shape.i; + const int padded_src_channels = AlignByN(src_channels, 4); + const int dst_channels = weights.shape.o; + const int padded_dst_channels = AlignByN(dst_channels, 4); + + for (int block_y = 0; 4 * block_y < padded_dst_channels; block_y++) { + for (int y_in_block = 0; y_in_block < 4; y_in_block++) { + for (int block_x = 0; 4 * block_x < padded_src_channels; block_x++) { + for (int x_in_block = 0; x_in_block < 4; x_in_block++) { + int y = 4 * block_y + y_in_block; + int x = 4 * block_x + 
x_in_block; + int dst_index = block_x * padded_dst_channels * 4 + block_y * 16 + + x_in_block * 4 + y_in_block; + if (x < src_channels && y < dst_channels) { + dst[dst_index] = weights.data[src_channels * y + x]; + } else { + dst[dst_index] = 0.0f; + } + } + } + } + } +} + +template +void RearrangeFCWeightsToOIO4I4(const tflite::gpu::Tensor& weights, + S* dst) { + const int src_channels = weights.shape.i; + const int src_depth = DivideRoundUp(src_channels, 4); + const int dst_channels = weights.shape.o; + const int dst_depth = DivideRoundUp(dst_channels, 4); + + int counter = 0; + for (int d = 0; d < dst_depth; ++d) { + for (int s = 0; s < src_depth; ++s) { + for (int i = 0; i < 4; ++i) { + const int src_ch = s * 4 + i; + for (int j = 0; j < 4; ++j) { + const int dst_ch = d * 4 + j; + if (src_ch < src_channels && dst_ch < dst_channels) { + dst[counter++] = weights.data[dst_ch * src_channels + src_ch]; + } else { + dst[counter++] = 0.0f; + } + } + } + } + } +} + +class FCFCAdd : public GPUOperation { + public: + FCFCAdd() = default; + void GetPossibleKernelWorkGroups( + TuningType tuning_type, const DeviceInfo& device_info, + const KernelInfo& kernel_info, + std::vector* work_groups) const override { + work_groups->push_back(work_group_size_); + } + int3 GetGridSize() const override; + + // Move only + FCFCAdd(FCFCAdd&& kernel); + FCFCAdd& operator=(FCFCAdd&& kernel); + FCFCAdd(const FCFCAdd&) = delete; + FCFCAdd& operator=(const FCFCAdd&) = delete; + + private: + FCFCAdd(const OperationDef& definition, const DeviceInfo& device_info); + friend FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info, + const OperationDef& definition, + const FullyConnectedAttributes& attr0, + const FullyConnectedAttributes& attr1); + + template + void UploadWeights(const tflite::gpu::Tensor& weights, + const std::string& name, bool weights_are_buffer); + + std::string GetFCFCAddKernelCode(const OperationDef& op_def, + const DeviceInfo& device_info); +}; + +template +void FCFCAdd::UploadWeights(const tflite::gpu::Tensor& weights, + const std::string& name, bool weights_are_buffer) { + const int src_depth = DivideRoundUp(weights.shape.i, 4); + const int dst_depth = DivideRoundUp(weights.shape.o, 4); + + const int elements_count = src_depth * dst_depth * 4; + const bool f32_weights = definition_.precision == CalculationsPrecision::F32; + + const int float4_size = f32_weights ? 16 : 8; + + if (weights_are_buffer) { + BufferDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + desc.element_size = 16; + desc.size = float4_size * elements_count; + desc.data.resize(desc.size); + + if (f32_weights) { + float* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToIOO4I4(weights, ptr); + } else { + half* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToIOO4I4(weights, ptr); + } + + args_.AddObject(name, absl::make_unique(std::move(desc))); + } else { + Texture2DDescriptor desc; + desc.element_type = f32_weights ? DataType::FLOAT32 : DataType::FLOAT16; + // desc.element_type = DataType::UINT8; + // desc.normalized = true; + // desc.normalized_type = f32_weights ? 
DataType::FLOAT32 : + // DataType::FLOAT16; + desc.size = int2(src_depth * 4, dst_depth); + desc.data.resize(float4_size * elements_count); + + if (f32_weights) { + float* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToOIO4I4(weights, ptr); + } else { + half* ptr = reinterpret_cast(desc.data.data()); + RearrangeFCWeightsToOIO4I4(weights, ptr); + } + + args_.AddObject(name, + absl::make_unique(std::move(desc))); + } +} + +FCFCAdd CreateFCFCAdd(const DeviceInfo& device_info, + const OperationDef& definition, + const FullyConnectedAttributes& attr0, + const FullyConnectedAttributes& attr1); + +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_SPECIAL_FC_FC_ADD_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc index b2ce0690a9c..1f8f985f3ee 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.cc @@ -154,17 +154,17 @@ std::string StridedSlice::GetStridedSliceCode(const OperationDef& op_def, return c; } -absl::Status StridedSlice::BindArguments() { +absl::Status StridedSlice::BindArguments(ArgumentsBinder* args) { int4 offset = GetOffset(attributes_, src_[0]->Width(), src_[0]->Height(), src_[0]->Channels(), src_[0]->Batch()); - RETURN_IF_ERROR(args_.SetInt("offset_x", offset.x)); - RETURN_IF_ERROR(args_.SetInt("offset_y", offset.y)); - RETURN_IF_ERROR(args_.SetInt("offset_z", offset.z)); - RETURN_IF_ERROR(args_.SetInt("offset_b", offset.w)); - RETURN_IF_ERROR(args_.SetInt("stride_x", attributes_.strides.w)); - RETURN_IF_ERROR(args_.SetInt("stride_y", attributes_.strides.h)); - RETURN_IF_ERROR(args_.SetInt("stride_z", attributes_.strides.c)); - RETURN_IF_ERROR(args_.SetInt("stride_b", attributes_.strides.b)); + RETURN_IF_ERROR(args->SetInt("offset_x", offset.x)); + RETURN_IF_ERROR(args->SetInt("offset_y", offset.y)); + RETURN_IF_ERROR(args->SetInt("offset_z", offset.z)); + RETURN_IF_ERROR(args->SetInt("offset_b", offset.w)); + RETURN_IF_ERROR(args->SetInt("stride_x", attributes_.strides.w)); + RETURN_IF_ERROR(args->SetInt("stride_y", attributes_.strides.h)); + RETURN_IF_ERROR(args->SetInt("stride_z", attributes_.strides.c)); + RETURN_IF_ERROR(args->SetInt("stride_b", attributes_.strides.b)); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h index 5a6d8ad6047..dddff2faf35 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/strided_slice.h @@ -27,7 +27,7 @@ namespace cl { class StridedSlice : public GPUOperation { public: StridedSlice(const OperationDef& definition, const SliceAttributes& attr); - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; // Move only diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h b/tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h index d6098b0cb81..c57ccade4b2 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h @@ -17,7 +17,7 @@ limitations under the License. 
#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_TUNING_PARAMETERS_H_ #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" -#include "tensorflow/lite/delegates/gpu/cl/cl_device.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc index f0e0c412b7e..25fa60c776a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.cc @@ -188,6 +188,14 @@ int GetRecommendedBlockSizeForConv(const DeviceInfo& device_info, return block_size; } +int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size) { + int3 work_groups_count; + work_groups_count.x = DivideRoundUp(grid_size.x, work_group_size.x); + work_groups_count.y = DivideRoundUp(grid_size.y, work_group_size.y); + work_groups_count.z = DivideRoundUp(grid_size.z, work_group_size.z); + return work_groups_count; +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/util.h b/tensorflow/lite/delegates/gpu/cl/kernels/util.h index aa9f599e4d8..69f6808146c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/util.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/util.h @@ -17,15 +17,13 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_UTIL_H_ #include +#include #include "absl/types/span.h" #include "tensorflow/lite/delegates/gpu/cl/device_info.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" -#include "tensorflow/lite/delegates/gpu/common/access_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/gpu/common/types.h" #include "tensorflow/lite/delegates/gpu/common/util.h" @@ -58,15 +56,12 @@ void RearrangeWeightsToOHWIOGroupI4O4( absl::Span dst) { const int dst_slices = DivideRoundUp(weights.shape.o, 4); const int src_slices = DivideRoundUp(weights.shape.i, 4); - const int kernel_x = weights.shape.w; - const int kernel_y = weights.shape.h; - const int dst_groups = DivideRoundUp(dst_slices, out_group_size); int counter = 0; for (int d = 0; d < dst_groups; ++d) { - for (int y = 0; y < kernel_y; ++y) { - for (int x = 0; x < kernel_x; ++x) { + for (int y = 0; y < weights.shape.h; ++y) { + for (int x = 0; x < weights.shape.w; ++x) { for (int s = 0; s < src_slices; ++s) { for (int d_group = 0; d_group < out_group_size; ++d_group) { for (int j = 0; j < 4; ++j) { @@ -91,6 +86,118 @@ void RearrangeWeightsToOHWIOGroupI4O4( } } +template +void RearrangeWeightsToODHWIOGroupI4O4( + const tflite::gpu::Tensor& weights, int out_group_size, + absl::Span dst) { + const int dst_slices = DivideRoundUp(weights.shape.o, 4); + const int src_slices = DivideRoundUp(weights.shape.i, 4); + const int dst_groups = DivideRoundUp(dst_slices, out_group_size); + + int counter = 0; + for (int d = 0; d < dst_groups; ++d) { + for (int z = 0; z < weights.shape.d; ++z) { + for (int y = 0; y < weights.shape.h; ++y) { + for (int x = 0; x < weights.shape.w; ++x) { + for (int s = 0; s < src_slices; ++s) { + for (int d_group = 0; d_group < out_group_size; ++d_group) { + for (int j = 0; j < 4; ++j) { + T filter; + for (int i = 0; i < 4; ++i) { + const int s_ch = s * 4 + j; + const int 
d_ch = (d * out_group_size + d_group) * 4 + i; + if (s_ch < weights.shape.i && d_ch < weights.shape.o) { + const int f_index = + weights.shape.LinearIndex({d_ch, y, x, z, s_ch}); + filter[i] = weights.data[f_index]; + } else { + filter[i] = 0.0f; + } + } + dst[counter++] = filter; + } + } + } + } + } + } + } +} + +template +void RearrangeWeightsToI4HWIOOGroupO4( + const tflite::gpu::Tensor& weights, int out_group_size, + absl::Span dst) { + const int dst_slices = DivideRoundUp(weights.shape.o, 4); + const int src_slices = DivideRoundUp(weights.shape.i, 4); + const int dst_groups = DivideRoundUp(dst_slices, out_group_size); + + int counter = 0; + for (int j = 0; j < 4; ++j) { + for (int y = 0; y < weights.shape.h; ++y) { + for (int x = 0; x < weights.shape.w; ++x) { + for (int s = 0; s < src_slices; ++s) { + for (int d = 0; d < dst_groups; ++d) { + for (int d_group = 0; d_group < out_group_size; ++d_group) { + T filter; + for (int i = 0; i < 4; ++i) { + const int s_ch = s * 4 + j; + const int d_ch = (d * out_group_size + d_group) * 4 + i; + if (s_ch < weights.shape.i && d_ch < weights.shape.o) { + const int f_index = + weights.shape.LinearIndex({d_ch, y, x, s_ch}); + filter[i] = weights.data[f_index]; + } else { + filter[i] = 0.0f; + } + } + dst[counter++] = filter; + } + } + } + } + } + } +} + +template +void RearrangeWeightsToI4DHWIOOGroupO4( + const tflite::gpu::Tensor& weights, int out_group_size, + absl::Span dst) { + const int dst_slices = DivideRoundUp(weights.shape.o, 4); + const int src_slices = DivideRoundUp(weights.shape.i, 4); + const int dst_groups = DivideRoundUp(dst_slices, out_group_size); + + int counter = 0; + for (int j = 0; j < 4; ++j) { + for (int z = 0; z < weights.shape.d; ++z) { + for (int y = 0; y < weights.shape.h; ++y) { + for (int x = 0; x < weights.shape.w; ++x) { + for (int s = 0; s < src_slices; ++s) { + for (int d = 0; d < dst_groups; ++d) { + for (int d_group = 0; d_group < out_group_size; ++d_group) { + T filter; + for (int i = 0; i < 4; ++i) { + const int s_ch = s * 4 + j; + const int d_ch = (d * out_group_size + d_group) * 4 + i; + if (s_ch < weights.shape.i && d_ch < weights.shape.o) { + const int f_index = + weights.shape.LinearIndex({d_ch, y, x, z, s_ch}); + filter[i] = weights.data[f_index]; + } else { + filter[i] = 0.0f; + } + } + dst[counter++] = filter; + } + } + } + } + } + } + } +} + // Returns float4 mask for last plane(batch of 4 channels) // assumes that plane size is 4; // for example we have 7 channels, in our data structures we align it to 8 @@ -106,6 +213,8 @@ int3 GetFirstSuitableWorkGroup(const std::vector& wgs, int max_wg_size); int GetRecommendedBlockSizeForConv(const DeviceInfo& device, CalculationsPrecision precision, int task_size); + +int3 GetWorkGroupsCount(const int3& grid_size, const int3& work_group_size); } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index 0f94847f08a..1244f769b48 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -262,16 +262,16 @@ int3 Winograd4x4To36::SelectBestWorkGroup(const KernelInfo& kernel_info) const { return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size); } -absl::Status Winograd4x4To36::BindArguments() { +absl::Status Winograd4x4To36::BindArguments(ArgumentsBinder* args) { const int tiles_x = DivideRoundUp( src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 
4); const int tiles_y = DivideRoundUp( src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4); const int tiles_total = tiles_x * tiles_y; - RETURN_IF_ERROR(args_.SetInt("padding_x", -padding_.prepended.w)); - RETURN_IF_ERROR(args_.SetInt("padding_y", -padding_.prepended.h)); - RETURN_IF_ERROR(args_.SetInt("tiles_total", tiles_total)); - RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x)); + RETURN_IF_ERROR(args->SetInt("padding_x", -padding_.prepended.w)); + RETURN_IF_ERROR(args->SetInt("padding_y", -padding_.prepended.h)); + RETURN_IF_ERROR(args->SetInt("tiles_total", tiles_total)); + RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x)); return absl::OkStatus(); } @@ -463,9 +463,9 @@ int3 Winograd36To4x4::SelectBestWorkGroup(const KernelInfo& kernel_info) const { return GetFirstSuitableWorkGroup(wgs, kernel_info.max_work_group_size); } -absl::Status Winograd36To4x4::BindArguments() { +absl::Status Winograd36To4x4::BindArguments(ArgumentsBinder* args) { const int tiles_x = DivideRoundUp(dst_[0]->Width(), 4); - RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x)); + RETURN_IF_ERROR(args->SetInt("tiles_x", tiles_x)); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h index a5da49e7939..609e38a4c9a 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h @@ -36,7 +36,7 @@ class Winograd4x4To36 : public GPUOperation { Winograd4x4To36() = default; Winograd4x4To36(const OperationDef& definition, const Padding2D& padding, const DeviceInfo& device_info); - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; void GetPossibleKernelWorkGroups( TuningType tuning_type, const DeviceInfo& device_info, @@ -73,7 +73,7 @@ class Winograd36To4x4 : public GPUOperation { Winograd36To4x4() = default; Winograd36To4x4(const OperationDef& definition, const DeviceInfo& device_info); - absl::Status BindArguments() override; + absl::Status BindArguments(ArgumentsBinder* args) override; int3 GetGridSize() const override; void GetPossibleKernelWorkGroups( TuningType tuning_type, const DeviceInfo& device_info, diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h b/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h index 0c1be10782e..ea58ff25bc2 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h @@ -16,10 +16,11 @@ limitations under the License. 
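A quick illustration of the GetWorkGroupsCount helper added to util.cc above; the grid and work-group sizes are made up, but the rounding follows DivideRoundUp exactly as in the hunk:

// DivideRoundUp(7, 4) == 2, DivideRoundUp(5, 4) == 2, DivideRoundUp(3, 1) == 3,
// so a 7x5x3 grid launched with 4x4x1 work groups needs 2x2x3 work groups.
const int3 grid_size(7, 5, 3);
const int3 work_group_size(4, 4, 1);
const int3 groups = GetWorkGroupsCount(grid_size, work_group_size);  // (2, 2, 3)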
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_WORK_GROUP_PICKING_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_WORK_GROUP_PICKING_H_ -#include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" +#include + #include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" +#include "tensorflow/lite/delegates/gpu/cl/device_info.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/tuning_parameters.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/types.h" #include "tensorflow/lite/delegates/gpu/common/workgroup_selection.h" diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc index 75920f4f8c5..8f7b314b707 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc @@ -204,8 +204,9 @@ absl::Status LinearStorage::CreateFromTensorLinearDescriptor( return CreateCLBuffer(context->context(), depth_ * float4_size, read_only, data_ptr, &memory_); } else { - return CreateFloatRGBAImage2D(context->context(), depth_, 1, - desc.element_type, data_ptr, &memory_); + return CreateRGBAImage2D(context->context(), depth_, 1, + DataTypeToChannelType(desc.element_type), data_ptr, + &memory_); } } diff --git a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc index bf2fd449291..add0e2fd4e9 100644 --- a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc +++ b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc @@ -36,7 +36,7 @@ namespace cl { #ifdef __ANDROID__ #define LoadFunction(function) \ - if (is_pixel) { \ + if (use_wrapper) { \ function = reinterpret_cast(loadOpenCLPointer(#function)); \ } else { \ function = reinterpret_cast(dlsym(libopencl, #function)); \ @@ -53,7 +53,7 @@ namespace cl { #ifdef __WINDOWS__ void LoadOpenCLFunctions(HMODULE libopencl); #else -void LoadOpenCLFunctions(void* libopencl, bool is_pixel); +void LoadOpenCLFunctions(void* libopencl, bool use_wrapper); #endif absl::Status LoadOpenCL() { @@ -77,8 +77,11 @@ absl::Status LoadOpenCL() { // record error std::string error(dlerror()); #ifdef __ANDROID__ - // Pixel phone? + // Pixel phone or auto? 
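For readers skimming the opencl_wrapper.cc change that continues below: the loader keeps trying vendor wrapper libraries when the system OpenCL library is missing, and the old is_pixel flag becomes use_wrapper because the wrapper path now also covers Android Automotive. A hedged paraphrase of the resulting load order (the initial libOpenCL.so attempt is assumed from context; this is not the literal patch code):

void* libopencl = dlopen("libOpenCL.so", RTLD_NOW | RTLD_LOCAL);
bool use_wrapper = false;
if (!libopencl) {
  // Pixel phones and Android Automotive expose OpenCL via wrapper libraries.
  libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL);
  if (!libopencl) {
    libopencl = dlopen("libOpenCL-car.so", RTLD_NOW | RTLD_LOCAL);
  }
  use_wrapper = (libopencl != nullptr);
}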
libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); + if (!libopencl) { + libopencl = dlopen("libOpenCL-car.so", RTLD_NOW | RTLD_LOCAL); + } if (libopencl) { typedef void (*enableOpenCL_t)(); enableOpenCL_t enableOpenCL = @@ -96,11 +99,11 @@ absl::Status LoadOpenCL() { #ifdef __WINDOWS__ void LoadOpenCLFunctions(HMODULE libopencl) { #else -void LoadOpenCLFunctions(void* libopencl, bool is_pixel) { +void LoadOpenCLFunctions(void* libopencl, bool use_wrapper) { #ifdef __ANDROID__ typedef void* (*loadOpenCLPointer_t)(const char* name); loadOpenCLPointer_t loadOpenCLPointer; - if (is_pixel) { + if (use_wrapper) { loadOpenCLPointer = reinterpret_cast( dlsym(libopencl, "loadOpenCLPointer")); } diff --git a/tensorflow/lite/delegates/gpu/cl/run_tests.sh b/tensorflow/lite/delegates/gpu/cl/run_tests.sh index 16d2feb8a5a..0eed264a06f 100755 --- a/tensorflow/lite/delegates/gpu/cl/run_tests.sh +++ b/tensorflow/lite/delegates/gpu/cl/run_tests.sh @@ -64,11 +64,17 @@ trap "cleanup_device" EXIT declare -a BUILD_CONFIG abi_version=$(ADB shell getprop ro.product.cpu.abi | tr -d '\r') if [[ "$abi_version" == "armeabi-v7a" ]]; then -#"32 bit" +#"32 bit ARM" BUILD_CONFIG=( --config=android_arm -c opt --copt=-fPIE --linkopt=-pie ) -else -#"64 bit" +elif [[ "$abi_version" == "arm64-v8a" ]]; then +#"64 bit ARM" BUILD_CONFIG=( --config=android_arm64 -c opt ) +elif [[ "$abi_version" == "x86_64" ]]; then +# x86_64 +BUILD_CONFIG=( --config=android_x86_64 -c opt ) +else +echo "Error: Unknown processor ABI" +exit 1 fi targets=($(bazel query 'tests('$test_target')')) diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD index 3e2b8855af9..8a22741f013 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD @@ -14,7 +14,6 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl/kernels:conv_common", "//tensorflow/lite/delegates/gpu/cl/kernels:conv_constants", "//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr", - "//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture", "//tensorflow/lite/delegates/gpu/cl/kernels:conv_weights_converter", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/cl/kernels:work_group_picking", @@ -82,7 +81,6 @@ cc_library( deps = [ "//tensorflow/lite/delegates/gpu/cl/kernels:conv_buffer_1x1", "//tensorflow/lite/delegates/gpu/cl/kernels:conv_powervr", - "//tensorflow/lite/delegates/gpu/cl/kernels:conv_texture", "//tensorflow/lite/delegates/gpu/cl/kernels:fully_connected", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/common:operations", @@ -110,6 +108,8 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl/kernels:elementwise", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/cl/kernels:mean_stddev_normalization", + "//tensorflow/lite/delegates/gpu/cl/kernels:reduce", + "//tensorflow/lite/delegates/gpu/cl/kernels:transpose", "//tensorflow/lite/delegates/gpu/cl/selectors:default_selector", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:model", @@ -130,6 +130,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl/kernels:add", "//tensorflow/lite/delegates/gpu/cl/kernels:concat_xy", "//tensorflow/lite/delegates/gpu/cl/kernels:concat_z", + "//tensorflow/lite/delegates/gpu/cl/kernels:depthwise_conv", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", 
"//tensorflow/lite/delegates/gpu/cl/kernels:lstm", "//tensorflow/lite/delegates/gpu/cl/kernels:max_unpooling", @@ -165,6 +166,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl:tensor_type", "//tensorflow/lite/delegates/gpu/cl/kernels:gpu_operation", "//tensorflow/lite/delegates/gpu/cl/kernels/special:depthwise_conv_plus_1x1_conv", + "//tensorflow/lite/delegates/gpu/cl/kernels/special:fc_fc_add", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:operations", diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc index eab957e28a6..a3282f05200 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.cc @@ -19,7 +19,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_constants.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_weights_converter.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" @@ -35,11 +34,11 @@ std::unique_ptr SelectConvolutionAdreno( const DeviceInfo& device_info, const OperationDef& op_def, ModelHints hints) { if (IsConvConstantsSupported(device_info, op_def, attr)) { - ConvConstants conv = CreateConvConstants(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + GPUOperation conv = CreateConvConstants(device_info, op_def, attr); + return absl::make_unique(std::move(conv)); } else { - ConvTexture conv = CreateConvTexture(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); + return absl::make_unique(std::move(conv)); } } @@ -47,8 +46,9 @@ std::unique_ptr SelectConvolutionWinogradAdreno( const Convolution2DAttributes& attr, const BHWC& dst_shape, const DeviceInfo& device_info, const OperationDef& op_def, ModelHints hints) { - ConvTexture conv = CreateConvTextureWino4x4To6x6(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + ConvPowerVR conv = + CreateConvPowerVRWino4x4To6x6(device_info, op_def, attr, &dst_shape); + return absl::make_unique(std::move(conv)); } std::unique_ptr SelectConvolutionDynamicWeightsAdreno( @@ -66,8 +66,8 @@ std::unique_ptr SelectConvolutionNVidia( const Convolution2DAttributes& attr, const BHWC& dst_shape, const DeviceInfo& device_info, const OperationDef& op_def) { if (IsConvConstantsSupported(device_info, op_def, attr)) { - ConvConstants conv = CreateConvConstants(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + GPUOperation conv = CreateConvConstants(device_info, op_def, attr); + return absl::make_unique(std::move(conv)); } else { ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); return absl::make_unique(std::move(conv)); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.cc index 2d61defe64b..b04335a4d7d 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.cc +++ 
b/tensorflow/lite/delegates/gpu/cl/selectors/dw_convolution_selector.cc @@ -33,8 +33,8 @@ std::unique_ptr SelectDWConvolutionAdreno( return absl::make_unique( CreateDepthwiseConv3x3(device_info, op_def, attr)); } else { - return absl::make_unique( - CreateDepthwiseConvolution(device_info, op_def, attr)); + return absl::make_unique( + CreateDepthwiseConvolution2D(device_info, op_def, attr)); } } @@ -45,8 +45,8 @@ std::unique_ptr SelectDWConvolutionPowerVR( return absl::make_unique( CreateDepthwiseConv3x3(device_info, op_def, attr)); } else { - return absl::make_unique( - CreateDepthwiseConvolution(device_info, op_def, attr)); + return absl::make_unique( + CreateDepthwiseConvolution2D(device_info, op_def, attr)); } } @@ -62,8 +62,8 @@ std::unique_ptr SelectDWConvolutionMali( return absl::make_unique( CreateDepthwiseConv3x3(device_info, op_def, attr)); } else { - return absl::make_unique( - CreateDepthwiseConvolution(device_info, op_def, attr)); + return absl::make_unique( + CreateDepthwiseConvolution2D(device_info, op_def, attr)); } } } // namespace diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc index 24c48d52f2a..6c6ee044cdd 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc @@ -18,7 +18,6 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_buffer_1x1.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/conv_powervr.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/conv_texture.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/fully_connected.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -31,8 +30,9 @@ std::unique_ptr SelectFullyConnectedGeneric( const FullyConnectedAttributes& attr, const DeviceInfo& device_info, const OperationDef& op_def, int batch_size) { if (op_def.IsBatchSupported()) { - ConvTexture conv = CreateConvTexture(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o); + ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); + return absl::make_unique(std::move(conv)); } else { FullyConnected fc = CreateFullyConnected(device_info, op_def, attr); return absl::make_unique(std::move(fc)); @@ -43,8 +43,9 @@ std::unique_ptr SelectFullyConnectedAdreno( const FullyConnectedAttributes& attr, const DeviceInfo& device_info, const OperationDef& op_def, int batch_size) { if (op_def.IsBatchSupported()) { - ConvTexture conv = CreateConvTexture(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o); + ConvPowerVR conv = CreateConvPowerVR(device_info, op_def, attr, &dst_shape); + return absl::make_unique(std::move(conv)); } else { FullyConnected fc = CreateFullyConnected(device_info, op_def, attr); return absl::make_unique(std::move(fc)); @@ -71,8 +72,10 @@ std::unique_ptr SelectFullyConnectedMali( ConvBuffer1x1 conv = CreateConvBuffer1x1(device_info, op_def, attr); return absl::make_unique(std::move(conv)); } else { - ConvTexture conv = CreateConvTexture(device_info, op_def, attr); - return absl::make_unique(std::move(conv)); + BHWC dst_shape = BHWC(batch_size, 1, 1, attr.weights.shape.o); + ConvPowerVR conv = + 
CreateConvPowerVR(device_info, op_def, attr, &dst_shape); + return absl::make_unique(std::move(conv)); } } else { FullyConnected fc = CreateFullyConnected(device_info, op_def, attr); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc index 5e8e4a9fea7..f7981fc67bb 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc @@ -20,6 +20,8 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/mean_stddev_normalization.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/reduce.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/transpose.h" #include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_selector.h" #include "tensorflow/lite/delegates/gpu/cl/selectors/convolution_transposed_selector.h" #include "tensorflow/lite/delegates/gpu/cl/selectors/default_selector.h" @@ -164,6 +166,80 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, return absl::UnimplementedError(absl::StrCat( "No support of ", node.operation.type, " with this parameters")); } + case OperationType::BATCHED_MATMUL: { + // Currently only batch = 1 is supported. + // Matmul replaced with this sequence: + // 1) Transpose second tensor(weights). (1x1xHxW)->(Wx1x1xH) + // 2) Convert second tensor(weights) from 1) to Convolution weights + // 3) Run usual convolution + auto second_shape = inputs[1]->tensor.shape; + auto dst_shape = outputs[0]->tensor.shape; + if (dst_shape.b != 1) { + return absl::UnimplementedError( + "Currently only batch = 1 supported for BATCHED_MATMUL."); + } + BHWC weights_shape(second_shape.c, 1, 1, second_shape.w); + Convolution2DAttributes attr; + attr.strides = HW(1, 1); + attr.dilations = HW(1, 1); + attr.padding.appended = HW(0, 0); + attr.padding.prepended = HW(0, 0); + attr.bias.shape = Linear(weights_shape.b); + attr.bias.data.resize(weights_shape.b, 0.0f); + + TensorDescriptor transposed_desc = {op_def.src_tensors[1].data_type, + op_def.src_tensors[1].storage_type, + Layout::BHWC}; + transposed_desc.storage_type = SelectBestStorageType( + device_info, weights_shape, transposed_desc.storage_type, + transposed_desc.data_type, transposed_desc.layout); + TensorDescriptor weights_desc = {op_def.src_tensors[1].data_type, + TensorStorageType::BUFFER, Layout::BHWC}; + gpu_subgraph->operations.clear(); + gpu_subgraph->operations.resize(3); + auto& transpose_op = gpu_subgraph->operations[0]; + auto& converter_op = gpu_subgraph->operations[1]; + auto& conv_op = gpu_subgraph->operations[2]; + conv_op.input_ids = {static_cast(inputs[0]->id), -1}; + conv_op.output_ids = {static_cast(outputs[0]->id)}; + OperationDef conv_def = op_def; + conv_def.src_tensors[1] = weights_desc; + ConvWeightsDescription conv_weights_desc; + conv_op.operation = SelectConvolutionWithDynamicWeights( + attr, weights_shape, dst_shape, device_info, conv_def, hints, + &conv_weights_desc); + + int aligned_output = + AlignByN(weights_shape.b, conv_weights_desc.output_group_size * 4); + int aligned_input = AlignByN(weights_shape.c, 4); + gpu_subgraph->new_tensors = {{BHWC(1, 1, 1, + aligned_output * aligned_input * + weights_shape.h * weights_shape.w), + weights_desc}, + {weights_shape, transposed_desc}}; + OperationDef converter_def; + converter_def.precision = op_def.precision; + 
converter_def.src_tensors.push_back(transposed_desc); + converter_def.dst_tensors.push_back(weights_desc); + + converter_op.input_ids = {-2}; + converter_op.output_ids = {-1}; + converter_op.operation = + SelectConverterToConvWeights(conv_weights_desc, converter_def, hints); + + OperationDef transpose_def; + transpose_def.precision = op_def.precision; + transpose_def.src_tensors.push_back(op_def.src_tensors[1]); + transpose_def.dst_tensors.push_back(transposed_desc); + + transpose_op.input_ids = {static_cast(inputs[1]->id)}; + transpose_op.output_ids = {-2}; + TransposeAttributes transpose_attr; + transpose_attr.perm = BHWC(3, 0, 1, 2); + transpose_op.operation = absl::make_unique( + CreateTranspose(transpose_def, transpose_attr)); + return absl::OkStatus(); + } case OperationType::CONCAT: { auto attr = absl::any_cast(node.operation.attributes); std::vector channels(inputs.size()); @@ -190,6 +266,10 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, } } else { auto weights_shape = inputs[1]->tensor.shape; + if (attr.bias.data.empty()) { + attr.bias.shape = Linear(weights_shape.b); + attr.bias.data.resize(weights_shape.b, 0.0f); + } TensorDescriptor weights_desc = {op_def.src_tensors[1].data_type, TensorStorageType::BUFFER, Layout::BHWC}; @@ -235,7 +315,16 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, case OperationType::DEPTHWISE_CONVOLUTION: { auto attr = absl::any_cast( node.operation.attributes); - *gpu_op = SelectDWConvolution(attr, device_info, op_def); + if (inputs.size() == 1) { + *gpu_op = SelectDWConvolution(attr, device_info, op_def); + } else { + if (inputs[1]->tensor.shape.b != 1) { + return absl::UnimplementedError( + "No support of depthwise runtime weights with channel multiplier " + "!= 1"); + } + *gpu_op = SelectDWConvolutionDynamicWeights(attr, device_info, op_def); + } return absl::OkStatus(); } case OperationType::FULLY_CONNECTED: { @@ -260,8 +349,8 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, return SelectMean(attr, op_def, device_info, gpu_op); } case OperationType::MEAN_STDDEV_NORMALIZATION: { - MeanStdDevNormalization operation = - CreateMeanStdDevNormalization(op_def, device_info); + MeanStdDevNormalization operation = CreateMeanStdDevNormalization( + op_def, device_info, (inputs[0]->tensor.shape.c + 3) / 4); *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); @@ -331,6 +420,7 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, case OperationType::EXP: case OperationType::HARD_SWISH: case OperationType::LOG: + case OperationType::NEG: case OperationType::RSQRT: case OperationType::SIGMOID: case OperationType::SIN: @@ -342,9 +432,15 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, return absl::OkStatus(); } case OperationType::DIV: + case OperationType::EQUAL: + case OperationType::GREATER: + case OperationType::GREATER_EQUAL: + case OperationType::LESS: + case OperationType::LESS_EQUAL: case OperationType::MAXIMUM: case OperationType::MINIMUM: case OperationType::MUL: + case OperationType::NOT_EQUAL: case OperationType::POW: case OperationType::SQUARED_DIFF: case OperationType::SUB: { @@ -364,6 +460,19 @@ absl::Status GPUOperationFromNode(const DeviceInfo& device_info, return absl::UnimplementedError(absl::StrCat( "No support of ", node.operation.type, " with this parameters")); } + case OperationType::REDUCE_MAXIMUM: + case OperationType::REDUCE_MINIMUM: + case OperationType::REDUCE_PRODUCT: + case OperationType::REDUCE_SUM: { + auto attr = 
absl::any_cast(node.operation.attributes); + if (attr.axis != Axis::CHANNELS) { + return absl::UnimplementedError( + "Currently we can reduce only in channels dimension."); + } + GPUOperation operation = CreateReduce(op_def, attr, op_type); + *gpu_op = absl::make_unique(std::move(operation)); + return absl::OkStatus(); + } default: return SelectDefault(device_info, op_def, hints, inputs, outputs, node, gpu_subgraph); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc index 4dbb1ffd734..713892f9902 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/kernels/add.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/concat_xy.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/concat_z.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/depthwise_conv.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/lstm.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/mean.h" @@ -110,6 +111,13 @@ absl::Status SelectConcat(const ConcatAttributes& attr, } } +std::unique_ptr SelectDWConvolutionDynamicWeights( + const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info, + const OperationDef& op_def) { + return absl::make_unique( + CreateDepthwiseConvolution2DDynamicWeights(device_info, op_def, attr)); +} + void SelectReshape(int src_channels, int dst_channels, const OperationDef& op_def, std::unique_ptr* ptr) { diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h index c6c604da982..084298442e3 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h @@ -57,6 +57,10 @@ absl::Status SelectConcat(const ConcatAttributes& attr, const DeviceInfo& device_info, std::unique_ptr* ptr); +std::unique_ptr SelectDWConvolutionDynamicWeights( + const DepthwiseConvolution2DAttributes& attr, const DeviceInfo& device_info, + const OperationDef& op_def); + void SelectReshape(int src_channels, int dst_channels, const OperationDef& op_def, std::unique_ptr* ptr); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.cc index 31480f231b0..631eabc4569 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.cc @@ -18,6 +18,7 @@ limitations under the License. 
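To make the BATCHED_MATMUL lowering in operation_selector.cc above easier to follow, here is the shape bookkeeping with arbitrary example dimensions (H = 64, W = 256 are assumptions for illustration only):

// Second input (the matmul "weights") arrives as BHWC 1 x 1 x 64 x 256.
BHWC second_shape(1, 1, 64, 256);
// Step 1: transpose with perm = BHWC(3, 0, 1, 2)  ->  256 x 1 x 1 x 64,
// which is exactly weights_shape below.
BHWC weights_shape(second_shape.c, 1, 1, second_shape.w);
// Step 2: convert that transposed tensor into convolution weights.
// Step 3: run a 1x1 convolution mapping 64 input channels to 256 output
// channels, i.e. the batch-1 matmul.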
#include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/special/depthwise_conv_plus_1x1_conv.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/special/fc_fc_add.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" @@ -39,6 +40,10 @@ absl::Status TryDepthwiseConvPlus1x1Conv( OperationType::DEPTHWISE_CONVOLUTION) { return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable."); } + auto dw_inputs = graph.FindInputs(dw_node->id); + if (dw_inputs.size() != 1) { + return absl::NotFoundError("DepthwiseConvPlus1x1Conv not suitable."); + } auto dw_outputs = graph.FindOutputs(dw_node->id); auto consumers = graph.FindConsumers(dw_outputs[0]->id); if (consumers.size() != 1) { @@ -59,7 +64,6 @@ absl::Status TryDepthwiseConvPlus1x1Conv( dw_node->operation.attributes); auto conv_attr = absl::any_cast(conv_node->operation.attributes); - auto dw_inputs = graph.FindInputs(dw_node->id); auto conv_outputs = graph.FindOutputs(conv_node->id); OperationDef op_def; op_def.precision = precision; @@ -82,22 +86,108 @@ absl::Status TryDepthwiseConvPlus1x1Conv( consumed_nodes->insert(conv_node->id); return absl::OkStatus(); } + +// fully connected + fully connected + add +absl::Status TryFCFCAdd( + const DeviceInfo& device_info, CalculationsPrecision precision, + const GraphFloat32& graph, NodeId first_node_id, + const std::map& tensor_descriptors, + std::set* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) { + auto* fc0_node = graph.GetNode(first_node_id); + if (OperationTypeFromString(fc0_node->operation.type) != + OperationType::FULLY_CONNECTED) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto fc0_inputs = graph.FindInputs(fc0_node->id); + if (fc0_inputs.size() != 1) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto fc0_output_id = graph.FindOutputs(fc0_node->id)[0]->id; + auto consumers = graph.FindConsumers(fc0_output_id); + if (consumers.size() != 1) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto* add_node = consumers[0]; + if (consumed_nodes->find(add_node->id) != consumed_nodes->end()) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + if (OperationTypeFromString(add_node->operation.type) != OperationType::ADD) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto add_inputs = graph.FindInputs(add_node->id); + if (add_inputs.size() != 2) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto fc1_output_id = add_inputs[0]->id + add_inputs[1]->id - fc0_output_id; + auto* fc1_node = graph.FindProducer(fc1_output_id); + if (OperationTypeFromString(fc1_node->operation.type) != + OperationType::FULLY_CONNECTED) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + if (consumed_nodes->find(fc1_node->id) != consumed_nodes->end()) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto fc1_inputs = graph.FindInputs(fc1_node->id); + if (fc1_inputs.size() != 1) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto fc0_attr = + absl::any_cast(fc0_node->operation.attributes); + auto fc1_attr = + absl::any_cast(fc1_node->operation.attributes); + if (fc0_attr.weights.shape.o != fc1_attr.weights.shape.o) { + return absl::NotFoundError("FCFCAdd not suitable."); + } + auto add_outputs = graph.FindOutputs(add_node->id); + + OperationDef op_def; + op_def.precision = precision; + 
auto it = tensor_descriptors.find(fc0_inputs[0]->id); + if (it != tensor_descriptors.end()) { + op_def.src_tensors.push_back(it->second); + } + it = tensor_descriptors.find(fc1_inputs[0]->id); + if (it != tensor_descriptors.end()) { + op_def.src_tensors.push_back(it->second); + } + it = tensor_descriptors.find(add_outputs[0]->id); + if (it != tensor_descriptors.end()) { + op_def.dst_tensors.push_back(it->second); + } + + for (int i = 0; i < fc1_inputs.size(); ++i) { + fc0_inputs.push_back(fc1_inputs[i]); + } + std::unique_ptr* gpu_op = + InitSingleOpSubgraph(fc0_inputs, add_outputs, gpu_subgraph); + FCFCAdd fc = CreateFCFCAdd(device_info, op_def, fc0_attr, fc1_attr); + *gpu_op = absl::make_unique(std::move(fc)); + consumed_nodes->insert(fc0_node->id); + consumed_nodes->insert(fc1_node->id); + consumed_nodes->insert(add_node->id); + return absl::OkStatus(); +} } // namespace absl::Status GPUSubgraphFromGraph( const DeviceInfo& device_info, CalculationsPrecision precision, const GraphFloat32& graph, NodeId first_node_id, const std::map& tensor_descriptors, - std::set* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph) { - if (!device_info.IsNvidia()) { - return absl::NotFoundError( - "Experimental feature, enabled for NVidia only, but device is not " - "nvidia gpu."); - } - if (TryDepthwiseConvPlus1x1Conv(precision, graph, first_node_id, + std::set* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph, + std::string* name) { + if ((device_info.IsAdreno() || device_info.IsNvidia()) && + TryDepthwiseConvPlus1x1Conv(precision, graph, first_node_id, tensor_descriptors, consumed_nodes, gpu_subgraph) .ok()) { + *name = "depthwise_conv_plus_1x1_conv"; + return absl::OkStatus(); + } + if ((device_info.IsIntel() || device_info.IsNvidia()) && + TryFCFCAdd(device_info, precision, graph, first_node_id, + tensor_descriptors, consumed_nodes, gpu_subgraph) + .ok()) { + *name = "fully_connected_x2_and_add"; return absl::OkStatus(); } return absl::NotFoundError("No special combination."); diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h b/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h index 3ea99b2515a..6091415e14c 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h +++ b/tensorflow/lite/delegates/gpu/cl/selectors/special_selector.h @@ -34,7 +34,8 @@ absl::Status GPUSubgraphFromGraph( const DeviceInfo& device_info, CalculationsPrecision precision, const GraphFloat32& graph, NodeId first_node_id, const std::map& tensor_descriptors, - std::set* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph); + std::set* consumed_nodes, GPUOperationsSubgraph* gpu_subgraph, + std::string* name); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.cc b/tensorflow/lite/delegates/gpu/cl/serialization.cc new file mode 100644 index 00000000000..3b52fc40bdf --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/serialization.cc @@ -0,0 +1,1049 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
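One detail of TryFCFCAdd above worth spelling out: the output id of the second fully connected node is recovered arithmetically rather than by searching the graph. The ADD node has exactly two inputs, one of which is fc0's output, so the other follows by subtraction; with hypothetical ids:

// add_inputs[0]->id == 17, add_inputs[1]->id == 42, fc0_output_id == 17
// => fc1_output_id == 17 + 42 - 17 == 42  (the id produced by fc1)
auto fc1_output_id = add_inputs[0]->id + add_inputs[1]->id - fc0_output_id;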
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/serialization.h" + +#include + +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" +#include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" +#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" +#include "tensorflow/lite/delegates/gpu/cl/precision.h" +#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" +#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "tensorflow/lite/delegates/gpu/cl/texture2d.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { +data::AccessType ToFB(AccessType type) { + switch (type) { + case AccessType::READ: + return data::AccessType::READ; + case AccessType::WRITE: + return data::AccessType::WRITE; + case AccessType::READ_WRITE: + return data::AccessType::READ_WRITE; + default: + return data::AccessType::READ_WRITE; + } +} + +data::DataType ToFB(DataType type) { + switch (type) { + case DataType::FLOAT16: + return data::DataType::FLOAT16; + case DataType::FLOAT32: + return data::DataType::FLOAT32; + default: + return data::DataType::UNKNOWN; + } +} + +data::MemoryType ToFB(MemoryType type) { + switch (type) { + case MemoryType::CONSTANT: + return data::MemoryType::CONSTANT; + case MemoryType::GLOBAL: + return data::MemoryType::GLOBAL; + case MemoryType::LOCAL: + return data::MemoryType::LOCAL; + } +} + +data::LinearStorageType ToFB(LinearStorageType type) { + switch (type) { + case LinearStorageType::BUFFER: + return data::LinearStorageType::BUFFER; + case LinearStorageType::TEXTURE_2D: + return data::LinearStorageType::TEXTURE_2D; + } +} + +data::TensorStorageType ToFB(TensorStorageType type) { + switch (type) { + case TensorStorageType::BUFFER: + return data::TensorStorageType::BUFFER; + case TensorStorageType::IMAGE_BUFFER: + return data::TensorStorageType::IMAGE_BUFFER; + case TensorStorageType::TEXTURE_2D: + return data::TensorStorageType::TEXTURE_2D; + case TensorStorageType::TEXTURE_ARRAY: + return data::TensorStorageType::TEXTURE_ARRAY; + case TensorStorageType::TEXTURE_3D: + return data::TensorStorageType::TEXTURE_3D; + case TensorStorageType::SINGLE_TEXTURE_2D: + return data::TensorStorageType::SINGLE_TEXTURE_2D; + case TensorStorageType::UNKNOWN: + return data::TensorStorageType::UNKNOWN; + } +} + +data::Layout ToFB(Layout type) { + switch (type) { + case Layout::HWC: + return data::Layout::HWC; + case Layout::BHWC: + return data::Layout::BHWC; + case Layout::HWDC: + return data::Layout::HWDC; + case Layout::BHWDC: + return data::Layout::BHWDC; + default: + return data::Layout::UNKNOWN; + } +} + +data::CalculationsPrecision ToFB(CalculationsPrecision type) { + switch (type) { + case CalculationsPrecision::F32: + return data::CalculationsPrecision::F32; + case CalculationsPrecision::F32_F16: + return data::CalculationsPrecision::F32_F16; + case CalculationsPrecision::F16: + return data::CalculationsPrecision::F16; + } +} + +data::TensorToGrid ToFB(TensorToGrid type) { + switch (type) { + case TensorToGrid::kCustom: + return data::TensorToGrid::CUSTOM; + case TensorToGrid::kWBToX_HDToY_SToZ: + return 
data::TensorToGrid::WB_TO_X_HD_TO_Y_S_TO_Z; + case TensorToGrid::kWBToX_HDToY_ZIs1: + return data::TensorToGrid::WB_TO_X_HD_TO_Y_Z_IS_1; + case TensorToGrid::kWBToX_HToY_DToZ: + return data::TensorToGrid::WB_TO_X_H_TO_Y_D_TO_Z; + case TensorToGrid::kBToX_YIs1_ZIs1: + return data::TensorToGrid::B_TO_X_Y_IS_1_Z_IS_1; + } +} + +data::CompilerOptions ToFB(CompilerOptions type) { + switch (type) { + case CompilerOptions::ADRENO_FULL_SIMD_LINE: + return data::CompilerOptions::ADRENO_FULL_SIMD_LINE; + case CompilerOptions::ADRENO_MORE_WAVES: + return data::CompilerOptions::ADRENO_MORE_WAVES; + case CompilerOptions::POWERVR_FP16: + return data::CompilerOptions::POWERVR_FP16; + case CompilerOptions::CL_OPT_DISABLE: + return data::CompilerOptions::CL_OPT_DISABLE; + case CompilerOptions::CL_2_0: + return data::CompilerOptions::CL_2_0; + case CompilerOptions::CL_3_0: + return data::CompilerOptions::CL_3_0; + } +} + +DataType ToEnum(data::DataType type) { + switch (type) { + case data::DataType::FLOAT16: + return DataType::FLOAT16; + case data::DataType::FLOAT32: + return DataType::FLOAT32; + default: + return DataType::UNKNOWN; + } +} + +AccessType ToEnum(data::AccessType type) { + switch (type) { + case data::AccessType::READ: + return AccessType::READ; + case data::AccessType::WRITE: + return AccessType::WRITE; + case data::AccessType::READ_WRITE: + return AccessType::READ_WRITE; + } +} + +MemoryType ToEnum(data::MemoryType type) { + switch (type) { + case data::MemoryType::CONSTANT: + return MemoryType::CONSTANT; + case data::MemoryType::GLOBAL: + return MemoryType::GLOBAL; + case data::MemoryType::LOCAL: + return MemoryType::LOCAL; + } +} + +LinearStorageType ToEnum(data::LinearStorageType type) { + switch (type) { + case data::LinearStorageType::BUFFER: + return LinearStorageType::BUFFER; + case data::LinearStorageType::TEXTURE_2D: + return LinearStorageType::TEXTURE_2D; + } +} + +TensorStorageType ToEnum(data::TensorStorageType type) { + switch (type) { + case data::TensorStorageType::BUFFER: + return TensorStorageType::BUFFER; + case data::TensorStorageType::IMAGE_BUFFER: + return TensorStorageType::IMAGE_BUFFER; + case data::TensorStorageType::TEXTURE_2D: + return TensorStorageType::TEXTURE_2D; + case data::TensorStorageType::TEXTURE_ARRAY: + return TensorStorageType::TEXTURE_ARRAY; + case data::TensorStorageType::TEXTURE_3D: + return TensorStorageType::TEXTURE_3D; + case data::TensorStorageType::SINGLE_TEXTURE_2D: + return TensorStorageType::SINGLE_TEXTURE_2D; + case data::TensorStorageType::UNKNOWN: + return TensorStorageType::UNKNOWN; + } +} + +Layout ToEnum(data::Layout type) { + switch (type) { + case data::Layout::HWC: + return Layout::HWC; + case data::Layout::BHWC: + return Layout::BHWC; + case data::Layout::HWDC: + return Layout::HWDC; + case data::Layout::BHWDC: + return Layout::BHWDC; + default: + return Layout::UNKNOWN; + } +} + +CalculationsPrecision ToEnum(data::CalculationsPrecision type) { + switch (type) { + case data::CalculationsPrecision::F32: + return CalculationsPrecision::F32; + case data::CalculationsPrecision::F32_F16: + return CalculationsPrecision::F32_F16; + case data::CalculationsPrecision::F16: + return CalculationsPrecision::F16; + } +} + +TensorToGrid ToEnum(data::TensorToGrid type) { + switch (type) { + case data::TensorToGrid::CUSTOM: + return TensorToGrid::kCustom; + case data::TensorToGrid::WB_TO_X_HD_TO_Y_S_TO_Z: + return TensorToGrid::kWBToX_HDToY_SToZ; + case data::TensorToGrid::WB_TO_X_HD_TO_Y_Z_IS_1: + return TensorToGrid::kWBToX_HDToY_ZIs1; + case 
data::TensorToGrid::WB_TO_X_H_TO_Y_D_TO_Z: + return TensorToGrid::kWBToX_HToY_DToZ; + case data::TensorToGrid::B_TO_X_Y_IS_1_Z_IS_1: + return TensorToGrid::kBToX_YIs1_ZIs1; + } +} + +CompilerOptions ToEnum(data::CompilerOptions type) { + switch (type) { + case data::CompilerOptions::ADRENO_FULL_SIMD_LINE: + return CompilerOptions::ADRENO_FULL_SIMD_LINE; + case data::CompilerOptions::ADRENO_MORE_WAVES: + return CompilerOptions::ADRENO_MORE_WAVES; + case data::CompilerOptions::POWERVR_FP16: + return CompilerOptions::POWERVR_FP16; + case data::CompilerOptions::CL_OPT_DISABLE: + return CompilerOptions::CL_OPT_DISABLE; + case data::CompilerOptions::CL_2_0: + return CompilerOptions::CL_2_0; + case data::CompilerOptions::CL_3_0: + return CompilerOptions::CL_3_0; + } +} + +} // namespace + +flatbuffers::Offset Encode( + const int2& v, flatbuffers::FlatBufferBuilder* builder) { + data::Int2Builder int2_builder(*builder); + int2_builder.add_x(v.x); + int2_builder.add_y(v.y); + return int2_builder.Finish(); +} + +flatbuffers::Offset Encode( + const int3& v, flatbuffers::FlatBufferBuilder* builder) { + data::Int3Builder int3_builder(*builder); + int3_builder.add_x(v.x); + int3_builder.add_y(v.y); + int3_builder.add_z(v.z); + return int3_builder.Finish(); +} + +flatbuffers::Offset Encode( + const GPUObjectDescriptor& desc, flatbuffers::FlatBufferBuilder* builder) { + std::vector> state_vars_fb; + for (auto& v0 : desc.state_vars_) { + auto key_fb = builder->CreateString(v0.first); + auto value_fb = builder->CreateString(v0.second); + data::StateVariableBuilder state_builder(*builder); + state_builder.add_key(key_fb); + state_builder.add_value(value_fb); + state_vars_fb.push_back(state_builder.Finish()); + } + auto state_vars_fb_vec = builder->CreateVector(state_vars_fb); + data::GPUObjectDescriptorBuilder obj_builder(*builder); + obj_builder.add_state_vars(state_vars_fb_vec); + obj_builder.add_access_type(ToFB(desc.access_type_)); + return obj_builder.Finish(); +} + +void Decode(const data::GPUObjectDescriptor* fb_obj, GPUObjectDescriptor* obj) { + obj->access_type_ = ToEnum(fb_obj->access_type()); + for (auto state_fb : *fb_obj->state_vars()) { + std::string key(state_fb->key()->c_str(), state_fb->key()->size()); + std::string value(state_fb->value()->c_str(), state_fb->value()->size()); + obj->state_vars_[key] = value; + } +} + +flatbuffers::Offset Encode( + const BufferDescriptor& desc, flatbuffers::FlatBufferBuilder* builder) { + auto obj_fb = + Encode(*static_cast(&desc), builder); + + std::vector> attributes_fb; + for (auto& attr : desc.attributes) { + attributes_fb.push_back(builder->CreateString(attr)); + } + auto attributes_fb_vec = builder->CreateVector(attributes_fb); + auto data_fb = builder->CreateVector(desc.data); + data::BufferDescriptorBuilder buf_builder(*builder); + buf_builder.add_base_obj(obj_fb); + buf_builder.add_element_type(ToFB(desc.element_type)); + buf_builder.add_element_size(desc.element_size); + buf_builder.add_memory_type(ToFB(desc.memory_type)); + buf_builder.add_attributes(attributes_fb_vec); + buf_builder.add_size(desc.size); + buf_builder.add_data(data_fb); + return buf_builder.Finish(); +} + +void Decode(const data::BufferDescriptor* fb_desc, BufferDescriptor* desc) { + Decode(fb_desc->base_obj(), desc); + desc->element_type = ToEnum(fb_desc->element_type()); + desc->element_size = fb_desc->element_size(); + desc->memory_type = ToEnum(fb_desc->memory_type()); + for (auto attr_fb : *fb_desc->attributes()) { + std::string attr(attr_fb->c_str(), attr_fb->size()); + 
desc->attributes.push_back(attr); + } + desc->size = fb_desc->size(); + desc->data = + std::vector(fb_desc->data()->data(), + fb_desc->data()->data() + fb_desc->data()->size()); +} + +flatbuffers::Offset Encode( + const Texture2DDescriptor& desc, flatbuffers::FlatBufferBuilder* builder) { + auto obj_fb = + Encode(*static_cast(&desc), builder); + + auto data_fb = builder->CreateVector(desc.data); + auto size_fb = Encode(desc.size, builder); + data::Texture2DDescriptorBuilder tex_builder(*builder); + tex_builder.add_base_obj(obj_fb); + tex_builder.add_element_type(ToFB(desc.element_type)); + tex_builder.add_normalized(desc.normalized); + tex_builder.add_normalized_type(ToFB(desc.normalized_type)); + tex_builder.add_size(size_fb); + tex_builder.add_data(data_fb); + return tex_builder.Finish(); +} + +void Decode(const data::Texture2DDescriptor* fb_desc, + Texture2DDescriptor* desc) { + Decode(fb_desc->base_obj(), desc); + desc->element_type = ToEnum(fb_desc->element_type()); + desc->normalized = fb_desc->normalized(); + desc->normalized_type = ToEnum(fb_desc->normalized_type()); + desc->size.x = fb_desc->size()->x(); + desc->size.y = fb_desc->size()->y(); + desc->data = + std::vector(fb_desc->data()->data(), + fb_desc->data()->data() + fb_desc->data()->size()); +} + +flatbuffers::Offset Encode( + const TensorLinearDescriptor& desc, + flatbuffers::FlatBufferBuilder* builder) { + auto obj_fb = + Encode(*static_cast(&desc), builder); + + auto data_fb = builder->CreateVector(desc.data); + data::TensorLinearDescriptorBuilder tensor_builder(*builder); + tensor_builder.add_base_obj(obj_fb); + tensor_builder.add_element_type(ToFB(desc.element_type)); + tensor_builder.add_storage_type(ToFB(desc.storage_type)); + tensor_builder.add_memory_type(ToFB(desc.memory_type)); + tensor_builder.add_size(desc.size); + tensor_builder.add_data(data_fb); + return tensor_builder.Finish(); +} + +void Decode(const data::TensorLinearDescriptor* fb_desc, + TensorLinearDescriptor* desc) { + Decode(fb_desc->base_obj(), desc); + desc->element_type = ToEnum(fb_desc->element_type()); + desc->storage_type = ToEnum(fb_desc->storage_type()); + desc->memory_type = ToEnum(fb_desc->memory_type()); + desc->size = fb_desc->size(); + desc->data = + std::vector(fb_desc->data()->data(), + fb_desc->data()->data() + fb_desc->data()->size()); +} + +flatbuffers::Offset Encode( + const TensorDescriptor& desc, flatbuffers::FlatBufferBuilder* builder) { + auto obj_fb = + Encode(*static_cast(&desc), builder); + + data::BHWDCBuilder shape_builder(*builder); + shape_builder.add_b(desc.shape.b); + shape_builder.add_h(desc.shape.h); + shape_builder.add_w(desc.shape.w); + shape_builder.add_d(desc.shape.d); + shape_builder.add_c(desc.shape.c); + auto shape_fb = shape_builder.Finish(); + + auto data_fb = builder->CreateVector(desc.data); + data::TensorDescriptorBuilder tensor_builder(*builder); + tensor_builder.add_base_obj(obj_fb); + tensor_builder.add_data_type(ToFB(desc.data_type)); + tensor_builder.add_storage_type(ToFB(desc.storage_type)); + tensor_builder.add_layout(ToFB(desc.layout)); + tensor_builder.add_shape(shape_fb); + tensor_builder.add_data(data_fb); + return tensor_builder.Finish(); +} + +void Decode(const data::TensorDescriptor* fb_desc, TensorDescriptor* desc) { + Decode(fb_desc->base_obj(), desc); + desc->data_type = ToEnum(fb_desc->data_type()); + desc->storage_type = ToEnum(fb_desc->storage_type()); + desc->layout = ToEnum(fb_desc->layout()); + desc->shape.b = fb_desc->shape()->b(); + desc->shape.h = fb_desc->shape()->h(); + 
desc->shape.w = fb_desc->shape()->w(); + desc->shape.d = fb_desc->shape()->d(); + desc->shape.c = fb_desc->shape()->c(); + desc->data = + std::vector(fb_desc->data()->data(), + fb_desc->data()->data() + fb_desc->data()->size()); +} + +flatbuffers::Offset Encode( + const OperationDef& def, flatbuffers::FlatBufferBuilder* builder) { + std::vector> src_tensors_fb; + for (auto& desc : def.src_tensors) { + auto desc_fb = Encode(desc, builder); + src_tensors_fb.push_back(desc_fb); + } + + std::vector> dst_tensors_fb; + for (auto& desc : def.dst_tensors) { + auto desc_fb = Encode(desc, builder); + dst_tensors_fb.push_back(desc_fb); + } + + auto src_tensors_fb_vec = builder->CreateVector(src_tensors_fb); + auto dst_tensors_fb_vec = builder->CreateVector(dst_tensors_fb); + + data::OperationDefBuilder def_builder(*builder); + def_builder.add_precision(ToFB(def.precision)); + def_builder.add_src_tensors(src_tensors_fb_vec); + def_builder.add_dst_tensors(dst_tensors_fb_vec); + return def_builder.Finish(); +} + +void Decode(const data::OperationDef* fb_def, OperationDef* def) { + for (auto src_fb : *fb_def->src_tensors()) { + TensorDescriptor desc; + Decode(src_fb, &desc); + def->src_tensors.push_back(std::move(desc)); + } + for (auto dst_fb : *fb_def->dst_tensors()) { + TensorDescriptor desc; + Decode(dst_fb, &desc); + def->dst_tensors.push_back(std::move(desc)); + } + def->precision = ToEnum(fb_def->precision()); +} + +flatbuffers::Offset Encode( + const TensorDescriptor& desc, const ValueId& id, + flatbuffers::FlatBufferBuilder* builder) { + auto desc_fb = Encode(desc, builder); + data::TensorDescWithIdBuilder desc_builder(*builder); + desc_builder.add_desc(desc_fb); + desc_builder.add_id(id); + return desc_builder.Finish(); +} + +void Decode(const data::TensorDescWithId* fb_desc, TensorDescriptor* desc, + ValueId* id) { + Decode(fb_desc->desc(), desc); + *id = fb_desc->id(); +} + +absl::Status Decode(CLContext* context, const data::Arguments* fb_args, + Arguments* args) { + args->shared_int4s_data_ = std::vector( + fb_args->shared_int4s()->data(), + fb_args->shared_int4s()->data() + fb_args->shared_int4s()->size()); + + args->shared_float4s_data_ = std::vector( + fb_args->shared_float4s()->data(), + fb_args->shared_float4s()->data() + fb_args->shared_float4s()->size()); + + std::vector tmp = std::vector( + fb_args->shared_half4s()->data(), + fb_args->shared_half4s()->data() + fb_args->shared_half4s()->size()); + + args->shared_half4s_data_.resize(tmp.size()); + for (int i = 0; i < tmp.size(); ++i) { + args->shared_half4s_data_[i] = tmp[i]; + } + + args->int_values_.clear(); + for (auto int_values_fb : *fb_args->int_values()) { + Arguments::IntValue value; + value.value = int_values_fb->value(); + value.offset = int_values_fb->offset(); + value.active = int_values_fb->active(); + std::string name(int_values_fb->name()->c_str(), + int_values_fb->name()->size()); + args->int_values_[name] = value; + } + + args->float_values_.clear(); + for (auto float_values_fb : *fb_args->float_values()) { + Arguments::FloatValue value; + value.value = float_values_fb->value(); + value.offset = float_values_fb->offset(); + value.active = float_values_fb->active(); + std::string name(float_values_fb->name()->c_str(), + float_values_fb->name()->size()); + args->float_values_[name] = value; + } + + args->half_values_.clear(); + for (auto half_values_fb : *fb_args->half_values()) { + Arguments::HalfValue value; + value.value = half_values_fb->value(); + value.offset = half_values_fb->offset(); + value.active = 
half_values_fb->active(); + value.store_as_f32 = half_values_fb->store_as_f32(); + std::string name(half_values_fb->name()->c_str(), + half_values_fb->name()->size()); + args->half_values_[name] = value; + } + + for (auto buffer_pair_fb : *fb_args->buffer_objects()) { + std::string key(buffer_pair_fb->key()->c_str(), + buffer_pair_fb->key()->size()); + BufferDescriptor desc; + Decode(buffer_pair_fb->value(), &desc); + args->AddObject(key, absl::make_unique(std::move(desc))); + } + + for (auto texture_pair_fb : *fb_args->texture2d_objects()) { + std::string key(texture_pair_fb->key()->c_str(), + texture_pair_fb->key()->size()); + Texture2DDescriptor desc; + Decode(texture_pair_fb->value(), &desc); + args->AddObject(key, + absl::make_unique(std::move(desc))); + } + + for (auto tensor_pair_fb : *fb_args->tensor_linear_objects()) { + std::string key(tensor_pair_fb->key()->c_str(), + tensor_pair_fb->key()->size()); + TensorLinearDescriptor desc; + Decode(tensor_pair_fb->value(), &desc); + args->AddObject(key, + absl::make_unique(std::move(desc))); + } + + for (auto tensor_pair_fb : *fb_args->tensor_objects()) { + std::string key(tensor_pair_fb->key()->c_str(), + tensor_pair_fb->key()->size()); + TensorDescriptor desc; + Decode(tensor_pair_fb->value(), &desc); + args->AddObject(key, absl::make_unique(std::move(desc))); + } + + for (auto buffer_pair_fb : *fb_args->buffer_refs()) { + std::string key(buffer_pair_fb->key()->c_str(), + buffer_pair_fb->key()->size()); + BufferDescriptor desc; + Decode(buffer_pair_fb->value(), &desc); + auto access_type = desc.GetAccess(); + args->AddObjectRef(key, access_type, + absl::make_unique(std::move(desc))); + } + + for (auto texture_pair_fb : *fb_args->texture2d_refs()) { + std::string key(texture_pair_fb->key()->c_str(), + texture_pair_fb->key()->size()); + Texture2DDescriptor desc; + Decode(texture_pair_fb->value(), &desc); + auto access_type = desc.GetAccess(); + args->AddObjectRef(key, access_type, + absl::make_unique(std::move(desc))); + } + + for (auto tensor_pair_fb : *fb_args->tensor_linear_refs()) { + std::string key(tensor_pair_fb->key()->c_str(), + tensor_pair_fb->key()->size()); + TensorLinearDescriptor desc; + Decode(tensor_pair_fb->value(), &desc); + auto access_type = desc.GetAccess(); + args->AddObjectRef( + key, access_type, + absl::make_unique(std::move(desc))); + } + + for (auto tensor_pair_fb : *fb_args->tensor_refs()) { + std::string key(tensor_pair_fb->key()->c_str(), + tensor_pair_fb->key()->size()); + TensorDescriptor desc; + Decode(tensor_pair_fb->value(), &desc); + auto access_type = desc.GetAccess(); + args->AddObjectRef(key, access_type, + absl::make_unique(std::move(desc))); + } + + RETURN_IF_ERROR(args->AllocateObjects(context)); + RETURN_IF_ERROR(args->AddObjectArgs()); + return absl::OkStatus(); +} + +flatbuffers::Offset Encode( + const Arguments& args, flatbuffers::FlatBufferBuilder* builder) { + std::vector> int_values_fb; + for (auto& value : args.int_values_) { + auto name_fb = builder->CreateString(value.first); + data::IntValueBuilder value_builder(*builder); + value_builder.add_name(name_fb); + value_builder.add_value(value.second.value); + value_builder.add_offset(value.second.offset); + value_builder.add_active(value.second.active); + int_values_fb.push_back(value_builder.Finish()); + } + + std::vector> float_values_fb; + for (auto& value : args.float_values_) { + auto name_fb = builder->CreateString(value.first); + data::FloatValueBuilder value_builder(*builder); + value_builder.add_name(name_fb); + 
value_builder.add_value(value.second.value); + value_builder.add_offset(value.second.offset); + value_builder.add_active(value.second.active); + float_values_fb.push_back(value_builder.Finish()); + } + + std::vector> half_values_fb; + for (auto& value : args.half_values_) { + auto name_fb = builder->CreateString(value.first); + data::HalfValueBuilder value_builder(*builder); + value_builder.add_name(name_fb); + value_builder.add_value(value.second.value); + value_builder.add_offset(value.second.offset); + value_builder.add_active(value.second.active); + value_builder.add_store_as_f32(value.second.store_as_f32); + half_values_fb.push_back(value_builder.Finish()); + } + + std::vector> + buffer_objs_fb; + for (auto& value : args.objects_) { + const auto* buffer_desc = + dynamic_cast(value.second.descriptor.get()); + if (!buffer_desc) continue; + auto desc_fb = Encode(*buffer_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::BufferDescriptorMapValueBuilder buf_map_builder(*builder); + buf_map_builder.add_key(key_fb); + buf_map_builder.add_value(desc_fb); + buffer_objs_fb.push_back(buf_map_builder.Finish()); + } + std::vector> + texture2d_objs_fb; + for (auto& value : args.objects_) { + const auto* texture_desc = + dynamic_cast(value.second.descriptor.get()); + if (!texture_desc) continue; + auto desc_fb = Encode(*texture_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::Texture2DDescriptorMapValueBuilder tex_map_builder(*builder); + tex_map_builder.add_key(key_fb); + tex_map_builder.add_value(desc_fb); + texture2d_objs_fb.push_back(tex_map_builder.Finish()); + } + std::vector> + tensor_linear_objs_fb; + for (auto& value : args.objects_) { + const auto* tensor_desc = dynamic_cast( + value.second.descriptor.get()); + if (!tensor_desc) continue; + auto desc_fb = Encode(*tensor_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::TensorLinearDescriptorMapValueBuilder ten_map_builder(*builder); + ten_map_builder.add_key(key_fb); + ten_map_builder.add_value(desc_fb); + tensor_linear_objs_fb.push_back(ten_map_builder.Finish()); + } + std::vector> + tensor_objs_fb; + for (auto& value : args.objects_) { + const auto* tensor_desc = + dynamic_cast(value.second.descriptor.get()); + if (!tensor_desc) continue; + auto desc_fb = Encode(*tensor_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::TensorDescriptorMapValueBuilder ten_map_builder(*builder); + ten_map_builder.add_key(key_fb); + ten_map_builder.add_value(desc_fb); + tensor_objs_fb.push_back(ten_map_builder.Finish()); + } + + std::vector> + buffer_refs_fb; + for (auto& value : args.object_refs_) { + const auto* buffer_desc = + dynamic_cast(value.second.descriptor.get()); + if (!buffer_desc) continue; + auto desc_fb = Encode(*buffer_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::BufferDescriptorMapValueBuilder buf_map_builder(*builder); + buf_map_builder.add_key(key_fb); + buf_map_builder.add_value(desc_fb); + buffer_refs_fb.push_back(buf_map_builder.Finish()); + } + std::vector> + texture2d_refs_fb; + for (auto& value : args.object_refs_) { + const auto* texture_desc = + dynamic_cast(value.second.descriptor.get()); + if (!texture_desc) continue; + auto desc_fb = Encode(*texture_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::Texture2DDescriptorMapValueBuilder tex_map_builder(*builder); + tex_map_builder.add_key(key_fb); + tex_map_builder.add_value(desc_fb); + 
texture2d_refs_fb.push_back(tex_map_builder.Finish()); + } + std::vector> + tensor_linear_refs_fb; + for (auto& value : args.object_refs_) { + const auto* tensor_desc = dynamic_cast( + value.second.descriptor.get()); + if (!tensor_desc) continue; + auto desc_fb = Encode(*tensor_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::TensorLinearDescriptorMapValueBuilder ten_map_builder(*builder); + ten_map_builder.add_key(key_fb); + ten_map_builder.add_value(desc_fb); + tensor_linear_refs_fb.push_back(ten_map_builder.Finish()); + } + std::vector> + tensor_refs_fb; + for (auto& value : args.object_refs_) { + const auto* tensor_desc = + dynamic_cast(value.second.descriptor.get()); + if (!tensor_desc) continue; + auto desc_fb = Encode(*tensor_desc, builder); + auto key_fb = builder->CreateString(value.first); + data::TensorDescriptorMapValueBuilder ten_map_builder(*builder); + ten_map_builder.add_key(key_fb); + ten_map_builder.add_value(desc_fb); + tensor_refs_fb.push_back(ten_map_builder.Finish()); + } + + auto shared_int4s_data_fb = builder->CreateVector(args.shared_int4s_data_); + auto shared_float4s_data_fb = + builder->CreateVector(args.shared_float4s_data_); + std::vector tmp(args.shared_half4s_data_.size()); + for (int i = 0; i < tmp.size(); ++i) { + tmp[i] = args.shared_half4s_data_[i]; + } + auto shared_half4s_data_fb = builder->CreateVector(tmp); + auto int_values_fb_vec = builder->CreateVector(int_values_fb); + auto float_values_fb_vec = builder->CreateVector(float_values_fb); + auto half_values_fb_vec = builder->CreateVector(half_values_fb); + auto buffer_objs_fb_vec = builder->CreateVector(buffer_objs_fb); + auto texture2d_objs_fb_vec = builder->CreateVector(texture2d_objs_fb); + auto tensor_linear_objs_fb_vec = builder->CreateVector(tensor_linear_objs_fb); + auto tensor_objs_fb_vec = builder->CreateVector(tensor_objs_fb); + auto buffer_refs_fb_vec = builder->CreateVector(buffer_refs_fb); + auto texture2d_refs_fb_vec = builder->CreateVector(texture2d_refs_fb); + auto tensor_linear_refs_fb_vec = builder->CreateVector(tensor_linear_refs_fb); + auto tensor_refs_fb_vec = builder->CreateVector(tensor_refs_fb); + data::ArgumentsBuilder arguments_builder(*builder); + arguments_builder.add_shared_int4s(shared_int4s_data_fb); + arguments_builder.add_shared_float4s(shared_float4s_data_fb); + arguments_builder.add_shared_half4s(shared_half4s_data_fb); + arguments_builder.add_int_values(int_values_fb_vec); + arguments_builder.add_float_values(float_values_fb_vec); + arguments_builder.add_half_values(half_values_fb_vec); + arguments_builder.add_buffer_objects(buffer_objs_fb_vec); + arguments_builder.add_texture2d_objects(texture2d_objs_fb_vec); + arguments_builder.add_tensor_linear_objects(tensor_linear_objs_fb_vec); + arguments_builder.add_tensor_objects(tensor_objs_fb_vec); + arguments_builder.add_buffer_refs(buffer_refs_fb_vec); + arguments_builder.add_texture2d_refs(texture2d_refs_fb_vec); + arguments_builder.add_tensor_linear_refs(tensor_linear_refs_fb_vec); + arguments_builder.add_tensor_refs(tensor_refs_fb_vec); + return arguments_builder.Finish(); +} + +absl::Status Decode(CLContext* context, const data::GPUOperation* fb_op, + GPUOperation* op) { + RETURN_IF_ERROR(Decode(context, fb_op->arguments(), &op->args_)); + op->code_ = std::string(fb_op->code()->c_str(), fb_op->code()->size()); + op->work_group_size_.x = fb_op->work_group_size()->x(); + op->work_group_size_.y = fb_op->work_group_size()->y(); + op->work_group_size_.z = fb_op->work_group_size()->z(); + for 
(auto option_fb : *fb_op->compiler_options()) { + op->compiler_options_.push_back(ToEnum(option_fb->option())); + } + op->tensor_to_grid_ = ToEnum(fb_op->tensor_to_grid()); + op->elementwise_ = fb_op->elementwise(); + op->linkable_ = fb_op->linkable(); + op->check_src_channels_size_ = fb_op->check_src_channels_size(); + Decode(fb_op->definition(), &op->definition_); + op->grid_dimension_ = fb_op->grid_dimension(); + op->work_group_launch_order_.x = fb_op->work_group_launch_order()->x(); + op->work_group_launch_order_.y = fb_op->work_group_launch_order()->y(); + op->work_group_launch_order_.z = fb_op->work_group_launch_order()->z(); + op->grid_size_.x = fb_op->grid_size()->x(); + op->grid_size_.y = fb_op->grid_size()->y(); + op->grid_size_.z = fb_op->grid_size()->z(); + for (auto name_fb : *fb_op->src_tensors_names()) { + std::string name(name_fb->c_str(), name_fb->size()); + op->src_tensors_names_.push_back(std::move(name)); + } + for (auto name_fb : *fb_op->dst_tensors_names()) { + std::string name(name_fb->c_str(), name_fb->size()); + op->dst_tensors_names_.push_back(std::move(name)); + } + op->work_groups_count_.x = fb_op->work_groups_count()->x(); + op->work_groups_count_.y = fb_op->work_groups_count()->y(); + op->work_groups_count_.z = fb_op->work_groups_count()->z(); + op->linkable_count_ = fb_op->linkable_count(); + op->elementwise_code_ = std::string(fb_op->elementwise_code()->c_str(), + fb_op->elementwise_code()->size()); + return absl::OkStatus(); +} + +flatbuffers::Offset Encode( + const GPUOperation& op, flatbuffers::FlatBufferBuilder* builder) { + auto args_fb = Encode(op.args_, builder); + auto code_fb = builder->CreateString(op.code_); + auto work_group_size_fb = Encode(op.work_group_size_, builder); + std::vector> compiler_options_fb; + for (int i = 0; i < op.compiler_options_.size(); ++i) { + data::CompilerOptionBuilder option_builder(*builder); + option_builder.add_option(ToFB(op.compiler_options_[i])); + compiler_options_fb.push_back(option_builder.Finish()); + } + auto compiler_options_fb_vec = builder->CreateVector(compiler_options_fb); + + auto def_fb = Encode(op.definition_, builder); + auto work_group_launch_order_fb = + Encode(op.work_group_launch_order_, builder); + auto grid_size_fb = Encode(op.grid_size_, builder); + auto work_groups_count_fb = Encode(op.work_groups_count_, builder); + + std::vector> src_names_fb; + for (auto& name : op.src_tensors_names_) { + src_names_fb.push_back(builder->CreateString(name)); + } + auto src_names_fb_vec = builder->CreateVector(src_names_fb); + + std::vector> dst_names_fb; + for (auto& name : op.dst_tensors_names_) { + dst_names_fb.push_back(builder->CreateString(name)); + } + auto dst_names_fb_vec = builder->CreateVector(dst_names_fb); + + auto elementwise_code_fb = builder->CreateString(op.elementwise_code_); + + data::GPUOperationBuilder op_builder(*builder); + op_builder.add_arguments(args_fb); + op_builder.add_code(code_fb); + op_builder.add_work_group_size(work_group_size_fb); + op_builder.add_compiler_options(compiler_options_fb_vec); + op_builder.add_tensor_to_grid(ToFB(op.tensor_to_grid_)); + op_builder.add_elementwise(op.elementwise_); + op_builder.add_linkable(op.linkable_); + op_builder.add_check_src_channels_size(op.check_src_channels_size_); + op_builder.add_definition(def_fb); + op_builder.add_grid_dimension(op.grid_dimension_); + op_builder.add_work_group_launch_order(work_group_launch_order_fb); + op_builder.add_grid_size(grid_size_fb); + op_builder.add_src_tensors_names(src_names_fb_vec); + 
op_builder.add_dst_tensors_names(dst_names_fb_vec); + op_builder.add_work_groups_count(work_groups_count_fb); + op_builder.add_linkable_count(op.linkable_count_); + op_builder.add_elementwise_code(elementwise_code_fb); + return op_builder.Finish(); +} + +flatbuffers::Offset Encode( + const CLNode& node, flatbuffers::FlatBufferBuilder* builder) { + auto op_fb = Encode(*node.operation, builder); + std::vector in_ids(node.inputs.size()); + for (int i = 0; i < in_ids.size(); ++i) { + in_ids[i] = node.inputs[i]; + } + std::vector out_ids(node.outputs.size()); + for (int i = 0; i < out_ids.size(); ++i) { + out_ids[i] = node.outputs[i]; + } + auto in_ids_fb = builder->CreateVector(in_ids); + auto out_ids_fb = builder->CreateVector(out_ids); + auto name_fb = builder->CreateString(node.name); + data::CLNodeBuilder node_builder(*builder); + node_builder.add_gpu_op(op_fb); + node_builder.add_input_ids(in_ids_fb); + node_builder.add_output_ids(out_ids_fb); + node_builder.add_name(name_fb); + return node_builder.Finish(); +} + +absl::Status Decode(CLContext* context, const data::CLNode* fb_node, + CLNode* node) { + GPUOperation op; + RETURN_IF_ERROR(Decode(context, fb_node->gpu_op(), &op)); + node->operation = absl::make_unique(std::move(op)); + for (auto in_fb : *fb_node->input_ids()) { + node->inputs.push_back(in_fb); + } + for (auto out_fb : *fb_node->output_ids()) { + node->outputs.push_back(out_fb); + } + node->name = std::string(fb_node->name()->c_str(), fb_node->name()->size()); + + return absl::OkStatus(); +} + +flatbuffers::Offset Encode( + const InferenceContext& inference, + flatbuffers::FlatBufferBuilder* builder) { + std::vector in_ids(inference.input_ids_.size()); + for (int i = 0; i < in_ids.size(); ++i) { + in_ids[i] = inference.input_ids_[i]; + } + std::vector out_ids(inference.output_ids_.size()); + for (int i = 0; i < out_ids.size(); ++i) { + out_ids[i] = inference.output_ids_[i]; + } + auto in_ids_fb = builder->CreateVector(in_ids); + auto out_ids_fb = builder->CreateVector(out_ids); + + std::vector> nodes_fb; + for (int i = 0; i < inference.nodes_.size(); ++i) { + auto node_fb = Encode(inference.nodes_[i], builder); + nodes_fb.push_back(node_fb); + } + auto nodes_fb_vec = builder->CreateVector(nodes_fb); + + std::vector> tensors_fb; + auto tensors = inference.tensor_reserver_.GetTensorDescs(); + for (auto& tensor : tensors) { + auto tensor_fb = Encode(tensor.second, tensor.first, builder); + tensors_fb.push_back(tensor_fb); + } + auto tensors_fb_vec = builder->CreateVector(tensors_fb); + + std::vector> + variable_ids_and_refs_fb; + for (auto& pair : inference.variable_ids_and_refs_) { + data::PairOfValueIdsBuilder pair_builder(*builder); + pair_builder.add_first(pair.first); + pair_builder.add_second(pair.second); + variable_ids_and_refs_fb.push_back(pair_builder.Finish()); + } + auto variable_ids_and_refs_fb_vec = + builder->CreateVector(variable_ids_and_refs_fb); + + data::InferenceContextBuilder inf_builder(*builder); + inf_builder.add_need_flush(inference.need_flush_); + inf_builder.add_flush_periodically(inference.flush_periodically_); + inf_builder.add_flush_period(inference.flush_period_); + inf_builder.add_need_manual_release(inference.need_manual_release_); + inf_builder.add_precision(ToFB(inference.precision_)); + inf_builder.add_storage_type(ToFB(inference.storage_type_)); + inf_builder.add_nodes(nodes_fb_vec); + inf_builder.add_tensors(tensors_fb_vec); + inf_builder.add_input_ids(in_ids_fb); + inf_builder.add_output_ids(out_ids_fb); + 
inf_builder.add_variable_ids_and_refs(variable_ids_and_refs_fb_vec); + return inf_builder.Finish(); +} + +absl::Status Decode(CLContext* context, + const data::InferenceContext* fb_inference, + InferenceContext* inference) { + inference->need_flush_ = fb_inference->need_flush(); + inference->flush_periodically_ = fb_inference->flush_periodically(); + inference->flush_period_ = fb_inference->flush_period(); + inference->need_manual_release_ = fb_inference->need_manual_release(); + inference->precision_ = ToEnum(fb_inference->precision()); + inference->storage_type_ = ToEnum(fb_inference->storage_type()); + + inference->nodes_.resize(fb_inference->nodes()->size()); + int counter = 0; + for (auto node_fb : *fb_inference->nodes()) { + RETURN_IF_ERROR(Decode(context, node_fb, &inference->nodes_[counter])); + counter++; + } + + std::vector> tensors; + for (auto tensor_fb : *fb_inference->tensors()) { + TensorDescriptor desc; + Decode(tensor_fb->desc(), &desc); + tensors.push_back({tensor_fb->id(), std::move(desc)}); + } + inference->tensor_reserver_.Add(tensors); + for (auto in_fb : *fb_inference->input_ids()) { + inference->input_ids_.push_back(in_fb); + } + for (auto out_fb : *fb_inference->output_ids()) { + inference->output_ids_.push_back(out_fb); + } + + for (auto variable_id : *fb_inference->variable_ids_and_refs()) { + inference->variable_ids_and_refs_[variable_id->first()] = + variable_id->second(); + } + return absl::OkStatus(); +} + +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.fbs b/tensorflow/lite/delegates/gpu/cl/serialization.fbs new file mode 100644 index 00000000000..0c0d2241b5a --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/serialization.fbs @@ -0,0 +1,278 @@ +// Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
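+// This schema captures a compiled OpenCL inference graph: its CLNodes and
+// GPUOperations together with their Arguments and tensor descriptors. The
+// Encode()/Decode() helpers in serialization.cc convert between these tables
+// and the C++ runtime structures; InferenceContext is the root table.
+//
+// Rough round-trip sketch (uses the flatc-generated serialization_generated.h;
+// variable names below are illustrative only):
+//
+//   flatbuffers::FlatBufferBuilder builder;
+//   auto encoded = Encode(inference_context, &builder);
+//   builder.Finish(encoded);
+//   const auto* fb = flatbuffers::GetRoot<data::InferenceContext>(
+//       builder.GetBufferPointer());
+//   InferenceContext restored;
+//   RETURN_IF_ERROR(Decode(&cl_context, fb, &restored));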
+ +namespace tflite.gpu.cl.data; + +table Int4 { + x:int32; + y:int32; + z:int32; + w:int32; +} + +table Int3 { + x:int32; + y:int32; + z:int32; +} + +table Int2 { + x:int32; + y:int32; +} + +table IntValue { + name:string; + value:int32; + active:bool; + offset:uint32; +} + +table FloatValue { + name:string; + value:float; + active:bool; + offset:uint32; +} + +table HalfValue { + name:string; + value:float; + active:bool; + store_as_f32:bool; + offset:uint32; +} + +enum AccessType : byte { + READ = 0, + WRITE = 1, + READ_WRITE = 2, +} + +enum DataType : byte { + UNKNOWN = 0, + FLOAT32 = 1, + FLOAT16 = 2, +} + +enum MemoryType : byte { + GLOBAL = 0, + CONSTANT = 1, + LOCAL = 2, +} + +table StateVariable { + key:string; + value:string; +} + +table GPUObjectDescriptor { + state_vars:[StateVariable]; + access_type:AccessType; +} + +table BufferDescriptor { + base_obj:GPUObjectDescriptor; + element_type:DataType; + element_size:int32; + memory_type:MemoryType; + attributes:[string]; + size:int32; + data:[uint8]; +} + +table Texture2DDescriptor { + base_obj:GPUObjectDescriptor; + element_type:DataType; + normalized:bool; + normalized_type:DataType; + size:Int2; + data:[uint8]; +} + +enum LinearStorageType : byte { + BUFFER = 0, + TEXTURE_2D = 1, +} + +table TensorLinearDescriptor { + base_obj:GPUObjectDescriptor; + storage_type:LinearStorageType; + element_type:DataType; + memory_type:MemoryType; + size:int32; + data:[uint8]; +} + +enum TensorStorageType : byte { + UNKNOWN = 0, + BUFFER = 1, + IMAGE_BUFFER = 2, + TEXTURE_2D = 3, + TEXTURE_3D = 4, + TEXTURE_ARRAY = 5, + SINGLE_TEXTURE_2D = 6, +} + +enum Layout : byte { + UNKNOWN = 0, + HWC = 1, + BHWC = 2, + HWDC = 3, + BHWDC = 4, +} + +table BHWDC { + b:int32; + h:int32; + w:int32; + d:int32; + c:int32; +} + +table TensorDescriptor { + base_obj:GPUObjectDescriptor; + data_type:DataType; + storage_type:TensorStorageType; + layout:Layout; + shape:BHWDC; + data:[uint8]; +} + +table BufferDescriptorMapValue { + key:string; + value:BufferDescriptor; +} + +table Texture2DDescriptorMapValue { + key:string; + value:Texture2DDescriptor; +} + +table TensorLinearDescriptorMapValue { + key:string; + value:TensorLinearDescriptor; +} + +table TensorDescriptorMapValue { + key:string; + value:TensorDescriptor; +} + +table Arguments { + int_values:[IntValue]; + shared_int4s:[int32]; + + float_values:[FloatValue]; + shared_float4s:[float]; + + half_values:[HalfValue]; + shared_half4s:[float]; + + buffer_refs:[BufferDescriptorMapValue]; + texture2d_refs:[Texture2DDescriptorMapValue]; + tensor_linear_refs:[TensorLinearDescriptorMapValue]; + tensor_refs:[TensorDescriptorMapValue]; + + buffer_objects:[BufferDescriptorMapValue]; + texture2d_objects:[Texture2DDescriptorMapValue]; + tensor_linear_objects:[TensorLinearDescriptorMapValue]; + tensor_objects:[TensorDescriptorMapValue]; +} + +enum CalculationsPrecision : byte { + F32 = 0, + F32_F16 = 1, + F16 = 2, +} + +enum TensorToGrid : byte { + CUSTOM = 0, + WB_TO_X_HD_TO_Y_S_TO_Z = 1, + WB_TO_X_HD_TO_Y_Z_IS_1 = 2, + WB_TO_X_H_TO_Y_D_TO_Z = 3, + B_TO_X_Y_IS_1_Z_IS_1 = 4, +} + +enum CompilerOptions : byte { + ADRENO_FULL_SIMD_LINE = 0, + ADRENO_MORE_WAVES = 1, + POWERVR_FP16 = 2, + CL_OPT_DISABLE = 3, + CL_2_0 = 4, + CL_3_0 = 5, +} + +table OperationDef { + precision:CalculationsPrecision; + src_tensors:[TensorDescriptor]; + dst_tensors:[TensorDescriptor]; +} + +table CompilerOption { + option:CompilerOptions; +} + +table GPUOperation { + arguments:Arguments; + code:string; + work_group_size:Int3; + 
compiler_options:[CompilerOption]; + tensor_to_grid:TensorToGrid; + elementwise:bool; + linkable:bool; + check_src_channels_size:bool; + definition:OperationDef; + grid_dimension:int32; + work_group_launch_order:Int3; + grid_size:Int3; + src_tensors_names:[string]; + dst_tensors_names:[string]; + work_groups_count:Int3; + linkable_count:int32; + elementwise_code:string; +} + +table TensorDescWithId { + desc:TensorDescriptor; + id:int32; +} + +table CLNode { + gpu_op:GPUOperation; + input_ids:[int32]; + output_ids:[int32]; + name:string; +} + +table PairOfValueIds { + first:int32; + second:int32; +} + +table InferenceContext { + need_flush:bool; + flush_periodically:bool; + flush_period:int32; + need_manual_release:bool; + precision:CalculationsPrecision; + storage_type:TensorStorageType; + nodes:[CLNode]; + tensors:[TensorDescWithId]; + input_ids:[int32]; + variable_ids_and_refs:[PairOfValueIds]; + output_ids:[int32]; +} + +root_type InferenceContext; diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.h b/tensorflow/lite/delegates/gpu/cl/serialization.h new file mode 100644 index 00000000000..1273e62a100 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/serialization.h @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_SERIALIZATION_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_SERIALIZATION_H_ + +#include "absl/types/span.h" +#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/cl/serialization_generated.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +namespace tflite { +namespace gpu { +namespace cl { + +class InferenceContext; + +flatbuffers::Offset Encode( + const InferenceContext& inference, flatbuffers::FlatBufferBuilder* builder); + +absl::Status Decode(CLContext* context, + const data::InferenceContext* fb_inference, + InferenceContext* inference); + +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_SERIALIZATION_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc index 72c53c5b1ac..c35554b875b 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc @@ -605,8 +605,11 @@ absl::Status Tensor::CreateFromDescriptor(const TensorDescriptor& desc, descriptor_.layout = desc.layout; memory_owner_ = true; CLMemory memory; - RETURN_IF_ERROR(AllocateTensorMemory(*context, shape_, descriptor_, - desc.data.data(), &memory)); + uint8_t* data_ptr = desc.data.empty() + ? 
nullptr + : const_cast(desc.data.data()); + RETURN_IF_ERROR( + AllocateTensorMemory(*context, shape_, descriptor_, data_ptr, &memory)); memory_ = memory.Release(); if (desc.storage_type == TensorStorageType::IMAGE_BUFFER) { RETURN_IF_ERROR(CreateImageBufferFromBuffer( diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc index 7bd5de6e31e..f31df43539e 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc @@ -771,6 +771,46 @@ void TensorDescriptor::UploadData(absl::Span src) { } } +bool TensorDescriptor::SupportsZeroClamp(const Axis& axis) const { + switch (storage_type) { + case TensorStorageType::UNKNOWN: + return false; + case TensorStorageType::BUFFER: + case TensorStorageType::IMAGE_BUFFER: + return false; + case TensorStorageType::TEXTURE_ARRAY: + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::SINGLE_TEXTURE_2D: + return axis == Axis::WIDTH || axis == Axis::HEIGHT; + case TensorStorageType::TEXTURE_3D: + return axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::DEPTH; + } +} + +bool TensorDescriptor::CanReadOutOfBorder(const Axis& axis) const { + switch (storage_type) { + case TensorStorageType::UNKNOWN: + return false; + case TensorStorageType::BUFFER: + return false; + case TensorStorageType::IMAGE_BUFFER: + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::TEXTURE_3D: + case TensorStorageType::SINGLE_TEXTURE_2D: + case TensorStorageType::TEXTURE_ARRAY: + return true; + } +} + +bool TensorDescriptor::IsLinear() const { + return storage_type == TensorStorageType::BUFFER || + storage_type == TensorStorageType::IMAGE_BUFFER; +} + +bool TensorDescriptor::ReturnsZeroForNegOneRead() const { + return storage_type == TensorStorageType::IMAGE_BUFFER; +} + namespace { int GetLinearIndex(const TensorDescriptor& desc, const BHWDC& shape, int b, int x, int y, int d, int s, int sub_c) { diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h index 094e3905966..2157bf05543 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h @@ -82,6 +82,16 @@ struct TensorDescriptor : public GPUObjectDescriptor { void UploadData(const tflite::gpu::Tensor& src); void UploadData(const tflite::gpu::Tensor& src); + bool SupportsZeroClamp(const Axis& axis) const; + bool CanReadOutOfBorder(const Axis& axis) const; + bool IsLinear() const; + + // applicable only for types that: IsLinear -> true. 
+ // In this case for address we have 1d component - addr (int) + // If for addr == -1 this linear storage type returns FLT4(0.0), this function + // returns true, otherwise false + bool ReturnsZeroForNegOneRead() const; + DataType data_type = DataType::UNKNOWN; TensorStorageType storage_type = TensorStorageType::UNKNOWN; // This field describes logical layout, actual(physical) GPU layout can be diff --git a/tensorflow/lite/delegates/gpu/cl/testing/BUILD b/tensorflow/lite/delegates/gpu/cl/testing/BUILD index c82190ca0e6..a14dfd72cfd 100644 --- a/tensorflow/lite/delegates/gpu/cl/testing/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/testing/BUILD @@ -3,20 +3,6 @@ package( licenses = ["notice"], # Apache 2.0 ) -cc_binary( - name = "performance_profiling", - srcs = ["performance_profiling.cc"], - deps = [ - "//tensorflow/lite/delegates/gpu/cl:environment", - "//tensorflow/lite/delegates/gpu/cl:inference_context", - "//tensorflow/lite/delegates/gpu/common:model", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common/testing:tflite_model_reader", - "//tensorflow/lite/kernels:builtin_ops", - "@com_google_absl//absl/time", - ], -) - cc_binary( name = "delegate_testing", srcs = ["delegate_testing.cc"], @@ -34,3 +20,38 @@ cc_binary( "@com_google_absl//absl/time", ], ) + +cc_binary( + name = "internal_api_samples", + srcs = ["internal_api_samples.cc"], + tags = [ + "nobuilder", + "notap", + ], + deps = [ + "//tensorflow/lite/delegates/gpu:api", + "//tensorflow/lite/delegates/gpu/cl:api", + "//tensorflow/lite/delegates/gpu/cl:environment", + "//tensorflow/lite/delegates/gpu/cl:inference_context", + "//tensorflow/lite/delegates/gpu/common:model", + "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common/testing:tflite_model_reader", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/kernels:kernel_util", + "@com_google_absl//absl/time", + ], +) + +cc_binary( + name = "performance_profiling", + srcs = ["performance_profiling.cc"], + deps = [ + "//tensorflow/lite/delegates/gpu/cl:environment", + "//tensorflow/lite/delegates/gpu/cl:inference_context", + "//tensorflow/lite/delegates/gpu/common:model", + "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common/testing:tflite_model_reader", + "//tensorflow/lite/kernels:builtin_ops", + "@com_google_absl//absl/time", + ], +) diff --git a/tensorflow/lite/delegates/gpu/cl/testing/delegate_testing.cc b/tensorflow/lite/delegates/gpu/cl/testing/delegate_testing.cc index 10b7ac34404..3a618e55c06 100644 --- a/tensorflow/lite/delegates/gpu/cl/testing/delegate_testing.cc +++ b/tensorflow/lite/delegates/gpu/cl/testing/delegate_testing.cc @@ -132,6 +132,7 @@ int main(int argc, char** argv) { options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY; options.inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE; options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION; + options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE; options.max_delegated_partitions = 1; auto* gpu_delegate = TfLiteGpuDelegateV2Create(&options); status = gpu_inference->ModifyGraphWithDelegate(gpu_delegate); diff --git a/tensorflow/lite/delegates/gpu/cl/testing/internal_api_samples.cc b/tensorflow/lite/delegates/gpu/cl/testing/internal_api_samples.cc new file mode 100644 index 00000000000..be297546709 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/testing/internal_api_samples.cc @@ -0,0 +1,453 @@ +/* Copyright 2020 The 
TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include // NOLINT(build/c++11) +#include +#include + +#include "absl/time/time.h" +#include "absl/types/span.h" +#include "tensorflow/lite/delegates/gpu/api.h" +#include "tensorflow/lite/delegates/gpu/cl/api.h" +#include "tensorflow/lite/delegates/gpu/cl/environment.h" +#include "tensorflow/lite/delegates/gpu/cl/inference_context.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/register.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { +void FillInputTensors(tflite::Interpreter* interpreter) { + for (int k = 0; k < interpreter->inputs().size(); ++k) { + TfLiteTensor* tensor_ptr = interpreter->tensor(interpreter->inputs()[k]); + const auto tensor_elements_count = tflite::NumElements(tensor_ptr); + if (tensor_ptr->type == kTfLiteFloat32) { + float* p = interpreter->typed_input_tensor(k); + for (int i = 0; i < tensor_elements_count; ++i) { + p[i] = std::sin(i); + } + } else { + std::cout << "Non-Float32 input/output tensors are not supported" + << std::endl; + } + } +} + +void CompareCPUGPUResults(tflite::Interpreter* cpu, + const std::vector& outputs, + const std::vector>& gpu, + float eps) { + for (int i = 0; i < gpu.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu->tensor(outputs[i]); + const float* cpu_out = tensor_ptr->data.f; + const float* gpu_out = gpu[i].data(); + const int kMaxPrint = 10; + int printed = 0; + int total_different = 0; + for (int k = 0; k < tensor_ptr->bytes / 4; ++k) { + const float abs_diff = fabs(cpu_out[k] - gpu_out[k]); + if (abs_diff > eps) { + total_different++; + if (printed < kMaxPrint) { + std::cout << "Output #" << i << ": element #" << k << ": CPU value - " + << cpu_out[k] << ", GPU value - " << gpu_out[k] + << ", abs diff - " << abs_diff << std::endl; + printed++; + } + if (printed == kMaxPrint) { + std::cout << "Printed " << kMaxPrint + << " different elements, threshold - " << eps + << ", next different elements skipped" << std::endl; + printed++; + } + } + } + std::cout << "Total " << total_different + << " different elements, for output #" << i << ", threshold - " + << eps << std::endl; + } +} +} // namespace + +absl::Status RunModelSampleWithInternalAPISerializedKernels( + const std::string& model_name, const std::vector& kernel_cache); + +absl::Status RunModelSampleWithInternalAPISerialized( + tflite::Interpreter* cpu, const std::vector& in_refs, + const std::vector& out_refs, + const std::vector& kernel_cache, + const std::vector& serialized_model); + +// Runs Jet with the OpenCL internal API and compares correctness with TFLite CPU. +absl::Status RunModelSampleWithInternalAPI(const std::string& model_name, + std::vector* kernel_cache) { + auto 
flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str()); + + ops::builtin::BuiltinOpResolver op_resolver; + InterpreterBuilder tfl_builder(*flatbuffer, op_resolver); + + // CPU. + std::unique_ptr cpu_inference; + tfl_builder(&cpu_inference); + if (!cpu_inference) { + return absl::InternalError("Failed to build CPU inference."); + } + auto status = cpu_inference->AllocateTensors(); + if (status != kTfLiteOk) { + return absl::InternalError("Failed to AllocateTensors for CPU inference."); + } + for (int k = 0; k < cpu_inference->inputs().size(); ++k) { + TfLiteTensor* tensor_ptr = + cpu_inference->tensor(cpu_inference->inputs()[k]); + if (tensor_ptr->type != kTfLiteFloat32) { + return absl::InvalidArgumentError( + "Internal api supports only F32 input tensors"); + } + } + for (int k = 0; k < cpu_inference->outputs().size(); ++k) { + TfLiteTensor* tensor_ptr = + cpu_inference->tensor(cpu_inference->outputs()[k]); + if (tensor_ptr->type != kTfLiteFloat32) { + return absl::InvalidArgumentError( + "Internal api supports only F32 output tensors"); + } + } + FillInputTensors(cpu_inference.get()); + status = cpu_inference->Invoke(); + if (status != kTfLiteOk) { + return absl::InternalError("Failed to Invoke CPU inference."); + } + + const auto start = std::chrono::high_resolution_clock::now(); + GraphFloat32 graph_cl; + RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl)); + + auto inputs = graph_cl.inputs(); + auto outputs = graph_cl.outputs(); + std::vector in_refs(inputs.size()); + std::vector out_refs(outputs.size()); + for (int i = 0; i < inputs.size(); ++i) { + in_refs[i] = inputs[i]->tensor.ref; + } + for (int i = 0; i < outputs.size(); ++i) { + out_refs[i] = outputs[i]->tensor.ref; + } + + Environment env; + RETURN_IF_ERROR(CreateEnvironment(&env)); + + std::unique_ptr inf_env; + // Initializes environment. + InferenceEnvironmentOptions env_options; + env_options.device = env.device().id(); + env_options.context = env.context().context(); + env_options.command_queue = env.queue()->queue(); + RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &inf_env, nullptr)); + + std::unique_ptr builder; + // Initializes builder. + InferenceOptions options; + options.priority1 = InferencePriority::MIN_LATENCY; + options.priority2 = InferencePriority::MIN_MEMORY_USAGE; + options.priority3 = InferencePriority::MAX_PRECISION; + options.usage = InferenceUsage::SUSTAINED_SPEED; + + RETURN_IF_ERROR( + inf_env->NewInferenceBuilder(options, std::move(graph_cl), &builder)); + + // Sets input/output object def for builder_. + ObjectDef obj_def; + obj_def.data_type = DataType::FLOAT32; + obj_def.data_layout = DataLayout::BHWC; + obj_def.object_type = ObjectType::CPU_MEMORY; + obj_def.user_provided = true; + for (int i = 0; i < in_refs.size(); ++i) { + RETURN_IF_ERROR(builder->SetInputObjectDef(i, obj_def)); + } + for (int i = 0; i < out_refs.size(); ++i) { + RETURN_IF_ERROR(builder->SetOutputObjectDef(i, obj_def)); + } + + std::unique_ptr<::tflite::gpu::InferenceRunner> runner; + // Builds runner. + RETURN_IF_ERROR(builder->Build(&runner)); + + const auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Initialization total time - " << (end - start).count() * 1e-6f + << "ms" << std::endl; + + if (kernel_cache) { + *kernel_cache = inf_env->GetSerializedBinaryCache(); + std::cout << "Kernel cache size - " << kernel_cache->size() << std::endl; + } + + // Sets the input/output object. 
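+ // The ObjectDef above declared user-provided CPU_MEMORY, so the CpuMemory
+ // handles below simply point the GPU runner at the TFLite interpreter's own
+ // input/output buffers; the caller keeps ownership of that memory.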
+ for (int i = 0; i < in_refs.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu_inference->tensor(in_refs[i]); + RETURN_IF_ERROR(runner->SetInputObject( + i, CpuMemory{tensor_ptr->data.data, tensor_ptr->bytes})); + } + + std::vector> output_tensors(out_refs.size()); + for (int i = 0; i < out_refs.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu_inference->tensor(out_refs[i]); + output_tensors[i].resize(tensor_ptr->bytes / 4); + RETURN_IF_ERROR(runner->SetOutputObject( + i, CpuMemory{output_tensors[i].data(), tensor_ptr->bytes})); + } + + RETURN_IF_ERROR(runner->Run()); + + CompareCPUGPUResults(cpu_inference.get(), out_refs, output_tensors, 1e-4f); + + return absl::OkStatus(); +} + +absl::Status RunModelSampleWithInternalAPISerializedKernels( + const std::string& model_name, const std::vector& kernel_cache) { + auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str()); + + ops::builtin::BuiltinOpResolver op_resolver; + InterpreterBuilder tfl_builder(*flatbuffer, op_resolver); + + // CPU. + std::unique_ptr cpu_inference; + tfl_builder(&cpu_inference); + if (!cpu_inference) { + return absl::InternalError("Failed to build CPU inference."); + } + auto status = cpu_inference->AllocateTensors(); + if (status != kTfLiteOk) { + return absl::InternalError("Failed to AllocateTensors for CPU inference."); + } + for (int k = 0; k < cpu_inference->inputs().size(); ++k) { + TfLiteTensor* tensor_ptr = + cpu_inference->tensor(cpu_inference->inputs()[k]); + if (tensor_ptr->type != kTfLiteFloat32) { + return absl::InvalidArgumentError( + "Internal api supports only F32 input tensors"); + } + } + for (int k = 0; k < cpu_inference->outputs().size(); ++k) { + TfLiteTensor* tensor_ptr = + cpu_inference->tensor(cpu_inference->outputs()[k]); + if (tensor_ptr->type != kTfLiteFloat32) { + return absl::InvalidArgumentError( + "Internal api supports only F32 output tensors"); + } + } + FillInputTensors(cpu_inference.get()); + status = cpu_inference->Invoke(); + if (status != kTfLiteOk) { + return absl::InternalError("Failed to Invoke CPU inference."); + } + + const auto start = std::chrono::high_resolution_clock::now(); + GraphFloat32 graph_cl; + RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl)); + + auto inputs = graph_cl.inputs(); + auto outputs = graph_cl.outputs(); + std::vector in_refs(inputs.size()); + std::vector out_refs(outputs.size()); + for (int i = 0; i < inputs.size(); ++i) { + in_refs[i] = inputs[i]->tensor.ref; + } + for (int i = 0; i < outputs.size(); ++i) { + out_refs[i] = outputs[i]->tensor.ref; + } + + Environment env; + RETURN_IF_ERROR(CreateEnvironment(&env)); + + std::unique_ptr inf_env; + // Initializes environment. 
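+ // Unlike the first run, serialized_binary_cache is filled in below with the
+ // kernel cache captured earlier, so previously compiled OpenCL programs can
+ // typically be reused instead of being recompiled from source.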
+ InferenceEnvironmentOptions env_options; + env_options.device = env.device().id(); + env_options.context = env.context().context(); + env_options.command_queue = env.queue()->queue(); + env_options.serialized_binary_cache = + absl::MakeSpan(kernel_cache.data(), kernel_cache.size()); + RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &inf_env, nullptr)); + + InferenceOptions options; + options.priority1 = InferencePriority::MIN_LATENCY; + options.priority2 = InferencePriority::MIN_MEMORY_USAGE; + options.priority3 = InferencePriority::MAX_PRECISION; + options.usage = InferenceUsage::SUSTAINED_SPEED; + + std::vector serialized_model; + RETURN_IF_ERROR(inf_env->BuildSerializedModel(options, std::move(graph_cl), + &serialized_model)); + std::unique_ptr builder; + RETURN_IF_ERROR(inf_env->NewInferenceBuilder(serialized_model, &builder)); + + // Sets input/output object def for builder_. + ObjectDef obj_def; + obj_def.data_type = DataType::FLOAT32; + obj_def.data_layout = DataLayout::BHWC; + obj_def.object_type = ObjectType::CPU_MEMORY; + obj_def.user_provided = true; + for (int i = 0; i < in_refs.size(); ++i) { + RETURN_IF_ERROR(builder->SetInputObjectDef(i, obj_def)); + } + for (int i = 0; i < out_refs.size(); ++i) { + RETURN_IF_ERROR(builder->SetOutputObjectDef(i, obj_def)); + } + + std::unique_ptr<::tflite::gpu::InferenceRunner> runner; + // Builds runner. + RETURN_IF_ERROR(builder->Build(&runner)); + + const auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Initialization total time(with kernel cache) - " + << (end - start).count() * 1e-6f << "ms" << std::endl; + + // Sets the input/output object. + for (int i = 0; i < in_refs.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu_inference->tensor(in_refs[i]); + RETURN_IF_ERROR(runner->SetInputObject( + i, CpuMemory{tensor_ptr->data.data, tensor_ptr->bytes})); + } + + std::vector> output_tensors(out_refs.size()); + for (int i = 0; i < out_refs.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu_inference->tensor(out_refs[i]); + output_tensors[i].resize(tensor_ptr->bytes / 4); + RETURN_IF_ERROR(runner->SetOutputObject( + i, CpuMemory{output_tensors[i].data(), tensor_ptr->bytes})); + } + + RETURN_IF_ERROR(runner->Run()); + + CompareCPUGPUResults(cpu_inference.get(), out_refs, output_tensors, 1e-4f); + + RETURN_IF_ERROR(RunModelSampleWithInternalAPISerialized( + cpu_inference.get(), in_refs, out_refs, kernel_cache, serialized_model)); + + return absl::OkStatus(); +} + +absl::Status RunModelSampleWithInternalAPISerialized( + tflite::Interpreter* cpu, const std::vector& in_refs, + const std::vector& out_refs, + const std::vector& kernel_cache, + const std::vector& serialized_model) { + FillInputTensors(cpu); + auto status = cpu->Invoke(); + if (status != kTfLiteOk) { + return absl::InternalError("Failed to Invoke CPU inference."); + } + + const auto start = std::chrono::high_resolution_clock::now(); + + Environment env; + RETURN_IF_ERROR(CreateEnvironment(&env)); + + std::unique_ptr inf_env; + // Initializes environment. + InferenceEnvironmentOptions env_options; + env_options.device = env.device().id(); + env_options.context = env.context().context(); + env_options.command_queue = env.queue()->queue(); + env_options.serialized_binary_cache = + absl::MakeSpan(kernel_cache.data(), kernel_cache.size()); + RETURN_IF_ERROR(NewInferenceEnvironment(env_options, &inf_env, nullptr)); + + std::unique_ptr builder; + RETURN_IF_ERROR(inf_env->NewInferenceBuilder(serialized_model, &builder)); + + // Sets input/output object def for builder_. 
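+ // ObjectDef describes how the caller exchanges data with the runner: FP32
+ // values in BHWC layout held in plain CPU memory, with user_provided = true
+ // meaning the buffers are supplied later via SetInputObject/SetOutputObject.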
+ ObjectDef obj_def; + obj_def.data_type = DataType::FLOAT32; + obj_def.data_layout = DataLayout::BHWC; + obj_def.object_type = ObjectType::CPU_MEMORY; + obj_def.user_provided = true; + for (int i = 0; i < in_refs.size(); ++i) { + RETURN_IF_ERROR(builder->SetInputObjectDef(i, obj_def)); + } + for (int i = 0; i < out_refs.size(); ++i) { + RETURN_IF_ERROR(builder->SetOutputObjectDef(i, obj_def)); + } + + std::unique_ptr<::tflite::gpu::InferenceRunner> runner; + // Builds runner. + RETURN_IF_ERROR(builder->Build(&runner)); + + const auto end = std::chrono::high_resolution_clock::now(); + std::cout << "Serialized initialization total time - " + << (end - start).count() * 1e-6f << "ms" << std::endl; + + // Sets the input/output object. + for (int i = 0; i < in_refs.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu->tensor(in_refs[i]); + RETURN_IF_ERROR(runner->SetInputObject( + i, CpuMemory{tensor_ptr->data.data, tensor_ptr->bytes})); + } + + std::vector> output_tensors(out_refs.size()); + for (int i = 0; i < out_refs.size(); ++i) { + TfLiteTensor* tensor_ptr = cpu->tensor(out_refs[i]); + output_tensors[i].resize(tensor_ptr->bytes / 4); + RETURN_IF_ERROR(runner->SetOutputObject( + i, CpuMemory{output_tensors[i].data(), tensor_ptr->bytes})); + } + + RETURN_IF_ERROR(runner->Run()); + + std::cout << "Comparing results second time:" << std::endl; + + CompareCPUGPUResults(cpu, out_refs, output_tensors, 1e-4f); + + return absl::OkStatus(); +} + +} // namespace cl +} // namespace gpu +} // namespace tflite + +int main(int argc, char** argv) { + if (argc <= 1) { + std::cerr << "Expected model path as second argument."; + return -1; + } + + auto load_status = tflite::gpu::cl::LoadOpenCL(); + if (!load_status.ok()) { + std::cerr << load_status.message(); + return -1; + } + + std::vector kernel_cache; + auto run_status = + tflite::gpu::cl::RunModelSampleWithInternalAPI(argv[1], &kernel_cache); + if (!run_status.ok()) { + std::cerr << run_status.message(); + return -1; + } + run_status = tflite::gpu::cl::RunModelSampleWithInternalAPISerializedKernels( + argv[1], kernel_cache); + if (!run_status.ok()) { + std::cerr << run_status.message(); + return -1; + } + + return EXIT_SUCCESS; +} diff --git a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc index ab2e52f14ed..540004ad746 100644 --- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc +++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc @@ -43,7 +43,7 @@ absl::Status RunModelSample(const std::string& model_name) { create_info.precision = env.IsSupported(CalculationsPrecision::F16) ? 
CalculationsPrecision::F16 : CalculationsPrecision::F32; - create_info.storage_type = GetFastestStorageType(env.device()); + create_info.storage_type = GetFastestStorageType(env.device().GetInfo()); create_info.hints.Add(ModelHints::kAllowSpecialKernels); std::cout << "Precision: " << ToString(create_info.precision) << std::endl; std::cout << "Storage type: " << ToString(create_info.storage_type) diff --git a/tensorflow/lite/delegates/gpu/cl/testing/run_delegate_testing.sh b/tensorflow/lite/delegates/gpu/cl/testing/run_delegate_testing.sh index 7b86407dbad..70d2a5cf3dc 100755 --- a/tensorflow/lite/delegates/gpu/cl/testing/run_delegate_testing.sh +++ b/tensorflow/lite/delegates/gpu/cl/testing/run_delegate_testing.sh @@ -78,11 +78,17 @@ ADB push "$model_path" "$OPENCL_DIR" declare -a BUILD_CONFIG abi_version=$(ADB shell getprop ro.product.cpu.abi | tr -d '\r') if [[ "$abi_version" == "armeabi-v7a" ]]; then -#"32 bit" +#"32 bit ARM" BUILD_CONFIG=( --config=android_arm -c opt --copt=-fPIE --linkopt=-pie ) -else -#"64 bit" +elif [[ "$abi_version" == "arm64-v8a" ]]; then +#"64 bit ARM" BUILD_CONFIG=( --config=android_arm64 -c opt ) +elif [[ "$abi_version" == "x86_64" ]]; then +# x86_64 +BUILD_CONFIG=( --config=android_x86_64 -c opt ) +else +echo "Error: Unknown processor ABI" +exit 1 fi bazel build "${BUILD_CONFIG[@]}" //$SHELL_DIR:$BINARY_NAME diff --git a/tensorflow/lite/delegates/gpu/cl/testing/run_internal_api_samples.sh b/tensorflow/lite/delegates/gpu/cl/testing/run_internal_api_samples.sh new file mode 100755 index 00000000000..21900c55875 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/testing/run_internal_api_samples.sh @@ -0,0 +1,101 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +shopt -s expand_aliases # to work with command aliases in .sh + +description="Example of internal API usage: +How to use: +[-h or --help, print instructions] +[-m or --model_path, path to the model in .tflite format] +[-d or --device, select device](optional, if you have several connected devices)" + +model_path="" +alias ADB='adb' +host="" + +while [[ "$1" != "" ]]; do + case $1 in + -m | --model_path) + shift + model_path=$1 + ;; + -d | --device) + shift + if [[ "$1" == "HOST" ]] + then + host="HOST" + fi + alias ADB='adb -s '$1'' + ;; + -h | --help) + echo "$description" + exit + ;; + esac + shift +done + +if [ "$model_path" = "" ] +then +echo "No model provided." 
+echo "$description" +exit +fi + +SHELL_DIR=$(dirname "$0") +BINARY_NAME=internal_api_samples + +if [[ "$host" == "HOST" ]] +then +bazel build -c opt --copt -DCL_DELEGATE_NO_GL //"$SHELL_DIR":"$BINARY_NAME" +chmod +x bazel-bin/"$SHELL_DIR"/"$BINARY_NAME" +./bazel-bin/"$SHELL_DIR"/"$BINARY_NAME" "$model_path" +exit +fi + +model_name=${model_path##*/} # finds last token after '/' + +OPENCL_DIR=/data/local/tmp/internal_api_samples/ + +ADB shell mkdir -p $OPENCL_DIR + +ADB push "$model_path" "$OPENCL_DIR" + +declare -a BUILD_CONFIG +abi_version=$(ADB shell getprop ro.product.cpu.abi | tr -d '\r') +if [[ "$abi_version" == "armeabi-v7a" ]]; then +#"32 bit ARM" +BUILD_CONFIG=( --config=android_arm -c opt --copt=-fPIE --linkopt=-pie ) +elif [[ "$abi_version" == "arm64-v8a" ]]; then +#"64 bit ARM" +BUILD_CONFIG=( --config=android_arm64 -c opt ) +elif [[ "$abi_version" == "x86_64" ]]; then +# x86_64 +BUILD_CONFIG=( --config=android_x86_64 -c opt ) +else +echo "Error: Unknown processor ABI" +exit 1 +fi + +bazel build "${BUILD_CONFIG[@]}" --copt -DCL_DELEGATE_NO_GL //$SHELL_DIR:$BINARY_NAME + +ADB push bazel-bin/$SHELL_DIR/$BINARY_NAME $OPENCL_DIR + +ADB shell chmod +x $OPENCL_DIR/$BINARY_NAME +ADB shell "cd $OPENCL_DIR && ./$BINARY_NAME $model_name" + +# clean up files from device +ADB shell rm -rf $OPENCL_DIR diff --git a/tensorflow/lite/delegates/gpu/cl/testing/run_performance_profiling.sh b/tensorflow/lite/delegates/gpu/cl/testing/run_performance_profiling.sh index 0fd2d33de14..56d1e1010ed 100755 --- a/tensorflow/lite/delegates/gpu/cl/testing/run_performance_profiling.sh +++ b/tensorflow/lite/delegates/gpu/cl/testing/run_performance_profiling.sh @@ -83,11 +83,17 @@ ADB push "$model_path" "$OPENCL_DIR" declare -a BUILD_CONFIG abi_version=$(ADB shell getprop ro.product.cpu.abi | tr -d '\r') if [[ "$abi_version" == "armeabi-v7a" ]]; then -#"32 bit" +#"32 bit ARM" BUILD_CONFIG=( --config=android_arm -c opt --copt=-fPIE --linkopt=-pie ) -else -#"64 bit" +elif [[ "$abi_version" == "arm64-v8a" ]]; then +#"64 bit ARM" BUILD_CONFIG=( --config=android_arm64 -c opt ) +elif [[ "$abi_version" == "x86_64" ]]; then +# x86_64 +BUILD_CONFIG=( --config=android_x86_64 -c opt ) +else +echo "Error: Unknown processor ABI" +exit 1 fi bazel build "${BUILD_CONFIG[@]}" //$SHELL_DIR:$BINARY_NAME diff --git a/tensorflow/lite/delegates/gpu/cl/texture2d.cc b/tensorflow/lite/delegates/gpu/cl/texture2d.cc index 28d26f03260..77cc7c9353c 100644 --- a/tensorflow/lite/delegates/gpu/cl/texture2d.cc +++ b/tensorflow/lite/delegates/gpu/cl/texture2d.cc @@ -24,10 +24,9 @@ namespace { absl::Status CreateTexture2D(int width, int height, DataType type, void* data, CLContext* context, Texture2D* result) { cl_mem texture; - RETURN_IF_ERROR(CreateFloatRGBAImage2D(context->context(), width, height, - type, data, &texture)); - cl_channel_type channel_type = - type == DataType::FLOAT32 ? 
CL_FLOAT : CL_HALF_FLOAT; + cl_channel_type channel_type = DataTypeToChannelType(type); + RETURN_IF_ERROR(CreateRGBAImage2D(context->context(), width, height, + channel_type, data, &texture)); *result = Texture2D(texture, width, height, channel_type); return absl::OkStatus(); @@ -37,6 +36,8 @@ absl::Status CreateTexture2D(int width, int height, DataType type, void* data, Texture2DDescriptor::Texture2DDescriptor(Texture2DDescriptor&& desc) : GPUObjectDescriptor(std::move(desc)), element_type(desc.element_type), + normalized(desc.normalized), + normalized_type(desc.normalized_type), size(desc.size), data(std::move(desc.data)) {} @@ -44,6 +45,8 @@ Texture2DDescriptor& Texture2DDescriptor::operator=( Texture2DDescriptor&& desc) { if (this != &desc) { std::swap(element_type, desc.element_type); + std::swap(normalized, desc.normalized); + std::swap(normalized_type, desc.normalized_type); std::swap(size, desc.size); data = std::move(desc.data); GPUObjectDescriptor::operator=(std::move(desc)); @@ -80,8 +83,38 @@ absl::Status Texture2DDescriptor::PerformReadSelector( absl::StrCat("Texture2DDescriptor Read require two arguments, but ", args.size(), " was passed")); } - const std::string read = - element_type == DataType::FLOAT16 ? "read_imageh" : "read_imagef"; + std::string read; + switch (element_type) { + case DataType::FLOAT32: + read = "read_imagef"; + break; + case DataType::FLOAT16: + read = "read_imageh"; + break; + case DataType::INT8: + case DataType::INT16: + case DataType::INT32: + if (normalized) { + read = normalized_type == DataType::FLOAT16 ? "read_imageh" + : "read_imagef"; + } else { + read = "read_imagei"; + } + break; + case DataType::UINT8: + case DataType::UINT16: + case DataType::UINT32: + if (normalized) { + read = normalized_type == DataType::FLOAT16 ? "read_imageh" + : "read_imagef"; + } else { + read = "read_imageui"; + } + break; + default: + read = "unknown_type"; + break; + } *result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0], ", " + args[1] + "))"); return absl::OkStatus(); @@ -145,13 +178,12 @@ absl::Status Texture2D::CreateFromTexture2DDescriptor( const Texture2DDescriptor& desc, CLContext* context) { width_ = desc.size.x; height_ = desc.size.y; - channel_type_ = - desc.element_type == DataType::FLOAT32 ? CL_FLOAT : CL_HALF_FLOAT; + channel_type_ = DataTypeToChannelType(desc.element_type, desc.normalized); uint8_t* data_ptr = desc.data.empty() ? nullptr : const_cast(desc.data.data()); - return CreateFloatRGBAImage2D(context->context(), desc.size.x, desc.size.y, - desc.element_type, data_ptr, &texture_); + return CreateRGBAImage2D(context->context(), desc.size.x, desc.size.y, + channel_type_, data_ptr, &texture_); } // Creates new 4-channel 2D texture with f32 elements diff --git a/tensorflow/lite/delegates/gpu/cl/texture2d.h b/tensorflow/lite/delegates/gpu/cl/texture2d.h index 51e0fc7e42c..15864305f21 100644 --- a/tensorflow/lite/delegates/gpu/cl/texture2d.h +++ b/tensorflow/lite/delegates/gpu/cl/texture2d.h @@ -32,7 +32,11 @@ namespace gpu { namespace cl { struct Texture2DDescriptor : public GPUObjectDescriptor { - DataType element_type; // FLOAT32 or FLOAT16 + DataType element_type; + bool normalized = false; // used with INT data types, if normalized, we read + // in kernel float data. 
+ DataType normalized_type; // can be FLOAT32 or FLOAT16, using with normalized + // = true // optional int2 size = int2(0, 0); diff --git a/tensorflow/lite/delegates/gpu/cl/util.cc b/tensorflow/lite/delegates/gpu/cl/util.cc index 199e0129968..d0e65537519 100644 --- a/tensorflow/lite/delegates/gpu/cl/util.cc +++ b/tensorflow/lite/delegates/gpu/cl/util.cc @@ -184,8 +184,32 @@ absl::Status CreateCLBuffer(cl_context context, int size_in_bytes, return absl::OkStatus(); } -absl::Status CreateFloatRGBAImage2D(cl_context context, int width, int height, - DataType type, void* data, cl_mem* result) { +cl_channel_type DataTypeToChannelType(DataType type, bool normalized) { + switch (type) { + case DataType::FLOAT32: + return CL_FLOAT; + case DataType::FLOAT16: + return CL_HALF_FLOAT; + case DataType::INT8: + return normalized ? CL_SNORM_INT8 : CL_SIGNED_INT8; + case DataType::UINT8: + return normalized ? CL_UNORM_INT8 : CL_UNSIGNED_INT8; + case DataType::INT16: + return normalized ? CL_SNORM_INT16 : CL_SIGNED_INT16; + case DataType::UINT16: + return normalized ? CL_UNORM_INT16 : CL_UNSIGNED_INT16; + case DataType::INT32: + return CL_SIGNED_INT32; + case DataType::UINT32: + return CL_UNSIGNED_INT32; + default: + return CL_FLOAT; + } +} + +absl::Status CreateRGBAImage2D(cl_context context, int width, int height, + cl_channel_type channel_type, void* data, + cl_mem* result) { cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_width = width; @@ -199,8 +223,7 @@ absl::Status CreateFloatRGBAImage2D(cl_context context, int width, int height, cl_image_format format; format.image_channel_order = CL_RGBA; - format.image_channel_data_type = - type == DataType::FLOAT32 ? CL_FLOAT : CL_HALF_FLOAT; + format.image_channel_data_type = channel_type; cl_mem_flags flags = CL_MEM_READ_WRITE; if (data) { diff --git a/tensorflow/lite/delegates/gpu/cl/util.h b/tensorflow/lite/delegates/gpu/cl/util.h index 8e22c017fe7..54a6c74a3ff 100644 --- a/tensorflow/lite/delegates/gpu/cl/util.h +++ b/tensorflow/lite/delegates/gpu/cl/util.h @@ -52,8 +52,10 @@ void CopyLinearFLT4(const tflite::gpu::Tensor& src, absl::Status CreateCLBuffer(cl_context context, int size_in_bytes, bool read_only, void* data, cl_mem* result); -absl::Status CreateFloatRGBAImage2D(cl_context context, int width, int height, - DataType type, void* data, cl_mem* result); +cl_channel_type DataTypeToChannelType(DataType type, bool normalized = false); +absl::Status CreateRGBAImage2D(cl_context context, int width, int height, + cl_channel_type channel_type, void* data, + cl_mem* result); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/common/BUILD b/tensorflow/lite/delegates/gpu/common/BUILD index 60a0fda422c..99d915f0ed2 100644 --- a/tensorflow/lite/delegates/gpu/common/BUILD +++ b/tensorflow/lite/delegates/gpu/common/BUILD @@ -10,6 +10,7 @@ cc_library( srcs = ["convert.cc"], hdrs = ["convert.h"], deps = [ + ":data_type", ":shape", ":status", ":tensor", @@ -22,7 +23,10 @@ cc_library( ) exports_files( - ["custom_parsers.h"], + [ + "custom_parsers.h", + "custom_transformations.h", + ], visibility = ["//tensorflow/lite/delegates/gpu/common:__subpackages__"], ) @@ -73,6 +77,7 @@ cc_library( ":types", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", ], ) @@ -81,11 +86,11 @@ cc_library( srcs = ["model.cc"], hdrs = ["model.h"], deps = [ - ":data_type", ":shape", ":status", ":tensor", "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", 
"@com_google_absl//absl/strings", "@com_google_absl//absl/types:any", "@com_google_absl//absl/types:optional", @@ -97,16 +102,44 @@ cc_test( srcs = ["model_test.cc"], deps = [ ":model", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) +cc_library( + name = "lstm_parser", + srcs = ["lstm_parser.cc"], + hdrs = ["lstm_parser.h"], + deps = [ + ":data_type", + ":model", + ":model_builder_helper", + ":object_reader", + ":operations", + ":shape", + ":status", + ":tensor", + "//tensorflow/lite:string", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:lstm_shared", + "//tensorflow/lite/kernels/internal:quantization_util", + "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/kernels/internal:tensor_utils", + "//tensorflow/lite/kernels/internal:types", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:any", + ], +) + cc_library( name = "model_builder", srcs = ["model_builder.cc"], hdrs = ["model_builder.h"], deps = [ ":data_type", + ":lstm_parser", ":model", ":model_builder_helper", ":model_transformer", @@ -115,14 +148,15 @@ cc_library( ":shape", ":status", ":tensor", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "//tensorflow/lite/delegates:utils", - "//tensorflow/lite:context", "//tensorflow/lite:kernel_api", "//tensorflow/lite:util", "//tensorflow/lite/c:common", - "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", + "//tensorflow/lite/delegates/gpu/common/transformations:model_transformations", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/kernels/internal:reference_base", "//tensorflow/lite/kernels/internal:tensor", @@ -133,10 +167,14 @@ cc_test( name = "model_builder_test", srcs = ["model_builder_test.cc"], deps = [ + ":data_type", ":model_builder", + ":shape", + ":tensor", "//tensorflow/lite:framework", "//tensorflow/lite:kernel_api", "//tensorflow/lite/c:common", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) @@ -152,10 +190,8 @@ cc_library( ":shape", ":status", ":tensor", - "//tensorflow/lite:context", "//tensorflow/lite:kernel_api", "//tensorflow/lite/c:common", - "//tensorflow/lite/delegates:utils", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/kernels/internal:reference_base", "//tensorflow/lite/kernels/internal:tensor", @@ -186,10 +222,12 @@ cc_library( ":model", ":model_builder_helper", ":status", + ":tensor", "//tensorflow/lite/c:common", "//tensorflow/lite/delegates:utils", "//tensorflow/lite/kernels:kernel_util", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", ], ) @@ -199,9 +237,9 @@ cc_library( hdrs = ["operations.h"], deps = [ ":data_type", - ":model", ":shape", ":status", + ":tensor", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:variant", ], @@ -213,11 +251,12 @@ cc_library( hdrs = ["quantization_util.h"], deps = [ ":status", - "//tensorflow/lite:kernel_api", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:optimized_base", + "//tensorflow/lite/kernels/internal:tensor", "//tensorflow/lite/kernels/internal:types", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", ], ) @@ -227,6 +266,9 @@ cc_test( deps = [ ":quantization_util", "//tensorflow/lite:util", + "//tensorflow/lite/c:common", + 
"@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) @@ -237,10 +279,7 @@ cc_library( name = "shape", srcs = ["shape.cc"], hdrs = ["shape.h"], - deps = [ - "@com_google_absl//absl/hash", - "@com_google_absl//absl/strings", - ], + deps = ["@com_google_absl//absl/strings"], ) cc_test( @@ -288,6 +327,9 @@ cc_test( srcs = ["memory_management_test.cc"], deps = [ ":memory_management", + ":shape", + ":types", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) @@ -334,9 +376,5 @@ cc_library( name = "workgroup_selection", srcs = ["workgroup_selection.cc"], hdrs = ["workgroup_selection.h"], - deps = [ - ":status", - ":types", - ":util", - ], + deps = [":util"], ) diff --git a/tensorflow/lite/delegates/gpu/common/convert.cc b/tensorflow/lite/delegates/gpu/common/convert.cc index fb0caf9f167..3920692bdca 100644 --- a/tensorflow/lite/delegates/gpu/common/convert.cc +++ b/tensorflow/lite/delegates/gpu/common/convert.cc @@ -15,9 +15,19 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/convert.h" +#include +#include + +#include +#include + #include #include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/gpu/common/types.h" #include "tensorflow/lite/delegates/gpu/common/util.h" diff --git a/tensorflow/lite/delegates/gpu/common/convert.h b/tensorflow/lite/delegates/gpu/common/convert.h index 3aba9c913c5..c7a6c17380a 100644 --- a/tensorflow/lite/delegates/gpu/common/convert.h +++ b/tensorflow/lite/delegates/gpu/common/convert.h @@ -16,9 +16,12 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CONVERT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CONVERT_H_ +#include + #include #include "absl/types/span.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" diff --git a/tensorflow/lite/delegates/gpu/common/custom_parsers.h b/tensorflow/lite/delegates/gpu/common/custom_parsers.h index d70e5849315..2644864cb58 100644 --- a/tensorflow/lite/delegates/gpu/common/custom_parsers.h +++ b/tensorflow/lite/delegates/gpu/common/custom_parsers.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CUSTOM_PARSERS_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CUSTOM_PARSERS_H_ -#include +#include #include "absl/strings/string_view.h" #include "absl/types/any.h" diff --git a/tensorflow/lite/delegates/gpu/common/custom_transformations.h b/tensorflow/lite/delegates/gpu/common/custom_transformations.h new file mode 100644 index 00000000000..3ca73a0d245 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/custom_transformations.h @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CUSTOM_TRANSFORMATIONS_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CUSTOM_TRANSFORMATIONS_H_ + +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" + +namespace tflite { +namespace gpu { + +// Applies all implemented custom model transformations. +bool ApplyCustomTransformations(ModelTransformer* transformer); + +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_CUSTOM_TRANSFORMATIONS_H_ diff --git a/tensorflow/lite/delegates/gpu/common/default/BUILD b/tensorflow/lite/delegates/gpu/common/default/BUILD index b085f68fcfb..91ce7e6c028 100644 --- a/tensorflow/lite/delegates/gpu/common/default/BUILD +++ b/tensorflow/lite/delegates/gpu/common/default/BUILD @@ -14,3 +14,12 @@ cc_library( "@com_google_absl//absl/types:any", ], ) + +cc_library( + name = "custom_transformations", + srcs = ["custom_transformations.cc"], + hdrs = ["//tensorflow/lite/delegates/gpu/common:custom_transformations.h"], + deps = [ + "//tensorflow/lite/delegates/gpu/common:model_transformer", + ], +) diff --git a/tensorflow/lite/delegates/gpu/common/default/custom_parsers.cc b/tensorflow/lite/delegates/gpu/common/default/custom_parsers.cc index 5aa1303d55c..a4981a9d459 100644 --- a/tensorflow/lite/delegates/gpu/common/default/custom_parsers.cc +++ b/tensorflow/lite/delegates/gpu/common/default/custom_parsers.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/custom_parsers.h" +#include + #include #include "absl/strings/str_cat.h" diff --git a/tensorflow/lite/delegates/gpu/common/default/custom_transformations.cc b/tensorflow/lite/delegates/gpu/common/default/custom_transformations.cc new file mode 100644 index 00000000000..c57b9276068 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/default/custom_transformations.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/common/custom_transformations.h" + +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" + +namespace tflite { +namespace gpu { + +bool ApplyCustomTransformations(ModelTransformer* transformer) { return true; } + +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.cc b/tensorflow/lite/delegates/gpu/common/gpu_info.cc index 14fb48a2d2d..b56745df971 100644 --- a/tensorflow/lite/delegates/gpu/common/gpu_info.cc +++ b/tensorflow/lite/delegates/gpu/common/gpu_info.cc @@ -15,8 +15,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/gpu_info.h" -#include -#include #include #include "absl/strings/ascii.h" diff --git a/tensorflow/lite/delegates/gpu/common/lstm_parser.cc b/tensorflow/lite/delegates/gpu/common/lstm_parser.cc new file mode 100644 index 00000000000..bd84559fd54 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/lstm_parser.cc @@ -0,0 +1,551 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/common/lstm_parser.h" + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" +#include "absl/types/any.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_builder_helper.h" +#include "tensorflow/lite/delegates/gpu/common/object_reader.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor.h" +#include "tensorflow/lite/kernels/internal/tensor_utils.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/lstm_shared.h" +#include "tensorflow/lite/string_type.h" + +namespace tflite { +namespace gpu { +namespace { + +Value* CreateNewSimilarValue(GraphFloat32* graph, const Value* old_value) { + Value* new_value = graph->NewValue(); + new_value->quant_params = old_value->quant_params; + new_value->tensor.shape = old_value->tensor.shape; + new_value->tensor.type = old_value->tensor.type; + new_value->tensor.ref = -1; + return new_value; +} + +absl::Status SetFullyConnectedWeights(int weights_tensor_id, + ObjectReader* reader, + FullyConnectedAttributes* attr) { + Tensor weights; + RETURN_IF_ERROR(reader->ReadTensor(weights_tensor_id, &weights)); + attr->weights.data = std::move(weights.data); + attr->weights.id = weights.id; + 
attr->weights.shape.o = weights.shape.h; + attr->weights.shape.h = 1; + attr->weights.shape.w = 1; + attr->weights.shape.i = weights.shape.w; + return absl::OkStatus(); +} + +bool HasTensor(const TfLiteNode* node, const int index) { + return (index < node->inputs->size) && + (node->inputs->data[index] != kTfLiteOptionalTensor); +} + +bool HasCifg(const TfLiteNode* node) { + return !HasTensor( + node, tflite::ops::builtin::lstm::full::kInputToInputWeightsTensor); +} + +bool HasPeephole(const TfLiteNode* node) { + // Use forget weights to detect peephole instead of input weights as input + // weights may be missing for cifg. + return HasTensor( + node, tflite::ops::builtin::lstm::full::kCellToForgetWeightsTensor); +} + +bool HasNormalization(const TfLiteNode* node) { + return HasTensor( + node, + tflite::ops::builtin::lstm::full::kForgetLayerNormCoefficientsTensor); +} + +bool HasProjection(const TfLiteNode* node) { + return HasTensor(node, + tflite::ops::builtin::lstm::full::kProjectionWeightsTensor); +} + +// Builds subgraph for a single LSTM gate. +// Returns a Value representing the gate's output. +// High-level parameters: +// - Has normalization (if true: provide normalization weights). +// - Has peephole connection (if true: provide peephole weights). +// - Which activation function to use. +// Note: no support for aux input. +// +// Implements the following: +// (*: matrix multiply, .*: elementwise multiply, +: elementwise add): +// temp = input_weights * input_tensor + recurrent_weights * output_state; +// if (peephole): +// temp += peephole_weights .* cell_state; +// if (layer normalization): +// gate = activate(normalization_weights .* mean_stddev_norm(temp) + bias); +// else: +// gate = activate(temp + bias); +// +absl::Status BuildLstmGate(GraphFloat32* graph, ObjectReader* reader, + Value* output_state, Value* cell_state, + int input_weight_id, int recurrent_weight_id, + int cell_weight_id, int bias_id, + int normalization_weight_id, + const TfLiteFusedActivation activation, + bool has_peephole, bool has_normalization, + Value** gate_out) { + Value* input_times_weights = CreateNewSimilarValue(graph, cell_state); + { + // #1 matrix multiplication: input_weights * input_tensor + // If has no normalization, also adds bias. 
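// (With layer normalization the bias cannot be folded in here: it has to be
// applied after the mean/stddev normalization and re-scaling, which happens
// in node #8 below.)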
+ Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::FULLY_CONNECTED); + FullyConnectedAttributes fc_attr; + RETURN_IF_ERROR( + SetFullyConnectedWeights(input_weight_id, reader, &fc_attr)); + if (!has_normalization) { + RETURN_IF_ERROR(reader->ReadTensor(bias_id, &(fc_attr.bias))); + } + node->operation.attributes = std::move(fc_attr); + RETURN_IF_ERROR( + reader->AddInput(node, tflite::ops::builtin::lstm::full::kInputTensor)); + RETURN_IF_ERROR(graph->SetProducer(node->id, input_times_weights->id)); + } + + Value* output_state_times_weights = CreateNewSimilarValue(graph, cell_state); + { + // #2 matrix multiplication: recurrent_weights * output_state + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::FULLY_CONNECTED); + FullyConnectedAttributes fc_attr; + RETURN_IF_ERROR( + SetFullyConnectedWeights(recurrent_weight_id, reader, &fc_attr)); + node->operation.attributes = std::move(fc_attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, output_state->id)); + RETURN_IF_ERROR( + graph->SetProducer(node->id, output_state_times_weights->id)); + } + + Value* cell_state_times_weights; + if (has_peephole) { + // #3 elementwise multiplication: cell_weight .* cell_state + cell_state_times_weights = CreateNewSimilarValue(graph, cell_state); + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MUL); + ElementwiseAttributes attr; + Tensor weights; + RETURN_IF_ERROR(reader->ReadTensor(cell_weight_id, &weights)); + attr.param = std::move(weights); + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, cell_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, cell_state_times_weights->id)); + } + + Value* gate_before_normalization = CreateNewSimilarValue(graph, cell_state); + Node* add_node = graph->NewNode(); + { + // #4 elementwise addition: #1 + #2 + #3 + add_node->operation.type = ToString(OperationType::ADD); + RETURN_IF_ERROR(graph->AddConsumer(add_node->id, input_times_weights->id)); + RETURN_IF_ERROR( + graph->AddConsumer(add_node->id, output_state_times_weights->id)); + if (has_peephole) { + RETURN_IF_ERROR( + graph->AddConsumer(add_node->id, cell_state_times_weights->id)); + } + RETURN_IF_ERROR( + graph->SetProducer(add_node->id, gate_before_normalization->id)); + } + + if (!has_normalization) { + // #5 Activation function: activate(temp + bias) + // Bias is added in node #1. 
+ RETURN_IF_ERROR(MaybeFuseActivation(activation, graph, add_node)); + *gate_out = gate_before_normalization; + return absl::OkStatus(); + } + + Value* normalized_gate = + CreateNewSimilarValue(graph, gate_before_normalization); + { + // #6 Normalization: normalize(temp) + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MEAN_STDDEV_NORMALIZATION); + RETURN_IF_ERROR( + graph->AddConsumer(node->id, gate_before_normalization->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, normalized_gate->id)); + } + Value* reweighted_normalized_gate = + CreateNewSimilarValue(graph, normalized_gate); + { + // #7 Elementwise multiplication: norm_weights .* #6 + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MUL); + ElementwiseAttributes attr; + Tensor norm_weights; + RETURN_IF_ERROR(reader->ReadTensor(normalization_weight_id, &norm_weights)); + attr.param = std::move(norm_weights); + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, normalized_gate->id)); + RETURN_IF_ERROR( + graph->SetProducer(node->id, reweighted_normalized_gate->id)); + } + Value* gate = CreateNewSimilarValue(graph, reweighted_normalized_gate); + { + // #8 Elementwise add: #7 + bias + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::ADD); + ElementwiseAttributes attr; + Tensor bias; + RETURN_IF_ERROR(reader->ReadTensor(bias_id, &bias)); + attr.param = std::move(bias); + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR( + graph->AddConsumer(node->id, reweighted_normalized_gate->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, gate->id)); + + // #9: Activation function + RETURN_IF_ERROR(MaybeFuseActivation(activation, graph, node)); + } + *gate_out = gate; + return absl::OkStatus(); +} + +// Builds subgraph for LSTM cell state update. +// Returns a Value representing the updated cell state. +// High-level parameters: +// - clip: if > 0, clamp the resulting cell state to [-clip, +clip]. +// +// Implements the following: +// (*: matrix multiply, .*: elementwise multiply, +: elementwise add): +// +// cell_state_new = clip(forget_gate .* cell_state + input_gate .* cell_gate); +// +absl::Status BuildCellStateUpdate(GraphFloat32* graph, ObjectReader* reader, + Value* forget_gate, Value* input_gate, + Value* cell_gate, float cell_clip, + Value** cell_state_new) { + Value* cell_state; + RETURN_IF_ERROR(reader->ReadValue( + tflite::ops::builtin::lstm::full::kCellStateTensor, &cell_state)); + Value* cell_state_contrib = CreateNewSimilarValue(graph, cell_gate); + { + // #1 elementwise multiplication: forget_gate .* cell_state + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MUL); + RETURN_IF_ERROR(graph->AddConsumer(node->id, forget_gate->id)); + RETURN_IF_ERROR(graph->AddConsumer(node->id, cell_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, cell_state_contrib->id)); + } + Value* cell_gate_contrib = CreateNewSimilarValue(graph, cell_gate); + { + // #2 elementwise multiplication: input_gate .* cell_gate + // Note, with CIFG input_gate is equal to 1-forget_gate. 
+ Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MUL); + RETURN_IF_ERROR(graph->AddConsumer(node->id, input_gate->id)); + RETURN_IF_ERROR(graph->AddConsumer(node->id, cell_gate->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, cell_gate_contrib->id)); + } + Value* new_cell_state = CreateNewSimilarValue(graph, cell_gate); + { + // #3 elementwise add: #1 + #2 + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::ADD); + RETURN_IF_ERROR(graph->AddConsumer(node->id, cell_state_contrib->id)); + RETURN_IF_ERROR(graph->AddConsumer(node->id, cell_gate_contrib->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, new_cell_state->id)); + } + + if (cell_clip <= 0.0f) { + *cell_state_new = new_cell_state; + return absl::OkStatus(); + } + + Value* max_clipped_state = CreateNewSimilarValue(graph, new_cell_state); + { + // #4 elementwise minimum: min(#3, clip) + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MINIMUM); + ElementwiseAttributes attr; + attr.param = cell_clip; + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, new_cell_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, max_clipped_state->id)); + } + Value* clipped_cell_state = CreateNewSimilarValue(graph, max_clipped_state); + { + // #5 elementwise maximum: max(#4, -clip) + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MAXIMUM); + ElementwiseAttributes attr; + attr.param = -cell_clip; + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, max_clipped_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, clipped_cell_state->id)); + } + *cell_state_new = clipped_cell_state; + return absl::OkStatus(); +} + +// Build subgraph for LSTM output state update. +// Returns value representing the updated output state. +// High-level parameters: +// - Has projection (if true, provide projection_weights). +// - Has projection bias (only with projection). +// - clip: clamp the projection output to [-clip, clip]. +// - Which activation function to use. +// Note the updated output state does not depend on the old output state +// directly, only through the output gate. 
+// +// Implements the following: +// (*: matrix multiply, .*: elementwise multiply, +: elementwise add): +// +// temp = output_gate .* activate(cell_state); +// if (projection): +// output_state_new = clip(projection_weights * temp + projection_bias); +// else: +// output_state_new = temp; +// +absl::Status BuildOutputStateUpdate(GraphFloat32* graph, ObjectReader* reader, + Value* output_state, Value* output_gate, + Value* cell_state, + TfLiteFusedActivation activation, + bool has_projection, float proj_clip, + Value** output_state_new) { + Value* activated_state = CreateNewSimilarValue(graph, cell_state); + { + // #1 activation: activate(cell_state) + Node* node = graph->NewNode(); + switch (activation) { + case kTfLiteActTanh: + node->operation.type = ToString(OperationType::TANH); + break; + case kTfLiteActSigmoid: + node->operation.type = ToString(OperationType::SIGMOID); + break; + default: + return absl::InvalidArgumentError( + absl::StrCat("Unsupported activation: ", activation)); + } + RETURN_IF_ERROR(graph->AddConsumer(node->id, cell_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, activated_state->id)); + } + + Value* new_output_state = CreateNewSimilarValue(graph, cell_state); + { + // #2 elementwise multiplication: output_gate .* #1 + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MUL); + RETURN_IF_ERROR(graph->AddConsumer(node->id, activated_state->id)); + RETURN_IF_ERROR(graph->AddConsumer(node->id, output_gate->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, new_output_state->id)); + } + + if (!has_projection) { + *output_state_new = new_output_state; + return absl::OkStatus(); + } + + Value* projected_output_state = CreateNewSimilarValue(graph, output_state); + { + // #3 matrix multiplication: projection_weights * #2 + projection_bias + Node* node = graph->NewNode(); + FullyConnectedAttributes fc_attr; + RETURN_IF_ERROR(SetFullyConnectedWeights( + tflite::ops::builtin::lstm::full::kProjectionWeightsTensor, reader, + &fc_attr)); + // Projection bias is optional + reader + ->ReadTensor(tflite::ops::builtin::lstm::full::kProjectionBiasTensor, + &(fc_attr.bias)) + .IgnoreError(); + node->operation.attributes = std::move(fc_attr); + node->operation.type = ToString(OperationType::FULLY_CONNECTED); + RETURN_IF_ERROR(graph->AddConsumer(node->id, new_output_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, projected_output_state->id)); + } + + if (proj_clip <= 0.0f) { + *output_state_new = projected_output_state; + return absl::OkStatus(); + } + + Value* max_clipped_state = + CreateNewSimilarValue(graph, projected_output_state); + { + // #4 elementwise minimum: min(#3, clip) + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MINIMUM); + ElementwiseAttributes attr; + attr.param = proj_clip; + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, projected_output_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, max_clipped_state->id)); + } + Value* clipped_output_state = CreateNewSimilarValue(graph, max_clipped_state); + { + // #5 elementwise maximum: max(#4, -clip) + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MAXIMUM); + ElementwiseAttributes attr; + attr.param = -proj_clip; + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, max_clipped_state->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, clipped_output_state->id)); + } + *output_state_new = 
clipped_output_state; + return absl::OkStatus(); +} + +} // namespace + +// Build subgraph for a single LSTM OP. +// Returns a mapping for the used variable tensors' updated Values. +// +// High-level parameters: +// - Has CIFG: +// If false, calculate input_gate regularly. +// If true, calculate input_gate to 1-forget_gate. +// - Has peephole: see BuildLstmGate. Applies to all gates. +// - Has normalization: see BuildLstmGate. Applies to all gates. +// - Has projection, projection_bias, proj_clip: see BuildOutputStateUpdate +// - Which activation to use: +// Applies to only cell gate and output state update. +// Other gates always use Sigmoid. +// +absl::Status ParseLSTMAttributes( + const TfLiteNode* tflite_node, const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader, const TfLiteLSTMParams* params, + absl::flat_hash_map* new_variable_input_values) { + const bool has_cifg = HasCifg(tflite_node); + const bool has_peephole = HasPeephole(tflite_node); + const bool has_normalization = HasNormalization(tflite_node); + const bool has_projection = HasProjection(tflite_node); + + Value* old_cell_state; + RETURN_IF_ERROR(reader->ReadValue( + tflite::ops::builtin::lstm::full::kCellStateTensor, &old_cell_state)); + + if (old_cell_state->tensor.shape.b != 1) { + return absl::InvalidArgumentError( + "Batched execution is not supported for LSTM"); + } + + Value* old_output_state; + RETURN_IF_ERROR(reader->ReadValue( + tflite::ops::builtin::lstm::full::kOutputStateTensor, &old_output_state)); + + Value* forget_gate; + RETURN_IF_ERROR(BuildLstmGate( + graph, reader, old_output_state, old_cell_state, + tflite::ops::builtin::lstm::full::kInputToForgetWeightsTensor, + tflite::ops::builtin::lstm::full::kRecurrentToForgetWeightsTensor, + tflite::ops::builtin::lstm::full::kCellToForgetWeightsTensor, + tflite::ops::builtin::lstm::full::kForgetGateBiasTensor, + tflite::ops::builtin::lstm::full::kForgetLayerNormCoefficientsTensor, + kTfLiteActSigmoid, has_peephole, has_normalization, &forget_gate)); + + Value* input_gate; + if (has_cifg) { + // When using cifg, input_gate is computed as (1 - forget_gate). 
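// The SUB node below uses the scalar constant 1.0f as the first operand;
// runtime_tensor_is_second = true marks the runtime tensor (forget_gate) as
// the second operand, so the node computes 1.0f - forget_gate rather than
// forget_gate - 1.0f.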
+ Node* node = graph->NewNode(); + input_gate = CreateNewSimilarValue(graph, forget_gate); + + node->operation.type = ToString(OperationType::SUB); + ElementwiseAttributes attr; + attr.param = 1.0f; + attr.runtime_tensor_is_second = true; + node->operation.attributes = std::move(attr); + RETURN_IF_ERROR(graph->AddConsumer(node->id, forget_gate->id)); + RETURN_IF_ERROR(graph->SetProducer(node->id, input_gate->id)); + } else { + RETURN_IF_ERROR(BuildLstmGate( + graph, reader, old_output_state, old_cell_state, + tflite::ops::builtin::lstm::full::kInputToInputWeightsTensor, + tflite::ops::builtin::lstm::full::kRecurrentToInputWeightsTensor, + tflite::ops::builtin::lstm::full::kCellToInputWeightsTensor, + tflite::ops::builtin::lstm::full::kInputGateBiasTensor, + tflite::ops::builtin::lstm::full::kInputLayerNormCoefficientsTensor, + kTfLiteActSigmoid, has_peephole, has_normalization, &input_gate)); + } + + // Cell state will not have peephole connections to itself + Value* cell_gate; + RETURN_IF_ERROR(BuildLstmGate( + graph, reader, old_output_state, old_cell_state, + tflite::ops::builtin::lstm::full::kInputToCellWeightsTensor, + tflite::ops::builtin::lstm::full::kRecurrentToCellWeightsTensor, + /*cell_weight_id=*/-1, + tflite::ops::builtin::lstm::full::kCellGateBiasTensor, + tflite::ops::builtin::lstm::full::kCellLayerNormCoefficientsTensor, + params->activation, /*has_peephole=*/false, has_normalization, + &cell_gate)); + + Value* new_cell_state; + RETURN_IF_ERROR(BuildCellStateUpdate(graph, reader, forget_gate, input_gate, + cell_gate, params->cell_clip, + &new_cell_state)); + + Value* output_gate; + RETURN_IF_ERROR(BuildLstmGate( + graph, reader, old_output_state, new_cell_state, + tflite::ops::builtin::lstm::full::kInputToOutputWeightsTensor, + tflite::ops::builtin::lstm::full::kRecurrentToOutputWeightsTensor, + tflite::ops::builtin::lstm::full::kCellToOutputWeightsTensor, + tflite::ops::builtin::lstm::full::kOutputGateBiasTensor, + tflite::ops::builtin::lstm::full::kOutputLayerNormCoefficientsTensor, + kTfLiteActSigmoid, has_peephole, has_normalization, &output_gate)); + + Value* new_output_state; + RETURN_IF_ERROR(BuildOutputStateUpdate(graph, reader, old_output_state, + output_gate, new_cell_state, + params->activation, has_projection, + params->proj_clip, &new_output_state)); + + { + // Copy updated output state to output. + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::COPY); + RETURN_IF_ERROR(graph->AddConsumer(node->id, new_output_state->id)); + RETURN_IF_ERROR(reader->AddOutput( + node, tflite::ops::builtin::lstm::full::kOutputTensor)); + } + + new_variable_input_values->clear(); + new_variable_input_values->emplace( + tflite::ops::builtin::lstm::full::kCellStateTensor, new_cell_state->id); + new_variable_input_values->emplace( + tflite::ops::builtin::lstm::full::kOutputStateTensor, + new_output_state->id); + return absl::OkStatus(); +} + +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/lstm_parser.h b/tensorflow/lite/delegates/gpu/common/lstm_parser.h new file mode 100644 index 00000000000..b7c32371abc --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/lstm_parser.h @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_LSTM_PARSER_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_LSTM_PARSER_H_ + +#include "absl/container/flat_hash_map.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/object_reader.h" + +namespace tflite { +namespace gpu { + +absl::Status ParseLSTMAttributes( + const TfLiteNode* tflite_node, const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader, const TfLiteLSTMParams* params, + absl::flat_hash_map* new_variable_input_values); +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_LSTM_PARSER_H_ diff --git a/tensorflow/lite/delegates/gpu/common/memory_management.cc b/tensorflow/lite/delegates/gpu/common/memory_management.cc index d7e6a060eb2..2a637d54016 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management.cc @@ -15,17 +15,21 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management.h" -#include -#include +#include #include -#include -#include -#include +#include #include +#include "tensorflow/lite/delegates/gpu/common/memory_management/equality_assignment.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_in_order_assignment.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/naive_assignment.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management.h b/tensorflow/lite/delegates/gpu/common/memory_management.h index 7df4947ee3d..9f1adcebd7f 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management.h @@ -16,16 +16,12 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_H_ -#include -#include +#include + #include #include "absl/memory/memory.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/equality_assignment.h" -#include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h" -#include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h" -#include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_in_order_assignment.h" -#include "tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/naive_assignment.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/equality_assignment.h b/tensorflow/lite/delegates/gpu/common/memory_management/equality_assignment.h index fdccce5159f..018e5a95b51 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/equality_assignment.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/equality_assignment.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_EQUALITY_ASSIGNMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_EQUALITY_ASSIGNMENT_H_ +#include + +#include #include #include diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.cc b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.cc index 2c138b4c14c..b07ab61a1a5 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.cc @@ -16,12 +16,13 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h" #include -#include -#include +#include #include #include +#include "absl/status/status.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/internal.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h index 47035229920..e207ab323b5 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_breadth_assignment.h @@ -16,7 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_GREEDY_BY_BREADTH_ASSIGNMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_GREEDY_BY_BREADTH_ASSIGNMENT_H_ -#include +#include + #include #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.cc b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.cc index 76309ce8f1b..130f27152cd 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.cc @@ -16,8 +16,13 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h" #include +#include +#include +#include +#include "absl/status/status.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/internal.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h index b0ad9d18911..198a25c7a57 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_GREEDY_BY_SIZE_ASSIGNMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_GREEDY_BY_SIZE_ASSIGNMENT_H_ +#include + #include #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_in_order_assignment.h b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_in_order_assignment.h index 8c3719e4a8b..048ed389700 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/greedy_in_order_assignment.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/greedy_in_order_assignment.h @@ -16,7 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_GREEDY_IN_ORDER_ASSIGNMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_GREEDY_IN_ORDER_ASSIGNMENT_H_ +#include + #include +#include +#include #include #include #include diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/internal.cc b/tensorflow/lite/delegates/gpu/common/memory_management/internal.cc index bbcd373287f..27126aa929f 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/internal.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/internal.cc @@ -16,6 +16,11 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/memory_management/internal.h" #include +#include +#include + +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/internal.h b/tensorflow/lite/delegates/gpu/common/memory_management/internal.h index 702fd2992cc..4d48f75da9f 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/internal.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/internal.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_INTERNAL_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_INTERNAL_H_ -#include +#include + #include -#include #include #include "absl/memory/memory.h" diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/internal_test.cc b/tensorflow/lite/delegates/gpu/common/memory_management/internal_test.cc index 757cb89b366..ed83e3c5109 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/internal_test.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/internal_test.cc @@ -15,8 +15,12 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management/internal.h" +#include +#include + #include #include +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.cc b/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.cc index 059c23fab33..c56ac2e391b 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.cc @@ -16,11 +16,15 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h" #include +#include +#include #include -#include +#include #include +#include "absl/status/status.h" #include "tensorflow/lite/delegates/gpu/common/memory_management/internal.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h b/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h index 1284c12c5c2..df734ad9ea4 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/min_cost_flow_assignment.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_MIN_COST_FLOW_ASSIGNMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_MIN_COST_FLOW_ASSIGNMENT_H_ +#include + #include #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/naive_assignment.h b/tensorflow/lite/delegates/gpu/common/memory_management/naive_assignment.h index 8a00c67d853..d700f62006c 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/naive_assignment.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/naive_assignment.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_NAIVE_ASSIGNMENT_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_NAIVE_ASSIGNMENT_H_ +#include + #include #include "tensorflow/lite/delegates/gpu/common/memory_management/internal.h" diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/types.cc b/tensorflow/lite/delegates/gpu/common/memory_management/types.cc index 5cec0cab4c4..101ca5316f1 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/types.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/types.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" #include -#include +#include #include #include diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/types.h b/tensorflow/lite/delegates/gpu/common/memory_management/types.h index a511152ed0b..f3257fcf5f8 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/types.h +++ b/tensorflow/lite/delegates/gpu/common/memory_management/types.h @@ -16,8 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_TYPES_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MEMORY_MANAGEMENT_TYPES_H_ -#include -#include +#include + +#include #include namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/memory_management/types_test.cc b/tensorflow/lite/delegates/gpu/common/memory_management/types_test.cc index 0312dc27877..22558ec8b94 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management/types_test.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management/types_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" +#include + #include #include diff --git a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc index 12f5b6ebe6c..ba951354d17 100644 --- a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc +++ b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc @@ -15,8 +15,15 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/memory_management.h" +#include +#include + #include #include +#include "absl/status/status.h" +#include "tensorflow/lite/delegates/gpu/common/memory_management/types.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/model.cc b/tensorflow/lite/delegates/gpu/common/model.cc index a2f9da428ba..696a747a817 100644 --- a/tensorflow/lite/delegates/gpu/common/model.cc +++ b/tensorflow/lite/delegates/gpu/common/model.cc @@ -15,7 +15,21 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/model.h" +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -32,6 +46,11 @@ std::vector GraphFloat32::inputs() const { return FilterValues([](const ValueDef& v) { return v.producer == nullptr; }); } +std::vector GraphFloat32::variable_inputs() const { + return FilterValues( + [](const ValueDef& v) { return v.value->tensor.is_variable_input; }); +} + std::vector GraphFloat32::outputs() const { return FilterValues([](const ValueDef& v) { return v.consumers.empty(); }); } @@ -397,19 +416,19 @@ absl::Status RemoveFollowingNode(GraphFloat32* graph, const Node* to_remove, return graph->DeleteNode(to_remove->id); } -absl::Status RemoveOneInputOneOutputNode(GraphFloat32* graph, - const Node* to_remove) { - auto inputs = graph->FindInputs(to_remove->id); - auto outputs = graph->FindOutputs(to_remove->id); +absl::Status RemoveSimpleNodeKeepInput(GraphFloat32* graph, + const Node* simple_node) { + const auto inputs = graph->FindInputs(simple_node->id); + const auto outputs = graph->FindOutputs(simple_node->id); if (inputs.size() != 1 || outputs.size() != 1) { - return absl::InvalidArgumentError( - "To_remove node must have 1 input and 1 output"); + return absl::FailedPreconditionError( + "simple_node node must have 1 input and 1 output"); } - auto input_id = inputs[0]->id; - auto output_id = outputs[0]->id; - Node* producer = graph->FindProducer(input_id); - auto consumers = graph->FindConsumers(output_id); - RETURN_IF_ERROR(graph->DeleteNode(to_remove->id)); + const auto input_id = inputs[0]->id; + const auto output_id = outputs[0]->id; + const Node* producer = graph->FindProducer(input_id); + const auto consumers = graph->FindConsumers(output_id); + RETURN_IF_ERROR(graph->DeleteNode(simple_node->id)); for (auto& consumer : consumers) { RETURN_IF_ERROR(graph->ReplaceInput(consumer->id, output_id, input_id)); } @@ -420,6 +439,38 @@ absl::Status RemoveOneInputOneOutputNode(GraphFloat32* graph, return absl::OkStatus(); } +absl::Status RemoveSimpleNodeKeepOutput(GraphFloat32* graph, + const Node* simple_node) { + const auto inputs = graph->FindInputs(simple_node->id); + const auto outputs = graph->FindOutputs(simple_node->id); + if (inputs.size() != 1 || outputs.size() != 1) { + return absl::FailedPreconditionError( + "simple_node must have 1 input and 1 output"); + } + const auto input_id = inputs[0]->id; + const auto output_id = outputs[0]->id; + const Node* producer = graph->FindProducer(input_id); + const auto input_consumers = graph->FindConsumers(input_id); + if (input_consumers.size() != 1) { + return absl::FailedPreconditionError( + "simple_node should be the only consumer on the node."); + } + + RETURN_IF_ERROR(graph->DeleteNode(simple_node->id)); + if (producer) { + RETURN_IF_ERROR(graph->RemoveProducer(input_id)); + RETURN_IF_ERROR(graph->SetProducer(producer->id, output_id)); + } + + RETURN_IF_ERROR(graph->DeleteValue(input_id)); + + const auto output_consumers = graph->FindConsumers(output_id); + if (!producer && output_consumers.empty()) { + RETURN_IF_ERROR(graph->DeleteValue(output_id)); + } + return absl::OkStatus(); +} + absl::Status AddOutput(GraphFloat32* graph, const Node* from_node, Value** output) { auto link = graph->NewValue(); 
@@ -430,14 +481,27 @@ absl::Status AddOutput(GraphFloat32* graph, const Node* from_node, absl::Status ConnectTwoNodes(GraphFloat32* graph, const Node* from_node, const Node* to_node, Value** output) { - Value* link; - RETURN_IF_ERROR(AddOutput(graph, from_node, &link)); - RETURN_IF_ERROR(graph->AddConsumer(to_node->id, link->id)); - *output = link; + const Node* output_producer = + *output ? graph->FindProducer((*output)->id) : nullptr; + // Output is already initialized, but producer is not from_node. + if (*output && output_producer && output_producer->id != from_node->id) { + return absl::InvalidArgumentError("Wrong output is passed."); + } + // Output is already initialized, and producer is from_node. + if (*output) { + RETURN_IF_ERROR(graph->AddConsumer(to_node->id, (*output)->id)); + } else { + // Output is not initialized. + Value* link; + RETURN_IF_ERROR(AddOutput(graph, from_node, &link)); + RETURN_IF_ERROR(graph->AddConsumer(to_node->id, link->id)); + *output = link; + } return absl::OkStatus(); } bool IsBatchMatchesForAllValues(const GraphFloat32& model) { + if (model.values().empty()) return true; const int32_t b = model.values()[0]->tensor.shape.b; for (auto value : model.values()) { if (value->tensor.shape.b != b) { diff --git a/tensorflow/lite/delegates/gpu/common/model.h b/tensorflow/lite/delegates/gpu/common/model.h index f6d160977f9..2e9aac8c53c 100644 --- a/tensorflow/lite/delegates/gpu/common/model.h +++ b/tensorflow/lite/delegates/gpu/common/model.h @@ -24,10 +24,8 @@ limitations under the License. #include #include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" #include "absl/types/any.h" #include "absl/types/optional.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" @@ -92,6 +90,9 @@ class GraphFloat32 { // @return graph outputs, that are values without consumers. std::vector outputs() const; + // @return values updated in place with a previously defined tensor reference. + std::vector variable_inputs() const; + // @return inputs into the given node. Returns empty vector for deleted node. std::vector FindInputs(NodeId id) const; @@ -235,18 +236,28 @@ absl::Status RemovePrecedingNode(GraphFloat32* graph, const Node* to_remove, absl::Status RemoveFollowingNode(GraphFloat32* graph, const Node* to_remove, const Node* to_keep); -// Removes to_remove node. -// Requires that node has one input and one output; -absl::Status RemoveOneInputOneOutputNode(GraphFloat32* graph, - const Node* to_remove); +// Removes simple_node and its output value from the graph. Node is considered +// simple if it has only one input and one output value. Input value is kept. +absl::Status RemoveSimpleNodeKeepInput(GraphFloat32* graph, + const Node* simple_node); + +// Removes simple_node and its input value from the graph. Node is considered +// simple if it has only one input and one output value. Output value is kept. +// simple_node should be an exclusive consumer of its input value. +absl::Status RemoveSimpleNodeKeepOutput(GraphFloat32* graph, + const Node* simple_node); absl::Status AddOutput(GraphFloat32* graph, const Node* from_node, Value** output); +// Makes a direct connection between from_node and to_node. All input parameters +// except output are expected to be initialized before passing to the function. 
+// If from_node already has an output value, which is not yet consumed by +// to_node, it may be passed as output parameter. absl::Status ConnectTwoNodes(GraphFloat32* graph, const Node* from_node, const Node* to_node, Value** output); -// @return true if all tensors have same batch value. +// @return true if all tensors have same batch value or if model has no values. bool IsBatchMatchesForAllValues(const GraphFloat32& model); } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index efa75a244bf..c200f0926aa 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -27,7 +27,9 @@ limitations under the License. #include #include +#include "absl/base/attributes.h" #include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" @@ -36,6 +38,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/gpu/common/custom_parsers.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/lstm_parser.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_builder_helper.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" @@ -44,7 +47,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h" +#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h" #include "tensorflow/lite/delegates/utils.h" #include "tensorflow/lite/kernels/internal/reference/dequantize.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" @@ -81,6 +84,13 @@ class TFLiteOperationParser { virtual absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) = 0; + + // Return the value ids in the graph that correspond to the updated values of + // the variable input tensor. + virtual absl::flat_hash_map + GetNewValueIdsForVariableInputNodes() { + return absl::flat_hash_map(); + } }; HW ToHW(int32_t h, int32_t w) { return HW(h > 0 ? h : 1, w > 0 ? 
w : 1); } @@ -298,6 +308,27 @@ class AddOperationParser : public TFLiteOperationParser { } }; +class BatchedMatMulOperationParser : public TFLiteOperationParser { + public: + absl::Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + return CheckInputsOutputs(context, tflite_node, + /*runtime_inputs=*/2, /*outputs=*/1); + } + + absl::Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::BATCHED_MATMUL); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddInput(node, 1)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + return absl::OkStatus(); + } +}; + class ConcatenationOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, @@ -420,7 +451,7 @@ class Conv2DOperationParser : public TFLiteOperationParser { absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 3)); + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 5)); const int runtime_inputs = GetNumberOfRuntimeInputsForNode(context, tflite_node); if (runtime_inputs > 2) { @@ -475,55 +506,28 @@ class Conv2DOperationParser : public TFLiteOperationParser { } }; -class Convolution2DTransposeBiasParser : public TFLiteOperationParser { - public: - absl::Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - const TfLiteTransposeConvParams* tf_options; - RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); - RETURN_IF_ERROR( - CheckStrides(tf_options->stride_height, tf_options->stride_width)); - return absl::OkStatus(); - } - - absl::Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, - GraphFloat32* graph, ObjectReader* reader) final { - auto* node = graph->NewNode(); - node->operation.type = ToString(OperationType::CONVOLUTION_TRANSPOSED); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - - const TfLiteTransposeConvParams* tf_options; - auto status = RetrieveCustomInitialData(tflite_node, &tf_options); - - ConvolutionTransposedAttributes attr; - attr.stride = status.ok() - ? HW(tf_options->stride_height, tf_options->stride_width) - : HW(1, 1); - - RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); - reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional - - UpdatePadding(status.ok() ? 
tf_options->padding : kTfLitePaddingUnknown, - graph->FindInputs(node->id)[0]->tensor.shape, &attr); - - node->operation.attributes = std::move(attr); - return absl::OkStatus(); - } -}; - class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 3)); - RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, - /*runtime_inputs=*/1, /*outputs=*/1)); - RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 6)); + const int runtime_inputs = + GetNumberOfRuntimeInputsForNode(context, tflite_node); + if (runtime_inputs > 2) { + return absl::InternalError( + absl::StrCat("Expected 1 or 2 input tensor(s), but node has ", + runtime_inputs, " runtime inputs.")); + } + const int runtime_outputs = NumOutputs(tflite_node); + if (runtime_outputs != 1) { + return absl::InternalError( + absl::StrCat("Expected 1 output tensor(s), but node has ", + runtime_outputs, " runtime outputs.")); + } + if (runtime_inputs == 1) { + RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + } const TfLiteDepthwiseConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckStridesAndDilation( @@ -577,7 +581,12 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddOutputs(node)); DepthwiseConvolution2DAttributes attr; - RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); + const int runtime_inputs = reader->GetNumberOfRuntimeInputs(); + if (runtime_inputs == 2) { + RETURN_IF_ERROR(reader->AddInput(node, 1)); + } else { // runtime_inputs == 1; + RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); + } reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional const TfLiteDepthwiseConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); @@ -797,8 +806,10 @@ class ElementwiseOperationParser : public TFLiteOperationParser { case OperationType::ABS: case OperationType::COPY: case OperationType::COS: + case OperationType::ELU: case OperationType::EXP: case OperationType::LOG: + case OperationType::NEG: case OperationType::RSQRT: case OperationType::SIGMOID: case OperationType::SIN: @@ -814,6 +825,8 @@ class ElementwiseOperationParser : public TFLiteOperationParser { bool IsTwoArgumentOperation() const { switch (operation_type_) { case OperationType::DIV: + case OperationType::MAXIMUM: + case OperationType::MINIMUM: case OperationType::POW: case OperationType::SQUARED_DIFF: case OperationType::SUB: @@ -825,8 +838,11 @@ class ElementwiseOperationParser : public TFLiteOperationParser { bool IsTwoArgumentOperationWithConst() const { switch (operation_type_) { - case OperationType::MINIMUM: + case OperationType::DIV: case OperationType::MAXIMUM: + case OperationType::MINIMUM: + case OperationType::POW: + case OperationType::SQUARED_DIFF: case OperationType::SUB: return true; default: @@ -850,6 +866,10 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { return absl::UnimplementedError( "Unsupported FullyConnected weights format."); } + if (GetNumberOfRuntimeInputsForNode(context, tflite_node) > 2) { + return absl::UnimplementedError( + "FullyConnected doesn't support more than 2 runtime inputs."); + } // TODO(eignasheva): check input shape return absl::OkStatus(); } @@ -857,11 
+877,31 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { absl::Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { + const TfLiteFullyConnectedParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + + if (reader->GetNumberOfRuntimeInputs() == 2) { + // Create Convolution2D, so as it supports runtime weights. + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::CONVOLUTION_2D); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddInput(node, 1)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + Convolution2DAttributes attr; + reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional + + attr.strides = HW(1, 1); + attr.dilations = HW(1, 1); + attr.padding.appended = HW(0, 0); + attr.padding.prepended = HW(0, 0); + RETURN_IF_ERROR(MaybeFuseActivation(tf_options->activation, graph, node)); + node->operation.attributes = std::move(attr); + return absl::OkStatus(); + } Node* node = graph->NewNode(); RETURN_IF_ERROR(reader->AddInput(node, 0)); - const TfLiteFullyConnectedParams* tf_options; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->weights_format != kTfLiteFullyConnectedWeightsFormatDefault) { return absl::UnimplementedError( @@ -870,13 +910,11 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { FullyConnectedAttributes attr; RETURN_IF_ERROR(GetFullyConnectedAttributes(1, 2, reader, &attr)); + const int weights_width = attr.weights.shape.i; - Tensor weights; - RETURN_IF_ERROR(reader->ReadTensor(1, &weights)); auto input = graph->FindInputs(node->id)[0]; int batch_size = input->tensor.shape.b; - if (input->tensor.shape.DimensionsProduct() / batch_size != - weights.shape.w) { + if (input->tensor.shape.DimensionsProduct() / batch_size != weights_width) { return absl::UnimplementedError( "Amount of input data should match weights width"); } @@ -888,7 +926,7 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { Value* reshaped_value = graph->NewValue(); reshaped_value->tensor.type = DataType::FLOAT32; reshaped_value->tensor.shape = - BHWC(input->tensor.shape.b, 1, 1, weights.shape.w); + BHWC(input->tensor.shape.b, 1, 1, weights_width); RETURN_IF_ERROR(graph->SetProducer(reshape->id, reshaped_value->id)); reshape->operation.type = ToString(OperationType::RESHAPE); ReshapeAttributes attr; @@ -943,18 +981,39 @@ class HardSwishOperationParser : public TFLiteOperationParser { // / \ // new_state1 activation0 // +// For full LSTM cells, see this blog post: +// https://colah.github.io/posts/2015-08-Understanding-LSTMs/ +// In addition to Peephole connections and Combined Input Forget Gates (CIFG) +// described in that post, this code also adds the following optional features: +// - Configurable activations (sigmoid or TANH) +// - L2 Normalization of gates: https://arxiv.org/abs/1607.06450 +// - Output projection: +// https://www.isca-speech.org/archive/interspeech_2014/i14_0338.html +// - Configurable clipping of cell state and output state. 
class LSTMOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 3)); const TfLiteLSTMParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); switch (tf_options->kernel_type) { - case kTfLiteLSTMFullKernel: - // TODO(b/157166356): Add check for input/output tensor counts. + case kTfLiteLSTMFullKernel: { + const int inputs = NumInputs(tflite_node); + if (inputs != 20 && inputs != 24) { + return absl::InternalError( + absl::StrCat("Expected 20 or 24 input tensors, but node has ", + inputs, " input(s).")); + } + const int runtime_outputs = NumOutputs(tflite_node); + if (runtime_outputs != 1) { + return absl::InternalError( + absl::StrCat("Expected 1 output tensor, but node has ", + runtime_outputs, " output(s).")); + } return CheckFullParameters(tf_options); + } case kTfLiteLSTMBasicKernel: RETURN_IF_ERROR( CheckInputsConstsOutputs(context, tflite_node, /*runtime_inputs=*/3, @@ -976,6 +1035,11 @@ class LSTMOperationParser : public TFLiteOperationParser { } } + absl::flat_hash_map GetNewValueIdsForVariableInputNodes() + final { + return new_variable_input_value_map_; + } + private: absl::Status ParseBasic(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, @@ -1048,14 +1112,24 @@ class LSTMOperationParser : public TFLiteOperationParser { const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader, const TfLiteLSTMParams* tf_options) { - return absl::UnimplementedError( - "Full LSTM support is not yet implemented."); + // Invoke full LSTM parser + RETURN_IF_ERROR(ParseLSTMAttributes(tflite_node, registration, graph, + reader, tf_options, + &new_variable_input_value_map_)); + return absl::OkStatus(); } absl::Status CheckFullParameters(const TfLiteLSTMParams* tf_options) { - return absl::UnimplementedError( - "Full LSTM support is not yet implemented."); + if (tf_options->activation != kTfLiteActSigmoid && + tf_options->activation != kTfLiteActTanh) { + return absl::UnimplementedError( + "Only sigmoid or tanh activation is supported."); + } + + return absl::OkStatus(); } + + absl::flat_hash_map new_variable_input_value_map_; }; class MulOperationParser : public TFLiteOperationParser { @@ -1067,8 +1141,11 @@ class MulOperationParser : public TFLiteOperationParser { if (tflite_node->inputs->size != 2) { return absl::UnimplementedError("MUL requires two input tensors."); } - auto input0 = tflite::GetInput(context, tflite_node, 0); - auto input1 = tflite::GetInput(context, tflite_node, 1); + const TfLiteTensor* input0 = GetInput(context, tflite_node, 0); + const TfLiteTensor* input1 = GetInput(context, tflite_node, 1); + if (input0 == nullptr || input1 == nullptr) { + return absl::InvalidArgumentError("At least one input tensor is null"); + } if (input0->dims->size == input1->dims->size) { // this code checks that at least one input of Mul not smaller in all // dimensions. Sometimes Mul used for matrix-vector multiplication that we @@ -1099,7 +1176,6 @@ class MulOperationParser : public TFLiteOperationParser { absl::Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { - // Determine runtime/constant tensors. 
const TfLiteTensor* input0 = reader->GetInputTensor(0); if (!input0) { return absl::InvalidArgumentError( @@ -1120,10 +1196,22 @@ class MulOperationParser : public TFLiteOperationParser { Node* node = graph->NewNode(); node->operation.type = ToString(OperationType::MUL); + RETURN_IF_ERROR(reader->AddOutputs(node)); - // The "larger" input tensor must be bound to 1st input and the "smaller" - // input tensor ("mask") must be bound to 2nd input. + // Determine runtime/constant tensors. if (runtime_tensor0 && runtime_tensor1) { + if (input0 == input1) { + // replace MUL(A, A) with POW(A, 2.0) + // TODO(b/166831113): Support the same inputs for operations. + node->operation.type = ToString(OperationType::POW); + ElementwiseAttributes attr; + attr.param = 2.0f; + node->operation.attributes = std::move(attr); + return reader->AddInput(node, 0); + } + + // The "larger" input tensor must be bound to 1st input and the "smaller" + // input tensor must be bound to 2nd input. BHWC shape0; RETURN_IF_ERROR(ExtractTensorShape(*input0, &shape0)); BHWC shape1; @@ -1135,57 +1223,78 @@ class MulOperationParser : public TFLiteOperationParser { input_tensor0 = 1; input_tensor1 = 0; } - RETURN_IF_ERROR( - ParseApplyMask(node, input_tensor0, input_tensor1, graph, reader)); + RETURN_IF_ERROR(reader->AddInput(node, input_tensor0)); + RETURN_IF_ERROR(reader->AddInput(node, input_tensor1)); } else { - // The runtime input tensor must be bound to 1st input and the constant - // input tensor must be bound to 2nd input. - int runtime_tensor = 0; - int constant_tensor = 1; - TfLiteIntArray* constant_dims = input1->dims; - if (constant_tensor0 && runtime_tensor1) { - runtime_tensor = 1; - constant_tensor = 0; - constant_dims = input0->dims; - } - RETURN_IF_ERROR(ParseMultiplyScalar(node, runtime_tensor, constant_tensor, - constant_dims, graph, reader)); + ElementwiseAttributes attr; + RETURN_IF_ERROR(ParseInputsWithConstTensor(node, reader, &attr.param)); + node->operation.attributes = std::move(attr); } const TfLiteMulParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); return MaybeFuseActivation(tf_options->activation, graph, node); } +}; - private: - absl::Status ParseApplyMask(Node* node, int input_tensor0, int input_tensor1, - GraphFloat32* graph, ObjectReader* reader) { - RETURN_IF_ERROR(reader->AddInput(node, input_tensor0)); - RETURN_IF_ERROR(reader->AddInput(node, input_tensor1)); - return reader->AddOutputs(node); +class PackOperationParser : public TFLiteOperationParser { + public: + absl::Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + const TfLitePackParams* tf_options; + return RetrieveBuiltinData(tflite_node, &tf_options); } - absl::Status ParseMultiplyScalar(Node* node, int runtime_tensor, - int constant_tensor, - const TfLiteIntArray* constant_dims, - GraphFloat32* graph, ObjectReader* reader) { - RETURN_IF_ERROR(reader->AddInput(node, runtime_tensor)); - ElementwiseAttributes attr; - if (constant_dims->size <= 0 || NumElements(constant_dims) == 1) { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = tensor.data[0]; - } else if (constant_dims->size == 3) { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = std::move(tensor); + absl::Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader) final { + if 
(tflite_node->inputs->size == 1) { + // Pack with single input can be replaced with Reshape + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::RESHAPE); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + // New shape comes from output shape. + ReshapeAttributes attr; + attr.new_shape = graph->FindOutputs(node->id)[0]->tensor.shape; + node->operation.attributes = attr; + return absl::OkStatus(); } else { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = std::move(tensor); + // Pack with few inputs can be replaced with Concat + const TfLitePackParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + + // Read inputs first to make sure const node is added to a graph before + // concat node to ensure topological order. + std::vector inputs; + for (uint32_t idx = 0; idx < tflite_node->inputs->size; ++idx) { + Value* value; + const auto status = reader->ReadValue(idx, &value); + if (status.ok()) { + inputs.push_back(value); + } else { + TensorFloat32 tensor; + RETURN_IF_ERROR(reader->ReadTensor(idx, &tensor)); + Value* value; + RETURN_IF_ERROR(NewConstNode(std::move(tensor), graph, &value)); + inputs.push_back(value); + } + } + + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::CONCAT); + RETURN_IF_ERROR(reader->AddOutputs(node)); + for (const Value* input : inputs) { + RETURN_IF_ERROR(graph->AddConsumer(node->id, input->id)); + } + const TfLiteTensor* output = reader->GetOutputTensor(0); + ConcatAttributes attr; + RETURN_IF_ERROR( + ExtractAxisFromIndex(*output, tf_options->axis, &attr.axis)); + node->operation.attributes = attr; + return absl::OkStatus(); } - node->operation.attributes = std::move(attr); - return reader->AddOutputs(node); } }; @@ -1251,7 +1360,10 @@ class PadOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1)); RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - auto pad_tensor = tflite::GetInput(context, tflite_node, 1); + const TfLiteTensor* pad_tensor = GetInput(context, tflite_node, 1); + if (pad_tensor == nullptr) { + return absl::InvalidArgumentError("Padding tensor was null"); + } if (pad_tensor->dims->size != 2) { return absl::InvalidArgumentError(absl::StrCat( "Invalid paddings tensor dimension: expected 2 dim, got ", @@ -1381,6 +1493,52 @@ class Pooling2DOperationParser : public TFLiteOperationParser { const PoolingType type_; }; +class ReduceOperationParser : public TFLiteOperationParser { + public: + explicit ReduceOperationParser(OperationType operation_type) + : operation_type_(operation_type) {} + + absl::Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, + /*runtime_inputs=*/1, /*outputs=*/1)); + auto* axes = &context->tensors[tflite_node->inputs->data[1]]; + if (axes->allocation_type != kTfLiteMmapRo || axes->type != kTfLiteInt32) { + return absl::UnimplementedError( + "Reduce has unsupported tensor for axes."); + } + if (tflite::NumElements(axes) != 1) { + return absl::UnimplementedError( + "Supported reduce in single dimensions only."); + } + return absl::OkStatus(); + } + + absl::Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader) final { + Node* node = 
graph->NewNode(); + node->operation.type = ToString(operation_type_); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + const TfLiteReducerParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + + Tensor axes; + RETURN_IF_ERROR(reader->ReadTensor(1, &axes)); + const TfLiteTensor* input = reader->GetInputTensor(0); + ReduceAttributes attr; + RETURN_IF_ERROR(ExtractAxisFromIndex(*input, axes.data[0], &attr.axis)); + node->operation.attributes = attr; + return absl::OkStatus(); + } + + private: + const OperationType operation_type_; +}; + class QuantizeOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, @@ -1594,6 +1752,15 @@ class SliceOperationParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); + if (tflite_node->inputs->size < 3) { + return absl::UnimplementedError("SLICE requires 3 inputs."); + } + const TfLiteTensor* input = GetInput(context, tflite_node, 0); + if (input->dims->size != 3 && input->dims->size != 4) { + return absl::UnimplementedError( + "SLICE supports for 3 or 4 dimensional tensors only."); + } + return absl::OkStatus(); } @@ -1607,6 +1774,9 @@ class SliceOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->ReadValue(0, &input)); RETURN_IF_ERROR(graph->AddConsumer(node->id, input->id)); + const TfLiteTensor* tfl_input = reader->GetInputTensor(0); + const int input_dims = tfl_input->dims->size; + SliceAttributes attr; attr.strides = BHWC(1, 1, 1, 1); Tensor starts, sizes; @@ -1615,36 +1785,65 @@ class SliceOperationParser : public TFLiteOperationParser { if (starts.data.size() != sizes.data.size()) { return absl::InvalidArgumentError("Starts amount != sizes amount."); } - const auto& in_shape = input->tensor.shape; - if (starts.data.size() == 4) { - sizes.data[0] = - sizes.data[0] != -1 ? sizes.data[0] : in_shape.b - starts.data[0]; - sizes.data[1] = - sizes.data[1] != -1 ? sizes.data[1] : in_shape.h - starts.data[1]; - sizes.data[2] = - sizes.data[2] != -1 ? sizes.data[2] : in_shape.w - starts.data[2]; - sizes.data[3] = - sizes.data[3] != -1 ? sizes.data[3] : in_shape.c - starts.data[3]; - attr.starts = - BHWC(starts.data[0], starts.data[1], starts.data[2], starts.data[3]); - attr.ends = - BHWC(starts.data[0] + sizes.data[0], starts.data[1] + sizes.data[1], - starts.data[2] + sizes.data[2], starts.data[3] + sizes.data[3]); - } else if (starts.data.size() == 3) { - sizes.data[0] = - sizes.data[0] != -1 ? sizes.data[0] : in_shape.h - starts.data[0]; - sizes.data[1] = - sizes.data[1] != -1 ? sizes.data[1] : in_shape.w - starts.data[1]; - sizes.data[2] = - sizes.data[2] != -1 ? 
sizes.data[2] : in_shape.c - starts.data[2]; - attr.starts = BHWC(0, starts.data[0], starts.data[1], starts.data[2]); - attr.ends = - BHWC(in_shape.b, starts.data[0] + sizes.data[0], - starts.data[1] + sizes.data[1], starts.data[2] + sizes.data[2]); + BHWC bhwc_starts(0, 0, 0, 0); + BHWC bhwc_sizes = input->tensor.shape; + if (input_dims == 4) { + // input in BHWC layout + if (starts.data.size() == 4) { + bhwc_starts.b = starts.data[0]; + bhwc_starts.h = starts.data[1]; + bhwc_starts.w = starts.data[2]; + bhwc_starts.c = starts.data[3]; + bhwc_sizes.b = sizes.data[0]; + bhwc_sizes.h = sizes.data[1]; + bhwc_sizes.w = sizes.data[2]; + bhwc_sizes.c = sizes.data[3]; + } else if (starts.data.size() == 3) { + // if input is 4D(BHWC) and args 3D, we assume that args in HWC layout + bhwc_starts.h = starts.data[0]; + bhwc_starts.w = starts.data[1]; + bhwc_starts.c = starts.data[2]; + bhwc_sizes.h = sizes.data[0]; + bhwc_sizes.w = sizes.data[1]; + bhwc_sizes.c = sizes.data[2]; + } else { + return absl::UnimplementedError( + "Slicing is supported for 3 or 4 dimensional tensors only."); + } + } else if (input_dims == 3) { + // input in BWC layout + if (starts.data.size() == 3) { + bhwc_starts.b = starts.data[0]; + bhwc_starts.w = starts.data[1]; + bhwc_starts.c = starts.data[2]; + bhwc_sizes.b = sizes.data[0]; + bhwc_sizes.w = sizes.data[1]; + bhwc_sizes.c = sizes.data[2]; + } else { + return absl::UnimplementedError( + "Slicing is supported for 3 or 4 dimensional tensors only."); + } } else { return absl::UnimplementedError( "Slicing is supported for 3 or 4 dimensional tensors only."); } + const auto& in_shape = input->tensor.shape; + if (bhwc_sizes.b == -1) { + bhwc_sizes.b = in_shape.b - bhwc_starts.b; + } + if (bhwc_sizes.h == -1) { + bhwc_sizes.h = in_shape.h - bhwc_starts.h; + } + if (bhwc_sizes.w == -1) { + bhwc_sizes.w = in_shape.w - bhwc_starts.w; + } + if (bhwc_sizes.c == -1) { + bhwc_sizes.c = in_shape.c - bhwc_starts.c; + } + attr.starts = bhwc_starts; + attr.ends = + BHWC(bhwc_starts.b + bhwc_sizes.b, bhwc_starts.h + bhwc_sizes.h, + bhwc_starts.w + bhwc_sizes.w, bhwc_starts.c + bhwc_sizes.c); RETURN_IF_ERROR(UpdateIfNegative(in_shape, &attr)); auto out_shape = graph->FindOutputs(node->id)[0]->tensor.shape; @@ -1771,6 +1970,15 @@ class StridedSliceOperationParser : public TFLiteOperationParser { const TfLiteStridedSliceParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckOptionsSupport(tf_options)); + + if (tflite_node->inputs->size < 4) { + return absl::UnimplementedError("STRIDED_SLICE requires 4 inputs."); + } + const TfLiteTensor* input = GetInput(context, tflite_node, 0); + if (input->dims->size != 3 && input->dims->size != 4) { + return absl::UnimplementedError( + "STRIDED_SLICE supports for 3 or 4 dimensional tensors only."); + } return absl::OkStatus(); } @@ -1790,6 +1998,7 @@ class StridedSliceOperationParser : public TFLiteOperationParser { bool read_without_batch = tmp.data.size() == 3; bool read_with_batch = tmp.data.size() == 4; if (!read_without_batch && !read_with_batch) { + // Error: Must be catched in IsSupported() return absl::UnimplementedError( "Slicing is supported for 3 or 4 dimensional tensors only."); } @@ -1941,12 +2150,13 @@ class StridedSliceOperationParser : public TFLiteOperationParser { } }; -class TransposeConvOperationParser : public TFLiteOperationParser { +// Builtin op version of TRANSPOSE_CONV. 
+class TransposeConvBuiltinOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 3)); RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); const TfLiteTransposeConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); @@ -1955,8 +2165,8 @@ class TransposeConvOperationParser : public TFLiteOperationParser { return absl::OkStatus(); } - // TFLite's TRANSPOSE_CONV expects 3 input (output shape, weights, and input) - // and allows configurable padding & stride. + // TFLite's TRANSPOSE_CONV expects 3-4 input tensors (output shape, weights, + // input, and an optional bias) and allows configurable padding & stride. // TODO(impjdi): Translate output_shape to attr.adjacent. absl::Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, @@ -1976,8 +2186,7 @@ class TransposeConvOperationParser : public TFLiteOperationParser { ? HW(tf_options->stride_height, tf_options->stride_width) : HW(1, 1); RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); - - // TFLite does not support bias. + reader->ReadTensor(3, &attr.bias).IgnoreError(); // bias is optional UpdatePadding(tf_options->padding, graph->FindInputs(node->id)[0]->tensor.shape, &attr); @@ -1986,6 +2195,45 @@ class TransposeConvOperationParser : public TFLiteOperationParser { } }; +// Custom op version of TRANSPOSE_CONV. +class TransposeConvCustomOperationParser : public TFLiteOperationParser { + public: + absl::Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + const TfLiteTransposeConvParams* tf_options; + RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); + RETURN_IF_ERROR( + CheckStrides(tf_options->stride_height, tf_options->stride_width)); + return absl::OkStatus(); + } + + absl::Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader) final { + auto* node = graph->NewNode(); + node->operation.type = ToString(OperationType::CONVOLUTION_TRANSPOSED); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + const TfLiteTransposeConvParams* tf_options; + auto status = RetrieveCustomInitialData(tflite_node, &tf_options); + + ConvolutionTransposedAttributes attr; + attr.stride = status.ok() + ? HW(tf_options->stride_height, tf_options->stride_width) + : HW(1, 1); + RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); + reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional + + UpdatePadding(status.ok() ? 
tf_options->padding : kTfLitePaddingUnknown, + graph->FindInputs(node->id)[0]->tensor.shape, &attr); + node->operation.attributes = std::move(attr); + return absl::OkStatus(); + } +}; + class TransposeOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, @@ -2015,29 +2263,18 @@ class TransposeOperationParser : public TFLiteOperationParser { if (perm.data.size() == 4) { attr.perm = BHWC(perm.data[0], perm.data[1], perm.data[2], perm.data[3]); } else if (perm.data.size() == 3) { - std::vector index_to_axis = {Axis::CHANNELS, Axis::WIDTH, - Axis::BATCH}; - std::map remap = { - {Axis::HEIGHT, Axis::HEIGHT}, - {index_to_axis[perm.data[2]], Axis::BATCH}, - {index_to_axis[perm.data[1]], Axis::WIDTH}, - {index_to_axis[perm.data[0]], Axis::CHANNELS}}; - attr.perm.b = axis_to_index[remap[Axis::BATCH]]; - attr.perm.h = axis_to_index[remap[Axis::HEIGHT]]; - attr.perm.w = axis_to_index[remap[Axis::WIDTH]]; - attr.perm.c = axis_to_index[remap[Axis::CHANNELS]]; - + std::vector index_to_axis = {Axis::BATCH, Axis::WIDTH, + Axis::CHANNELS}; + attr.perm.b = axis_to_index[index_to_axis[perm.data[0]]]; + attr.perm.h = 1; + attr.perm.w = axis_to_index[index_to_axis[perm.data[1]]]; + attr.perm.c = axis_to_index[index_to_axis[perm.data[2]]]; } else if (perm.data.size() == 2) { - std::vector index_to_axis = {Axis::CHANNELS, Axis::BATCH}; - std::map remap = { - {Axis::HEIGHT, Axis::HEIGHT}, - {Axis::WIDTH, Axis::WIDTH}, - {index_to_axis[perm.data[1]], Axis::BATCH}, - {index_to_axis[perm.data[0]], Axis::CHANNELS}}; - attr.perm.b = axis_to_index[remap[Axis::BATCH]]; - attr.perm.h = axis_to_index[remap[Axis::HEIGHT]]; - attr.perm.w = axis_to_index[remap[Axis::WIDTH]]; - attr.perm.c = axis_to_index[remap[Axis::CHANNELS]]; + std::vector index_to_axis = {Axis::BATCH, Axis::CHANNELS}; + attr.perm.b = axis_to_index[index_to_axis[perm.data[0]]]; + attr.perm.h = 1; + attr.perm.w = 2; + attr.perm.c = axis_to_index[index_to_axis[perm.data[1]]]; } else { return absl::InvalidArgumentError( "Permutation for transpose is invalid."); @@ -2181,6 +2418,7 @@ class RoIToTransformMatrixOperationParser : public TFLiteOperationParser { absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1)); return absl::OkStatus(); @@ -2205,48 +2443,14 @@ class RoIToTransformMatrixOperationParser : public TFLiteOperationParser { output_value->tensor.shape = output_shape; return absl::OkStatus(); } - - private: }; -class RoIToTransformMatrixV2OperationParser : public TFLiteOperationParser { - public: - absl::Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, - /*runtime_inputs=*/1, /*outputs=*/1)); - return absl::OkStatus(); - } - - absl::Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, - GraphFloat32* graph, ObjectReader* reader) final { - Node* node = graph->NewNode(); - RETURN_IF_ERROR(reader->AddInput(node, 0)); // bbox - RETURN_IF_ERROR(reader->AddOutputs(node)); - - std::string op_name = "roi_to_transform_matrix_v2"; - node->operation.type = op_name; - BHWC output_shape; - RETURN_IF_ERROR(ParseCustomAttributes( - op_name, registration->version, tflite_node->custom_initial_data, - 
tflite_node->custom_initial_data_size, &(node->operation.attributes), - &output_shape)); - - auto output_value = graph->FindOutputs(node->id)[0]; - output_value->tensor.shape = output_shape; - return absl::OkStatus(); - } - - private: -}; - -class TransformTensorOperationParser : public TFLiteOperationParser { +class TransformTensorBilinearOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/2, /*outputs=*/1)); return absl::OkStatus(); @@ -2260,7 +2464,7 @@ class TransformTensorOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox RETURN_IF_ERROR(reader->AddOutputs(node)); - std::string op_name = "transform_tensor"; + std::string op_name = "transform_tensor_bilinear"; node->operation.type = op_name; BHWC output_shape; RETURN_IF_ERROR(ParseCustomAttributes( @@ -2275,45 +2479,6 @@ class TransformTensorOperationParser : public TFLiteOperationParser { graph->FindInputs(node->id)[0]->tensor.shape.c); return absl::OkStatus(); } - - private: -}; - -class TransformTensorBilinearV2OperationParser : public TFLiteOperationParser { - public: - absl::Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, - /*runtime_inputs=*/2, /*outputs=*/1)); - return absl::OkStatus(); - } - - absl::Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, - GraphFloat32* graph, ObjectReader* reader) final { - Node* node = graph->NewNode(); - RETURN_IF_ERROR(reader->AddInput(node, 0)); // data - RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox - RETURN_IF_ERROR(reader->AddOutputs(node)); - - std::string op_name = "transform_tensor_bilinear_v2"; - node->operation.type = op_name; - BHWC output_shape; - RETURN_IF_ERROR(ParseCustomAttributes( - op_name, registration->version, tflite_node->custom_initial_data, - tflite_node->custom_initial_data_size, &(node->operation.attributes), - &output_shape)); - - auto output_value = graph->FindOutputs(node->id)[0]; - - output_value->tensor.shape = - BHWC(1, output_shape.h, output_shape.w, - graph->FindInputs(node->id)[0]->tensor.shape.c); - return absl::OkStatus(); - } - - private: }; class TransformLandmarksOperationParser : public TFLiteOperationParser { @@ -2321,6 +2486,7 @@ class TransformLandmarksOperationParser : public TFLiteOperationParser { absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/2, /*outputs=*/1)); return absl::OkStatus(); @@ -2346,42 +2512,6 @@ class TransformLandmarksOperationParser : public TFLiteOperationParser { output_value->tensor.shape = graph->FindInputs(node->id)[0]->tensor.shape; return absl::OkStatus(); } - - private: -}; - -class TransformLandmarksV2OperationParser : public TFLiteOperationParser { - public: - absl::Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, - /*runtime_inputs=*/2, /*outputs=*/1)); - 
return absl::OkStatus(); - } - - absl::Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, - GraphFloat32* graph, ObjectReader* reader) final { - Node* node = graph->NewNode(); - RETURN_IF_ERROR(reader->AddInput(node, 0)); // data - RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox - RETURN_IF_ERROR(reader->AddOutputs(node)); - std::string op_name = "transform_landmarks_v2"; - node->operation.type = op_name; - - auto output_value = graph->FindOutputs(node->id)[0]; - output_value->tensor.shape = graph->FindInputs(node->id)[0]->tensor.shape; - BHWC output_shape = output_value->tensor.shape; - RETURN_IF_ERROR(ParseCustomAttributes( - op_name, registration->version, tflite_node->custom_initial_data, - tflite_node->custom_initial_data_size, &(node->operation.attributes), - &output_shape)); - - return absl::OkStatus(); - } - - private: }; class Landmarks2TransformMatrixOperationParser : public TFLiteOperationParser { @@ -2389,6 +2519,7 @@ class Landmarks2TransformMatrixOperationParser : public TFLiteOperationParser { absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); return CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1); } @@ -2414,37 +2545,6 @@ class Landmarks2TransformMatrixOperationParser : public TFLiteOperationParser { } }; -class Landmarks2TransformMatrixV2OperationParser - : public TFLiteOperationParser { - public: - absl::Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - return CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, - /*outputs=*/1); - } - - absl::Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, - GraphFloat32* graph, ObjectReader* reader) final { - Node* node = graph->NewNode(); - RETURN_IF_ERROR(reader->AddInput(node, 0)); // landmarks - RETURN_IF_ERROR(reader->AddOutputs(node)); // transform matrix - - const std::string op_name = "landmarks_to_transform_matrix_v2"; - node->operation.type = op_name; - BHWC output_shape; - RETURN_IF_ERROR(ParseCustomAttributes( - op_name, registration->version, tflite_node->custom_initial_data, - tflite_node->custom_initial_data_size, &(node->operation.attributes), - &output_shape)); - - auto output_value = graph->FindOutputs(node->id)[0]; - output_value->tensor.shape = output_shape; - return absl::OkStatus(); - } -}; - class AlignmentPointsToTransformMatrixOperationParser : public TFLiteOperationParser { public: @@ -2563,6 +2663,8 @@ std::unique_ptr NewOperationParser( return std::make_unique(); case kTfLiteBuiltinAveragePool2d: return std::make_unique(PoolingType::AVERAGE); + case kTfLiteBuiltinBatchMatmul: + return std::make_unique(); case kTfLiteBuiltinConcatenation: return std::make_unique(); case kTfLiteBuiltinConv2d: @@ -2607,10 +2709,23 @@ std::unique_ptr NewOperationParser( return std::make_unique(/*mirror_pad=*/true); case kTfLiteBuiltinMul: return std::make_unique(); + case kTfLiteBuiltinNeg: + return std::make_unique(OperationType::NEG); + case kTfLiteBuiltinPack: + return std::make_unique(); case kTfLiteBuiltinPad: return std::make_unique(/*mirror_pad=*/false); case kTfLiteBuiltinPow: return std::make_unique(OperationType::POW); + case kTfLiteBuiltinReduceMax: + return std::make_unique( + OperationType::REDUCE_MAXIMUM); + case kTfLiteBuiltinReduceMin: + return std::make_unique( + 
OperationType::REDUCE_MINIMUM); + case kTfLiteBuiltinReduceProd: + return std::make_unique( + OperationType::REDUCE_PRODUCT); case kTfLiteBuiltinQuantize: if (allow_quant_ops) { return std::make_unique(); @@ -2652,17 +2767,19 @@ std::unique_ptr NewOperationParser( return std::make_unique(); case kTfLiteBuiltinSub: return std::make_unique(OperationType::SUB); + case kTfLiteBuiltinSum: + return std::make_unique(OperationType::REDUCE_SUM); case kTfLiteBuiltinTanh: return std::make_unique(OperationType::TANH); case kTfLiteBuiltinTranspose: return std::make_unique(); case kTfLiteBuiltinTransposeConv: - return std::make_unique(); + return std::make_unique(); case kTfLiteBuiltinCustom: const absl::string_view custom_name = registration->custom_name; if (custom_name == "Convolution2DTransposeBias") { - return std::make_unique(); + return std::make_unique(); } if (custom_name == "MaxPoolingWithArgmax2D") { return std::make_unique(PoolingType::MAX); @@ -2673,27 +2790,17 @@ std::unique_ptr NewOperationParser( if (custom_name == "RoIToTransformMatrix") { return std::make_unique(); } - if (custom_name == "RoIToTransformMatrixV2") { - return std::make_unique(); - } - if (custom_name == "TransformTensor") { - return std::make_unique(); - } - if (custom_name == "TransformTensorBilinearV2") { - return std::make_unique(); + if (custom_name == "TransformTensor" /*for version 1*/ || + custom_name == "TransformTensorBilinear" /*for version 2*/) { + return std::make_unique(); } if (custom_name == "TransformLandmarks") { return std::make_unique(); } - if (custom_name == "TransformLandmarksV2") { - return std::make_unique(); - } - if (custom_name == "Landmarks2TransformMatrix") { + if (custom_name == "Landmarks2TransformMatrix" || + custom_name == "Landmarks2TransformMatrixV2") { return std::make_unique(); } - if (custom_name == "Landmarks2TransformMatrixV2") { - return std::make_unique(); - } if (custom_name == "AlignmentPointsToTransformMatrix") { return std::make_unique< AlignmentPointsToTransformMatrixOperationParser>(); @@ -2813,6 +2920,44 @@ absl::Status PrecreateIOTensors( return absl::OkStatus(); } +absl::Status CopyVariableTensorOutputs( + TfLiteNode* tflite_node, TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader& reader, + const absl::flat_hash_map& new_variable_tensor_values) { + absl::flat_hash_map new_variable_tensor_values_copy( + new_variable_tensor_values); + // Retrieve the final value id for the variable input tensors. 
+ for (int i = 0; i < tflite_node->inputs->size; i++) { + int tensor_idx = tflite_node->inputs->data[i]; + Value* value; + if (!reader.ReadValueByTensorIdx(tensor_idx, &value).ok()) continue; + if (value->tensor.is_variable_input) { + if (new_variable_tensor_values_copy.find(i) == + new_variable_tensor_values_copy.end()) { + return absl::InvalidArgumentError( + absl::StrCat(GetOpNameByRegistration(*registration), + " did not provide a new value for the variable input " + "tensor with index ", + tensor_idx)); + } else { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::COPY); + RETURN_IF_ERROR(graph->AddConsumer( + node->id, new_variable_tensor_values_copy.at(i))); + RETURN_IF_ERROR(reader.AddUpdate(node, i)); + new_variable_tensor_values_copy.erase( + new_variable_tensor_values_copy.find(i)); + } + } + } + if (!new_variable_tensor_values_copy.empty()) { + return absl::InvalidArgumentError( + "More input variable tensors asked to be copied than present on the " + "node"); + } + return absl::OkStatus(); +} + absl::Status BuildModel(TfLiteContext* context, const TfLiteDelegateParams* delegate_params, GraphFloat32* graph, @@ -2843,6 +2988,7 @@ absl::Status BuildModel(TfLiteContext* context, tflite_nodes.push_back(i); } absl::flat_hash_map tensor_to_value; + std::vector variable_inputs_to_value_id; RETURN_IF_ERROR(PrecreateIOTensors(context, graph, delegate_params->input_tensors, quant_conversion_map, &tensor_to_value)); @@ -2863,6 +3009,23 @@ absl::Status BuildModel(TfLiteContext* context, return absl::InternalError(absl::StrCat( GetOpNameByRegistration(*registration), ": ", status.message())); } + + absl::flat_hash_map new_value_for_variable_input_tensors = + operations[i]->GetNewValueIdsForVariableInputNodes(); + + RETURN_IF_ERROR( + CopyVariableTensorOutputs(tflite_node, registration, graph, reader, + new_value_for_variable_input_tensors)); + } + + // Variable input tensors expect to be unchanged throughout model execution. + // They need to be an output of the graph in order to have them unchanged. + for (auto value_id : variable_inputs_to_value_id) { + if (!graph->IsGraphOutput(value_id)) { + return absl::InvalidArgumentError( + absl::StrCat("Variable input tensors must be a graph output. Value ", + value_id, " is not a graph output")); + } } return absl::OkStatus(); } @@ -2876,8 +3039,8 @@ absl::Status BuildFinalModel( // Apply general transformations on the graph. NullTransformationReporter reporter; ModelTransformer transformer(graph, &reporter); - if (!ApplyGeneralTransformations(&transformer)) { - return absl::InternalError("Graph general transformations failed"); + if (!ApplyModelTransformations(&transformer)) { + return absl::InternalError("Graph transformations failed"); } return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.h b/tensorflow/lite/delegates/gpu/common/model_builder.h index 9d80e9636f0..ab18f056d58 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.h +++ b/tensorflow/lite/delegates/gpu/common/model_builder.h @@ -16,13 +16,12 @@ limitations under the License. 
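The variable-input plumbing added above works in two steps; a condensed sketch with error handling trimmed (the map's template parameters and input_index are spelled out here only for illustration):

  // 1. A parser that rewrites a variable input tensor reports, per input
  //    index, the value id holding that tensor's updated contents.
  absl::flat_hash_map<int, ValueId> new_values =
      parser->GetNewValueIdsForVariableInputNodes();

  // 2. For each variable input, BuildModel adds a COPY node that consumes the
  //    reported value and writes it back to the variable tensor, so the new
  //    state is observable on the next invocation.
  Node* copy = graph->NewNode();
  copy->operation.type = ToString(OperationType::COPY);
  RETURN_IF_ERROR(graph->AddConsumer(copy->id, new_values.at(input_index)));
  RETURN_IF_ERROR(reader.AddUpdate(copy, input_index));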
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_H_ -#include -#include - #include "absl/container/flat_hash_map.h" -#include "tensorflow/lite/context.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc index b030fb7e700..4f67495152c 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc @@ -15,19 +15,27 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model_builder_helper.h" +#include +#include +#include + +#include +#include #include +#include #include #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" -#include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/context.h" #include "tensorflow/lite/context_util.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/utils.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { @@ -89,15 +97,19 @@ absl::Status ExtractTensorShape(const TfLiteTensor& tflite_tensor, BHWC* bhwc) { const TfLiteIntArray* dims = tflite_tensor.dims; switch (dims->size) { case 1: + // B layout *bhwc = BHWC(dims->data[0], 1, 1, 1); return absl::OkStatus(); case 2: + // BC layout *bhwc = BHWC(dims->data[0], 1, 1, dims->data[1]); return absl::OkStatus(); case 3: + // BWC layout *bhwc = BHWC(dims->data[0], 1, dims->data[1], dims->data[2]); return absl::OkStatus(); case 4: + // BHWC layout *bhwc = BHWC(dims->data[0], dims->data[1], dims->data[2], dims->data[3]); return absl::OkStatus(); default: @@ -107,6 +119,40 @@ absl::Status ExtractTensorShape(const TfLiteTensor& tflite_tensor, BHWC* bhwc) { } } +absl::Status ExtractAxisFromIndex(const TfLiteTensor& tflite_tensor, int index, + Axis* axis) { + const TfLiteIntArray* dims = tflite_tensor.dims; + if (index == -1) { + index = dims->size - 1; + } + if (index < 0 || index >= dims->size) { + return absl::OutOfRangeError("Index for axis out of range"); + } + std::vector index_to_axis; + switch (dims->size) { + case 1: + // B layout + index_to_axis = {Axis::BATCH}; + break; + case 2: + // BC layout + index_to_axis = {Axis::BATCH, Axis::CHANNELS}; + break; + case 3: + // BWC layout + index_to_axis = {Axis::BATCH, Axis::WIDTH, Axis::CHANNELS}; + break; + case 4: + // BHWC layout + index_to_axis = {Axis::BATCH, Axis::HEIGHT, Axis::WIDTH, Axis::CHANNELS}; + break; + default: + return absl::UnavailableError("Unknown layout."); + } + *axis = index_to_axis[index]; + return absl::OkStatus(); +} + absl::Status ConvertTfLiteTensorToTensorRef(const TfLiteTensor& tflite_tensor, TensorRef* tensor_ref) { tensor_ref->type = ToDataType(tflite_tensor.type); diff --git 
a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h index 849ef049683..93889314e81 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h @@ -16,6 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_HELPER_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_MODEL_BUILDER_HELPER_H_ +#include +#include +#include + #include #include "absl/strings/str_cat.h" @@ -29,7 +33,6 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/dequantize.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" -#include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { namespace gpu { @@ -42,6 +45,9 @@ DataType ToDataType(TfLiteType type); absl::Status ExtractTensorShape(const TfLiteTensor& tflite_tensor, BHWC* bhwc); +absl::Status ExtractAxisFromIndex(const TfLiteTensor& tflite_tensor, int index, + Axis* axis); + absl::Status ConvertTfLiteTensorToTensorRef(const TfLiteTensor& tflite_tensor, TensorRef* tensor_ref); diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_test.cc b/tensorflow/lite/delegates/gpu/common/model_builder_test.cc index c5ee71b3f3f..9bc848b9210 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_test.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_test.cc @@ -15,15 +15,21 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model_builder.h" -#include +#include +#include + +#include +#include +#include -#include #include +#include "absl/status/status.h" #include "tensorflow/lite/builtin_ops.h" -#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/subgraph.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/stderr_reporter.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/model_test.cc b/tensorflow/lite/delegates/gpu/common/model_test.cc index 87f65eb730a..816674b6674 100644 --- a/tensorflow/lite/delegates/gpu/common/model_test.cc +++ b/tensorflow/lite/delegates/gpu/common/model_test.cc @@ -15,11 +15,9 @@ limitations under the License. 
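As a quick illustration of the layout conventions used by ExtractTensorShape and the new ExtractAxisFromIndex (the tensor contents here are hypothetical):

  // A 3-D TFLite tensor with dims {2, 8, 16} is treated as BWC, i.e. it maps
  // to BHWC(2, 1, 8, 16).
  BHWC shape;
  RETURN_IF_ERROR(ExtractTensorShape(tflite_tensor, &shape));

  // For the same 3-D tensor, axis index 1 resolves to Axis::WIDTH, and -1
  // (the last dimension) resolves to Axis::CHANNELS.
  Axis axis;
  RETURN_IF_ERROR(ExtractAxisFromIndex(tflite_tensor, /*index=*/-1, &axis));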
#include "tensorflow/lite/delegates/gpu/common/model.h" -#include -#include - #include #include +#include "absl/status/status.h" namespace tflite { namespace gpu { @@ -139,98 +137,140 @@ TEST(Model, RemoveProducer) { ASSERT_FALSE(graph.RemoveProducer(graph_output->id).ok()); } -TEST(Model, RemoveSimpleNodeDegenerateCase) { - GraphFloat32 graph; - Node* node = graph.NewNode(); - Value* graph_input = graph.NewValue(); - Value* graph_output = graph.NewValue(); +class OneNodeModel : public testing::Test { + protected: + void SetUp() override { + node_ = graph_.NewNode(); + Value* graph_input = graph_.NewValue(); + Value* graph_output = graph_.NewValue(); + ASSERT_TRUE(graph_.AddConsumer(node_->id, graph_input->id).ok()); + ASSERT_TRUE(graph_.SetProducer(node_->id, graph_output->id).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output)); + EXPECT_THAT(graph_.nodes(), ElementsAre(node_)); + } + GraphFloat32 graph_; + Node* node_; +}; - ASSERT_TRUE(graph.AddConsumer(node->id, graph_input->id).ok()); - ASSERT_TRUE(graph.SetProducer(node->id, graph_output->id).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output)); - EXPECT_THAT(graph.nodes(), ElementsAre(node)); - - ASSERT_TRUE(RemoveOneInputOneOutputNode(&graph, node).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre()); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre()); - EXPECT_THAT(graph.nodes(), ElementsAre()); +TEST_F(OneNodeModel, DeleteNodeKeepInput) { + ASSERT_TRUE(RemoveSimpleNodeKeepInput(&graph_, node_).ok()); + EXPECT_TRUE(graph_.inputs().empty()); + EXPECT_TRUE(graph_.outputs().empty()); + EXPECT_TRUE(graph_.nodes().empty()); } -TEST(Model, RemoveSimpleNodeNoPreviousNode) { - GraphFloat32 graph; - Node* simple_node = graph.NewNode(); - Node* consumer_node = graph.NewNode(); - Value* graph_input = graph.NewValue(); - Value* graph_output = graph.NewValue(); - Value* value = graph.NewValue(); - - ASSERT_TRUE(graph.AddConsumer(simple_node->id, graph_input->id).ok()); - ASSERT_TRUE(graph.SetProducer(simple_node->id, value->id).ok()); - ASSERT_TRUE(graph.AddConsumer(consumer_node->id, value->id).ok()); - ASSERT_TRUE(graph.SetProducer(consumer_node->id, graph_output->id).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output)); - EXPECT_THAT(graph.nodes(), ElementsAre(simple_node, consumer_node)); - - ASSERT_TRUE(RemoveOneInputOneOutputNode(&graph, simple_node).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output)); - EXPECT_THAT(graph.nodes(), ElementsAre(consumer_node)); +TEST_F(OneNodeModel, DeleteNodeKeepOutput) { + ASSERT_TRUE(RemoveSimpleNodeKeepOutput(&graph_, node_).ok()); + EXPECT_TRUE(graph_.inputs().empty()); + EXPECT_TRUE(graph_.outputs().empty()); + EXPECT_TRUE(graph_.nodes().empty()); } -TEST(Model, RemoveSimpleNodeNoAfterNodes) { - GraphFloat32 graph; - Node* simple_node = graph.NewNode(); - Node* producer_node = graph.NewNode(); - Value* graph_input = graph.NewValue(); - Value* graph_output = graph.NewValue(); - Value* value = graph.NewValue(); +class TwoNodesModel : public testing::Test { + protected: + void SetUp() override { + graph_input_ = graph_.NewValue(); + first_node_ = graph_.NewNode(); + value_ = graph_.NewValue(); + second_node_ = graph_.NewNode(); + graph_output_ = 
graph_.NewValue(); - ASSERT_TRUE(graph.AddConsumer(simple_node->id, value->id).ok()); - ASSERT_TRUE(graph.SetProducer(simple_node->id, graph_output->id).ok()); - ASSERT_TRUE(graph.AddConsumer(producer_node->id, graph_input->id).ok()); - ASSERT_TRUE(graph.SetProducer(producer_node->id, value->id).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output)); - EXPECT_THAT(graph.nodes(), ElementsAre(simple_node, producer_node)); + ASSERT_TRUE(graph_.AddConsumer(first_node_->id, graph_input_->id).ok()); + ASSERT_TRUE(graph_.SetProducer(first_node_->id, value_->id).ok()); + ASSERT_TRUE(graph_.AddConsumer(second_node_->id, value_->id).ok()); + ASSERT_TRUE(graph_.SetProducer(second_node_->id, graph_output_->id).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(first_node_, second_node_)); + } + GraphFloat32 graph_; + Node* first_node_; + Node* second_node_; + Value* graph_input_; + Value* value_; + Value* graph_output_; +}; - ASSERT_TRUE(RemoveOneInputOneOutputNode(&graph, simple_node).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(value)); - EXPECT_THAT(graph.nodes(), ElementsAre(producer_node)); +TEST_F(TwoNodesModel, DeleteFirstNodeKeepInput) { + ASSERT_TRUE(RemoveSimpleNodeKeepInput(&graph_, first_node_).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(second_node_)); } -TEST(Model, RemoveSimpleNodeGeneralCase) { - GraphFloat32 graph; - Node* simple_node = graph.NewNode(); - Node* producer_node = graph.NewNode(); - Node* consumer_node = graph.NewNode(); - Value* graph_input = graph.NewValue(); - Value* graph_output = graph.NewValue(); - Value* value0 = graph.NewValue(); - Value* value1 = graph.NewValue(); - - ASSERT_TRUE(graph.AddConsumer(producer_node->id, graph_input->id).ok()); - ASSERT_TRUE(graph.SetProducer(producer_node->id, value0->id).ok()); - ASSERT_TRUE(graph.AddConsumer(simple_node->id, value0->id).ok()); - ASSERT_TRUE(graph.SetProducer(simple_node->id, value1->id).ok()); - ASSERT_TRUE(graph.AddConsumer(consumer_node->id, value1->id).ok()); - ASSERT_TRUE(graph.SetProducer(consumer_node->id, graph_output->id).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output)); - EXPECT_THAT(graph.nodes(), - ElementsAre(simple_node, producer_node, consumer_node)); - - ASSERT_TRUE(RemoveOneInputOneOutputNode(&graph, simple_node).ok()); - EXPECT_THAT(graph.inputs(), UnorderedElementsAre(graph_input)); - EXPECT_THAT(graph.outputs(), UnorderedElementsAre(graph_output)); - EXPECT_THAT(graph.nodes(), ElementsAre(producer_node, consumer_node)); - EXPECT_THAT(graph.values(), - UnorderedElementsAre(graph_input, graph_output, value0)); +TEST_F(TwoNodesModel, DeleteFirstNodeKeepOutput) { + ASSERT_TRUE(RemoveSimpleNodeKeepOutput(&graph_, first_node_).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(value_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(second_node_)); } -TEST(Model, RemoveSimpleNodeComplexCase) { +TEST_F(TwoNodesModel, DeleteSecondNodeKeepInput) { + ASSERT_TRUE(RemoveSimpleNodeKeepInput(&graph_, 
second_node_).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(value_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(first_node_)); +} + +TEST_F(TwoNodesModel, DeleteSecondNodeKeepOutput) { + ASSERT_TRUE(RemoveSimpleNodeKeepOutput(&graph_, second_node_).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(first_node_)); +} + +class ThreeNodesModel : public testing::Test { + protected: + void SetUp() override { + first_node_ = graph_.NewNode(); + second_node_ = graph_.NewNode(); + third_node_ = graph_.NewNode(); + graph_input_ = graph_.NewValue(); + value0_ = graph_.NewValue(); + value1_ = graph_.NewValue(); + graph_output_ = graph_.NewValue(); + + ASSERT_TRUE(graph_.AddConsumer(first_node_->id, graph_input_->id).ok()); + ASSERT_TRUE(graph_.SetProducer(first_node_->id, value0_->id).ok()); + ASSERT_TRUE(graph_.AddConsumer(second_node_->id, value0_->id).ok()); + ASSERT_TRUE(graph_.SetProducer(second_node_->id, value1_->id).ok()); + ASSERT_TRUE(graph_.AddConsumer(third_node_->id, value1_->id).ok()); + ASSERT_TRUE(graph_.SetProducer(third_node_->id, graph_output_->id).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), + ElementsAre(first_node_, second_node_, third_node_)); + } + GraphFloat32 graph_; + Node* first_node_; + Node* second_node_; + Node* third_node_; + Value* graph_input_; + Value* value0_; + Value* value1_; + Value* graph_output_; +}; + +TEST_F(ThreeNodesModel, DeleteMiddleNodeKeepInput) { + ASSERT_TRUE(RemoveSimpleNodeKeepInput(&graph_, second_node_).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(first_node_, third_node_)); + EXPECT_THAT(graph_.values(), + UnorderedElementsAre(graph_input_, value0_, graph_output_)); +} + +TEST_F(ThreeNodesModel, DeleteMiddleNodeKeepOutput) { + ASSERT_TRUE(RemoveSimpleNodeKeepOutput(&graph_, second_node_).ok()); + EXPECT_THAT(graph_.inputs(), UnorderedElementsAre(graph_input_)); + EXPECT_THAT(graph_.outputs(), UnorderedElementsAre(graph_output_)); + EXPECT_THAT(graph_.nodes(), ElementsAre(first_node_, third_node_)); + EXPECT_THAT(graph_.values(), + UnorderedElementsAre(graph_input_, value1_, graph_output_)); +} + +TEST(Model, RemoveSimpleNodeKeepInputComplexCase) { // We have this graph and we are going to delete n1 and preserve order of // v0, v1 for n0 node and v2, v3 for n2 node // v0 v1 @@ -276,7 +316,11 @@ TEST(Model, RemoveSimpleNodeComplexCase) { EXPECT_THAT(graph.outputs(), UnorderedElementsAre(o1, o2)); EXPECT_THAT(graph.nodes(), ElementsAre(n0, n1, n2)); - ASSERT_TRUE(RemoveOneInputOneOutputNode(&graph, n1).ok()); + // Node should be the only consumer of the input value to be able to be + // deleted with this function. 
+ ASSERT_FALSE(RemoveSimpleNodeKeepOutput(&graph, n1).ok()); + + ASSERT_TRUE(RemoveSimpleNodeKeepInput(&graph, n1).ok()); EXPECT_THAT(graph.inputs(), UnorderedElementsAre(v0, v1, v3)); EXPECT_THAT(graph.outputs(), UnorderedElementsAre(o1, o2)); EXPECT_THAT(graph.nodes(), ElementsAre(n0, n2)); @@ -466,6 +510,29 @@ TEST(Model, InsertNodeAfter) { EXPECT_THAT(graph.nodes(), ElementsAre(node1, new_node1, node2, new_node2)); } +TEST(BatchMatchingTest, EmptyGraph) { + GraphFloat32 graph; + ASSERT_TRUE(IsBatchMatchesForAllValues(graph)); +} + +TEST(BatchMatchingTest, AllMatch) { + GraphFloat32 graph; + Value* a = graph.NewValue(); + Value* b = graph.NewValue(); + a->tensor.shape = BHWC(1, 1, 1, 1); + b->tensor.shape = BHWC(1, 1, 1, 1); + ASSERT_TRUE(IsBatchMatchesForAllValues(graph)); +} + +TEST(BatchMatchingTest, NotAllMatch) { + GraphFloat32 graph; + Value* a = graph.NewValue(); + Value* b = graph.NewValue(); + a->tensor.shape = BHWC(1, 1, 1, 1); + b->tensor.shape = BHWC(2, 1, 1, 1); + ASSERT_FALSE(IsBatchMatchesForAllValues(graph)); +} + } // namespace } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/model_transformer.cc b/tensorflow/lite/delegates/gpu/common/model_transformer.cc index 81287dd61e5..3be7ec55196 100644 --- a/tensorflow/lite/delegates/gpu/common/model_transformer.cc +++ b/tensorflow/lite/delegates/gpu/common/model_transformer.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_join.h" #include "tensorflow/lite/delegates/gpu/common/model.h" diff --git a/tensorflow/lite/delegates/gpu/common/model_transformer.h b/tensorflow/lite/delegates/gpu/common/model_transformer.h index fd2667390f3..b640b14e0b4 100644 --- a/tensorflow/lite/delegates/gpu/common/model_transformer.h +++ b/tensorflow/lite/delegates/gpu/common/model_transformer.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include "absl/container/flat_hash_set.h" diff --git a/tensorflow/lite/delegates/gpu/common/object_reader.cc b/tensorflow/lite/delegates/gpu/common/object_reader.cc index c837fa061c0..04e4a14804a 100644 --- a/tensorflow/lite/delegates/gpu/common/object_reader.cc +++ b/tensorflow/lite/delegates/gpu/common/object_reader.cc @@ -16,13 +16,18 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/object_reader.h" #include +#include +#include #include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_builder_helper.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/utils.h" +#include "tensorflow/lite/kernels/kernel_util.h" namespace tflite { namespace gpu { @@ -58,6 +63,9 @@ absl::Status ObjectReader::ReadNonConstantTensor( &fp_tensor_index) != kTfLiteOk) { return absl::InternalError("Could not add new tensor to graph"); } + // `tflite_tensor` value could be invalid when the `context->tensors` + // is reallocated. Thus reassigning `tflite_tensor` with a fresh value. + tflite_tensor = &context->tensors[tensor_idx]; // Remember this tensor for later. 
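The `tflite_tensor = &context->tensors[tensor_idx];` reassignment above guards against a classic invalidation hazard: creating the new dequantized tensor can grow `context->tensors`, so any pointer taken into that array before the call may dangle afterwards. Below is a minimal stand-alone C++ sketch of the same hazard and the same fix; it uses `std::vector` purely as a stand-in for the TfLite tensor array, not the actual TfLite API.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> tensors = {10, 20, 30};
  int* cached = &tensors[1];  // analogous to the cached `tflite_tensor`
  // Appending may reallocate the backing storage, much like registering the
  // new dequantized tensor can reallocate `context->tensors`.
  tensors.push_back(40);
  // `cached` may now dangle; re-fetch it through the stable index instead.
  cached = &tensors[1];
  assert(*cached == 20);
  return 0;
}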
(*quant_conversion_map)[fp_tensor_index] = tensor_idx; @@ -67,10 +75,8 @@ absl::Status ObjectReader::ReadNonConstantTensor( RETURN_IF_ERROR( ConvertTfLiteTensorToTensorRef(*fp_tflite_tensor, &value->tensor)); value->tensor.ref = fp_tensor_index; + value->tensor.is_variable_input = tflite_tensor->is_variable; value->quant_params.emplace(); - // tflite_tensor from the outer scope is invalidated due to calling - // CreateNewTensorWithDifferentType - tflite_tensor = &context->tensors[tensor_idx]; RETURN_IF_ERROR( PopulateQuantParams(*tflite_tensor, &value->quant_params.value())); (*tensor_to_value)[fp_tensor_index] = value; @@ -84,6 +90,7 @@ absl::Status ObjectReader::ReadNonConstantTensor( RETURN_IF_ERROR( ConvertTfLiteTensorToTensorRef(*tflite_tensor, &value->tensor)); value->tensor.ref = tensor_idx; + value->tensor.is_variable_input = tflite_tensor->is_variable; (*tensor_to_value)[tensor_idx] = value; } } @@ -154,6 +161,53 @@ absl::Status ObjectReader::AddInput(const Node* node, uint32_t idx) { return graph_->AddConsumer(node->id, input->id); } +absl::Status ObjectReader::AddUpdate(const Node* node, uint32_t idx) { + if (node_->inputs->size <= idx) { + return absl::InvalidArgumentError(absl::StrCat( + "Data id ", idx, " must be less than tflite node inputs size ", + node_->inputs->size)); + } + + int update_tensor_idx = node_->inputs->data[idx]; + TfLiteTensor* update_tensor = context_->tensors + update_tensor_idx; + if (!update_tensor->is_variable) { + return absl::InvalidArgumentError( + "The tensor must be a variable tensor to update it in place"); + } + + Value* value; + RETURN_IF_ERROR(ReadValueByTensorIdx(update_tensor_idx, &value)); + if (!value->tensor.is_variable_input) { + return absl::InternalError( + "Variable input tensor is not marked as variable"); + } + + // We cannot create a cycle in the graph. The way around this when a node + // updates a tensor in place would be to add a new value to the graph that + // points to the same tensor. + Value* updated_value = graph_->NewValue(); + updated_value->tensor = value->tensor; + updated_value->quant_params = value->quant_params; + RETURN_IF_ERROR(graph_->SetProducer(node->id, updated_value->id)); + + // We also need to update the tensor_to_value arrays so that the nodes added + // after the current node will access the tensor with the updated value rather + // than the initial value. + if (quant_conversion_map_ != nullptr && + quant_conversion_map_->find(update_tensor_idx) != + quant_conversion_map_->end()) { + // If quantization conversion map exists, then the index provided is not the + // actual tensor idx. We need to find the float version of the tensor from + // the map. + tensor_to_value_->at(quant_conversion_map_->at(update_tensor_idx)) = + updated_value; + } else { + tensor_to_value_->at(update_tensor_idx) = updated_value; + } + + return absl::OkStatus(); +} + TfLiteTensor* ObjectReader::GetInputTensor(int index) const { return index >= 0 && index < node_->inputs->size ? 
context_->tensors + node_->inputs->data[index] diff --git a/tensorflow/lite/delegates/gpu/common/object_reader.h b/tensorflow/lite/delegates/gpu/common/object_reader.h index 246bc71f9c5..3c7d7f6a859 100644 --- a/tensorflow/lite/delegates/gpu/common/object_reader.h +++ b/tensorflow/lite/delegates/gpu/common/object_reader.h @@ -86,6 +86,8 @@ class ObjectReader { absl::Status AddInput(const Node* node, uint32_t idx); + absl::Status AddUpdate(const Node* node, uint32_t idx); + TfLiteTensor* GetInputTensor(int index) const; TfLiteTensor* GetOutputTensor(int index) const; diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc index fbffe9d65ff..19d7bd919c5 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.cc +++ b/tensorflow/lite/delegates/gpu/common/operations.cc @@ -15,11 +15,17 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include #include +#include +#include +#include +#include #include "absl/container/flat_hash_map.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -76,6 +82,8 @@ std::string ToString(enum OperationType op) { return "batch_normalization"; case OperationType::BATCH_TO_SPACE: return "batch_to_space"; + case OperationType::BATCHED_MATMUL: + return "batched_matmul"; case OperationType::CONCAT: return "concat"; case OperationType::CONST: @@ -94,12 +102,22 @@ std::string ToString(enum OperationType op) { return "div"; case OperationType::ELU: return "elu"; + case OperationType::EQUAL: + return "equal"; case OperationType::EXP: return "exp"; case OperationType::FULLY_CONNECTED: return "fully_connected"; + case OperationType::GREATER: + return "greater"; + case OperationType::GREATER_EQUAL: + return "greater_equal"; case OperationType::HARD_SWISH: return "hard_swish"; + case OperationType::LESS: + return "less"; + case OperationType::LESS_EQUAL: + return "less_equal"; case OperationType::LOG: return "log"; case OperationType::LSTM: @@ -116,6 +134,10 @@ std::string ToString(enum OperationType op) { return "minimum"; case OperationType::MUL: return "mul"; + case OperationType::NEG: + return "neg"; + case OperationType::NOT_EQUAL: + return "not_equal"; case OperationType::PAD: return "pad"; case OperationType::POOLING_2D: @@ -126,6 +148,14 @@ std::string ToString(enum OperationType op) { return "prelu"; case OperationType::QUANTIZE_AND_DEQUANTIZE: return "quantize_and_dequantize"; + case OperationType::REDUCE_MAXIMUM: + return "reduce_maximum"; + case OperationType::REDUCE_MINIMUM: + return "reduce_minimum"; + case OperationType::REDUCE_PRODUCT: + return "reduce_product"; + case OperationType::REDUCE_SUM: + return "reduce_sum"; case OperationType::RELU: return "relu"; case OperationType::RESHAPE: @@ -169,6 +199,7 @@ OperationType OperationTypeFromString(const std::string& name) { {"abs", OperationType::ABS}, {"add", OperationType::ADD}, {"batch_normalization", OperationType::BATCH_NORMALIZATION}, + {"batched_matmul", OperationType::BATCHED_MATMUL}, {"concat", OperationType::CONCAT}, {"const", OperationType::CONST}, {"convolution_2d", OperationType::CONVOLUTION_2D}, @@ -178,9 +209,14 @@ OperationType OperationTypeFromString(const std::string& name) { {"depthwise_convolution", OperationType::DEPTHWISE_CONVOLUTION}, {"div", OperationType::DIV}, {"elu", OperationType::ELU}, + {"equal", OperationType::EQUAL}, 
{"exp", OperationType::EXP}, {"fully_connected", OperationType::FULLY_CONNECTED}, + {"greater", OperationType::GREATER}, + {"greater_equal", OperationType::GREATER_EQUAL}, {"hard_swish", OperationType::HARD_SWISH}, + {"less", OperationType::LESS}, + {"less_equal", OperationType::LESS_EQUAL}, {"log", OperationType::LOG}, {"lstm", OperationType::LSTM}, {"maximum", OperationType::MAXIMUM}, @@ -190,11 +226,17 @@ OperationType OperationTypeFromString(const std::string& name) { OperationType::MEAN_STDDEV_NORMALIZATION}, {"minimum", OperationType::MINIMUM}, {"mul", OperationType::MUL}, + {"neg", OperationType::NEG}, + {"not_equal", OperationType::NOT_EQUAL}, {"pad", OperationType::PAD}, {"pooling_2d", OperationType::POOLING_2D}, {"pow", OperationType::POW}, {"prelu", OperationType::PRELU}, {"quantize_and_dequantize", OperationType::QUANTIZE_AND_DEQUANTIZE}, + {"reduce_maximum", OperationType::REDUCE_MAXIMUM}, + {"reduce_minimum", OperationType::REDUCE_MINIMUM}, + {"reduce_product", OperationType::REDUCE_PRODUCT}, + {"reduce_sum", OperationType::REDUCE_SUM}, {"relu", OperationType::RELU}, {"resize", OperationType::RESIZE}, {"reshape", OperationType::RESHAPE}, diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h index 563dbdec96e..a93f63a02b7 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.h +++ b/tensorflow/lite/delegates/gpu/common/operations.h @@ -17,14 +17,15 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_OPERATIONS_H_ #include +#include #include #include #include "absl/types/variant.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -36,6 +37,7 @@ enum class OperationType { ADD, BATCH_TO_SPACE, BATCH_NORMALIZATION, + BATCHED_MATMUL, CONCAT, CONST, CONVOLUTION_2D, @@ -45,9 +47,14 @@ enum class OperationType { DEPTHWISE_CONVOLUTION, DIV, ELU, + EQUAL, EXP, FULLY_CONNECTED, + GREATER, + GREATER_EQUAL, HARD_SWISH, + LESS, + LESS_EQUAL, LOG, LSTM, MAXIMUM, @@ -56,12 +63,18 @@ enum class OperationType { MEAN_STDDEV_NORMALIZATION, MINIMUM, MUL, + NEG, + NOT_EQUAL, PAD, POOLING_2D, POW, PRELU, // Used to accurately run inference on quantized models. QUANTIZE_AND_DEQUANTIZE, + REDUCE_MAXIMUM, + REDUCE_MINIMUM, + REDUCE_PRODUCT, + REDUCE_SUM, RELU, RESHAPE, RESIZE, @@ -358,6 +371,10 @@ struct PReLUAttributes { alpha; }; +struct ReduceAttributes { + Axis axis = Axis::UNKNOWN; +}; + struct SoftmaxAttributes { Axis axis = Axis::UNKNOWN; }; diff --git a/tensorflow/lite/delegates/gpu/common/quantization_util.cc b/tensorflow/lite/delegates/gpu/common/quantization_util.cc index fe92989a3ae..bbd99023a2f 100644 --- a/tensorflow/lite/delegates/gpu/common/quantization_util.cc +++ b/tensorflow/lite/delegates/gpu/common/quantization_util.cc @@ -15,9 +15,15 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/quantization_util.h" +#include + +#include + #include "absl/container/flat_hash_map.h" -#include "tensorflow/lite/builtin_ops.h" +#include "absl/status/status.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/internal/types.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/quantization_util.h b/tensorflow/lite/delegates/gpu/common/quantization_util.h index fc01d612d6f..584f6876a9c 100644 --- a/tensorflow/lite/delegates/gpu/common/quantization_util.h +++ b/tensorflow/lite/delegates/gpu/common/quantization_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_QUANTIZATION_UTIL_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_QUANTIZATION_UTIL_H_ +#include + #include #include "absl/container/flat_hash_map.h" diff --git a/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc b/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc index b5cdaec91e0..ffded543123 100644 --- a/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc +++ b/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc @@ -15,8 +15,18 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/quantization_util.h" +#include + +#include +#include +#include +#include + #include #include +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/util.h" using ::testing::Eq; diff --git a/tensorflow/lite/delegates/gpu/common/shape.cc b/tensorflow/lite/delegates/gpu/common/shape.cc index 074637a7774..c66ecea1215 100644 --- a/tensorflow/lite/delegates/gpu/common/shape.cc +++ b/tensorflow/lite/delegates/gpu/common/shape.cc @@ -14,6 +14,11 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/delegates/gpu/common/shape.h" +#include + +#include +#include + #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" diff --git a/tensorflow/lite/delegates/gpu/common/shape.h b/tensorflow/lite/delegates/gpu/common/shape.h index 544d2c1f4d0..a017ff28e63 100644 --- a/tensorflow/lite/delegates/gpu/common/shape.h +++ b/tensorflow/lite/delegates/gpu/common/shape.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_SHAPE_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_SHAPE_H_ -#include +#include +#include -#include #include #include #include @@ -26,8 +26,6 @@ limitations under the License. #include #include -#include "absl/hash/hash.h" - namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/shape_test.cc b/tensorflow/lite/delegates/gpu/common/shape_test.cc index 41519115729..3cbf1fddfc2 100644 --- a/tensorflow/lite/delegates/gpu/common/shape_test.cc +++ b/tensorflow/lite/delegates/gpu/common/shape_test.cc @@ -14,10 +14,10 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/delegates/gpu/common/shape.h" -#include +#include + #include -#include #include namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/tensor.h b/tensorflow/lite/delegates/gpu/common/tensor.h index fc39d3485ba..ba0fd48810c 100644 --- a/tensorflow/lite/delegates/gpu/common/tensor.h +++ b/tensorflow/lite/delegates/gpu/common/tensor.h @@ -16,7 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TENSOR_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TENSOR_H_ -#include +#include + #include #include "tensorflow/lite/delegates/gpu/common/data_type.h" @@ -71,6 +72,10 @@ struct TensorRef { // Opaque reference to a tensor. Upstream component is responsible for // resolving this reference into an actual tensor. int64_t ref = -1; + + // Specifies if the tensor should be a variable input tensor that must be an + // output as well as an input to the graph. + bool is_variable_input = false; }; template diff --git a/tensorflow/lite/delegates/gpu/common/testing/BUILD b/tensorflow/lite/delegates/gpu/common/testing/BUILD index a7f97eb67b3..dd8792d6895 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/BUILD +++ b/tensorflow/lite/delegates/gpu/common/testing/BUILD @@ -10,6 +10,8 @@ cc_library( hdrs = ["interpreter_utils.h"], deps = [ "//tensorflow/lite:framework", + "//tensorflow/lite:string", + "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:tensor", @@ -25,13 +27,12 @@ cc_library( hdrs = ["tflite_model_reader.h"], deps = [ "//tensorflow/lite:framework_lib", - "//tensorflow/lite:kernel_api", "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_builder", + "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", - "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/delegates/gpu/common/transformations:model_transformations", ], ) diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD index b5ceff30d1e..50150964e92 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD @@ -24,10 +24,12 @@ cc_library( hdrs = ["utils.h"], deps = [ "//tensorflow/lite:framework", + "//tensorflow/lite:string", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@com_google_googletest//:gtest", ], ) @@ -48,6 +50,7 @@ cc_test( deps = [ ":feature_parity", ":utils", + "//tensorflow/lite:framework_lib", "//tensorflow/lite/delegates/gpu:gl_delegate", "@com_google_googletest//:gtest_main", ], @@ -65,6 +68,7 @@ cc_test( deps = [ ":feature_parity", ":utils", + "//tensorflow/lite:framework_lib", "//tensorflow/lite/delegates/gpu:delegate", "@com_google_googletest//:gtest_main", ], @@ -82,6 +86,7 @@ cc_test( deps = [ ":feature_parity", ":utils", + "//tensorflow/lite:framework_lib", "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", "@com_google_googletest//:gtest_main", ], diff --git 
a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h index 7661a4ad296..dacb486e303 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h @@ -16,9 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TESTING_FEATURE_PARITY_FEATURE_PARITY_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TESTING_FEATURE_PARITY_FEATURE_PARITY_H_ -#include -#include -#include #include #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/add.h" diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/BUILD b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/BUILD index 4fef0a28525..56894c8810a 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/BUILD +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/BUILD @@ -20,9 +20,7 @@ cc_library( srcs = ["add.cc"], hdrs = ["add.h"], deps = [ - "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", - "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common/testing/feature_parity:utils", "@flatbuffers", ], diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/add.cc b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/add.cc index dbb3851ca56..06649b36e79 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/add.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/add.cc @@ -15,11 +15,14 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/generators/add.h" +#include + +#include +#include #include -#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h" -#include "tensorflow/lite/model.h" #include "tensorflow/lite/version.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opencl_test.cc b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opencl_test.cc index 24c0e0c424b..3dbb8638196 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opencl_test.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opencl_test.cc @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include #include #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h" #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h" #include "tensorflow/lite/delegates/gpu/delegate.h" +#include "tensorflow/lite/interpreter.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opengl_test.cc b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opengl_test.cc index 2f403d2e583..ed0aa104e65 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opengl_test.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/opengl_test.cc @@ -15,12 +15,14 @@ limitations under the License. 
#include #include +#include #include #include #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h" #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h" #include "tensorflow/lite/delegates/gpu/gl_delegate.h" +#include "tensorflow/lite/interpreter.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc index bdcbf7ed62e..6eb94f63b6f 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc @@ -15,15 +15,18 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h" +#include +#include #include #include +#include #include "absl/status/status.h" #include "absl/strings/substitute.h" -#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" +#include "tensorflow/lite/string_type.h" std::ostream& operator<<(std::ostream& os, const TfLiteTensor& tensor) { std::string shape; diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h index 7c34978fb55..20d43b85468 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h @@ -16,14 +16,24 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TESTING_FEATURE_PARITY_UTILS_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TESTING_FEATURE_PARITY_UTILS_H_ +#include + #include +#include +#include #include #include +#include +#include #include #include +#include #include "absl/status/status.h" +#include "absl/types/span.h" +#include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/model.h" +#include "tensorflow/lite/string_type.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/xnnpack_test.cc b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/xnnpack_test.cc index 3d05d64437d..bdd12951c8c 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/xnnpack_test.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/xnnpack_test.cc @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + +#include +#include + #include #include #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/feature_parity.h" #include "tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" +#include "tensorflow/lite/interpreter.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.cc b/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.cc index 08d9448f7e5..ae00e213fa3 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.cc @@ -16,15 +16,18 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.h" #include +#include +#include #include #include "absl/memory/memory.h" -#include "tensorflow/lite/context.h" #include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/string_type.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.h b/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.h index ca2825b7563..86656abbe0f 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.h +++ b/tensorflow/lite/delegates/gpu/common/testing/interpreter_utils.h @@ -18,7 +18,7 @@ limitations under the License. #include -#include "tensorflow/lite/context.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" diff --git a/tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.cc b/tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.cc index a67602cf245..7ba3de641ef 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.cc @@ -14,16 +14,18 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/delegates/gpu/common/testing/tflite_model_reader.h" +#include + #include -#include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_builder.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h" +#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h" #include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" #include "tensorflow/lite/model_builder.h" namespace tflite { @@ -93,8 +95,8 @@ absl::Status BuildFromFlatBuffer(const tflite::FlatBufferModel& flatbuffer, NullTransformationReporter reporter; ModelTransformer transformer(graph, &reporter); - if (!ApplyGeneralTransformations(&transformer)) { - return absl::InternalError("Graph general transformations failed"); + if (!ApplyModelTransformations(&transformer)) { + return absl::InternalError("Graph transformations failed"); } return absl::OkStatus(); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/BUILD b/tensorflow/lite/delegates/gpu/common/transformations/BUILD index bf26b03f534..6cb358bcc93 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/BUILD +++ b/tensorflow/lite/delegates/gpu/common/transformations/BUILD @@ -1,3 +1,5 @@ +load("//tensorflow/core/platform:build_config.bzl", "tf_platform_alias") + package( default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 @@ -12,9 +14,9 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", 
"//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", "@com_google_absl//absl/types:any", ], ) @@ -24,11 +26,11 @@ cc_library( srcs = ["add_quant_adjustments.cc"], hdrs = ["add_quant_adjustments.h"], deps = [ - "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:any", @@ -40,10 +42,13 @@ cc_test( srcs = ["add_quant_adjustments_test.cc"], deps = [ ":add_quant_adjustments", + "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_absl//absl/types:any", "@com_google_absl//absl/types:optional", "@com_google_googletest//:gtest_main", @@ -56,9 +61,13 @@ cc_library( hdrs = ["fuse_add_to_conv.h"], deps = [ "//tensorflow/lite/delegates/gpu/common:data_type", + "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/strings", ], ) @@ -67,8 +76,13 @@ cc_test( srcs = ["fuse_add_to_conv_test.cc"], deps = [ ":fuse_add_to_conv", + "//tensorflow/lite/delegates/gpu/common:data_type", + "//tensorflow/lite/delegates/gpu/common:model", + "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) @@ -82,8 +96,10 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/strings", ], ) @@ -92,17 +108,21 @@ cc_test( srcs = ["fuse_mul_to_conv_test.cc"], deps = [ ":fuse_mul_to_conv", + "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:model", + "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) cc_library( - name = "general_transformations", - srcs = ["general_transformations.cc"], - hdrs = ["general_transformations.h"], + name = "model_transformations", + srcs = ["model_transformations.cc"], + hdrs = ["model_transformations.h"], deps = [ ":add_quant_adjustments", 
":fuse_add_to_conv", @@ -112,7 +132,7 @@ cc_library( ":merge_padding_with", ":remove_noop", "//tensorflow/lite/delegates/gpu/common:model_transformer", - ], + ] + tf_platform_alias("custom_transformations", "//tensorflow/lite/delegates/gpu/common/"), ) cc_library( @@ -123,7 +143,8 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:any", ], @@ -138,6 +159,8 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_absl//absl/types:any", "@com_google_googletest//:gtest_main", ], @@ -151,8 +174,11 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:any", ], ) @@ -165,6 +191,9 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_absl//absl/types:any", "@com_google_googletest//:gtest_main", ], @@ -173,7 +202,6 @@ cc_test( cc_library( name = "matching", hdrs = ["matching.h"], - deps = ["//tensorflow/lite/delegates/gpu/common:model"], ) cc_library( @@ -186,7 +214,9 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:any", @@ -198,10 +228,13 @@ cc_test( srcs = ["merge_padding_with_test.cc"], deps = [ ":merge_padding_with", + "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_absl//absl/types:any", "@com_google_googletest//:gtest_main", ], @@ -216,8 +249,11 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:tensor", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", ], ) @@ -230,6 +266,9 @@ cc_test( "//tensorflow/lite/delegates/gpu/common:model", 
"//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:shape", + "//tensorflow/lite/delegates/gpu/common:tensor", + "@com_google_absl//absl/status", "@com_google_googletest//:gtest_main", ], ) diff --git a/tensorflow/lite/delegates/gpu/common/transformations/add_bias.cc b/tensorflow/lite/delegates/gpu/common/transformations/add_bias.cc index 29d70d8f4a9..af274d8381e 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/add_bias.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/add_bias.cc @@ -15,13 +15,18 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/add_bias.h" +#include +#include +#include + #include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" #include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -65,6 +70,12 @@ class AddBias : public NodeTransformation { } if (node->operation.type == ToString(OperationType::DEPTHWISE_CONVOLUTION)) { + if (graph->FindInputs(node->id).size() != 1) { + return {TransformStatus::DECLINED, + "This transformation is only applicable to depth wise conv " + "with one " + "runtime input."}; + } auto& attr = absl::any_cast( node->operation.attributes); return FillBias(attr.weights.shape.o * attr.weights.shape.i, &attr.bias); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.cc b/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.cc index 6262d1575b7..7f43d70c842 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.cc @@ -15,15 +15,19 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.h" +#include +#include #include +#include #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/types/any.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments_test.cc index 2ff84981f9d..9ef909d4ab7 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments_test.cc @@ -15,14 +15,20 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.h" -#include +#include +#include +#include + #include +#include "absl/status/status.h" #include "absl/types/any.h" #include "absl/types/optional.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -59,7 +65,7 @@ TEST(AddQuantAdjustments, OneNode) { ASSERT_TRUE(graph.AddConsumer(add_node->id, input->id).ok()); - Value* output; + Value* output = nullptr; AddQuantParams(&input->quant_params, /*min=*/0.0, /*max=*/2.0, /*scale=*/0.008); ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); @@ -114,18 +120,18 @@ TEST(AddQuantAdjustments, GeneralCase) { // Connections. ASSERT_TRUE(graph.AddConsumer(add1_node->id, input->id).ok()); - Value* link1; + Value* link1 = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, add1_node, quant_node, &link1).ok()); AddQuantParams(&link1->quant_params, /*min=*/0.0, /*max=*/2.0, /*scale=*/0.008); link1->tensor.shape = BHWC(1, 4, 4, 8); ASSERT_TRUE(graph.AddConsumer(add2_node->id, link1->id).ok()); - Value* link2; + Value* link2 = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, quant_node, add2_node, &link2).ok()); AddQuantParams(&link2->quant_params, /*min=*/-1.0, /*max=*/1.0, /*scale=*/0.008); link2->tensor.shape = BHWC(1, 4, 4, 8); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add2_node, &output).ok()); AddQuantParams(&output->quant_params, /*min=*/-1.0, /*max=*/1.0, /*scale=*/0.008); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc index fdbd6e03755..62c3ec39854 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc @@ -15,8 +15,20 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h" +#include +#include +#include +#include +#include + +#include "absl/strings/string_view.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -42,6 +54,10 @@ class MergeConvolutionWithAdd : public SequenceTransformation { TransformResult ApplyToNodesSequence(const std::vector& sequence, GraphFloat32* graph) final { auto& conv_node = *sequence[0]; + if (graph->FindInputs(conv_node.id).size() != 1) { + return {TransformStatus::DECLINED, + "This fusion is only applicable to ops with one runtime input."}; + } auto& add_node = *sequence[1]; if (add_node.operation.type != ToString(OperationType::ADD)) { return {TransformStatus::SKIPPED, ""}; diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h index 53a0cef63c8..26f93dc3765 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h @@ -20,7 +20,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc index 4a48c7c0b28..76bf7e4a72a 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv_test.cc @@ -15,10 +15,20 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h" +#include +#include +#include +#include + #include #include +#include "absl/status/status.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" using ::testing::FloatNear; using ::testing::Pointwise; @@ -57,11 +67,11 @@ TEST(MergeConvolutionWithAddTest, Smoke) { ASSERT_TRUE(graph.AddConsumer(conv_node->id, input->id).ok()); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); output->tensor.shape = BHWC(1, 4, 4, 16); - Value* link1; + Value* link1 = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, conv_node, add_node, &link1).ok()); link1->tensor.shape = BHWC(1, 4, 4, 16); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc index 25ec6299f11..41bd485a76c 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc @@ -15,9 +15,18 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h" +#include +#include +#include +#include +#include + +#include "absl/strings/string_view.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h index 8d64ae50488..92fab4553f1 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h @@ -20,7 +20,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv_test.cc index ea990dd8267..b35cb832335 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv_test.cc @@ -15,11 +15,20 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h" +#include +#include +#include +#include + #include #include +#include "absl/status/status.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" using ::testing::FloatNear; using ::testing::Pointwise; @@ -58,11 +67,11 @@ TEST(MergeConvolutionWithMulTest, Smoke) { ASSERT_TRUE(graph.AddConsumer(conv_node->id, input->id).ok()); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, mul_node, &output).ok()); output->tensor.shape = BHWC(1, 4, 4, 16); - Value* link1; + Value* link1 = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, conv_node, mul_node, &link1).ok()); link1->tensor.shape = BHWC(1, 4, 4, 16); @@ -109,11 +118,11 @@ TEST(MergeMulWithConvolutionTest, Smoke) { ASSERT_TRUE(graph.AddConsumer(mul_node->id, input->id).ok()); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, conv_node, &output).ok()); output->tensor.shape = BHWC(1, 4, 4, 16); - Value* link1; + Value* link1 = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, mul_node, conv_node, &link1).ok()); link1->tensor.shape = BHWC(1, 4, 4, 16); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.cc b/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.cc index 1236cdec214..226e7d4b2a9 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.cc @@ -15,11 +15,17 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.h" +#include +#include +#include + #include "absl/memory/memory.h" #include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected_test.cc index d3606d4a097..29f1b4bfbef 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected_test.cc @@ -15,13 +15,18 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/make_fully_connected.h" -#include +#include +#include +#include + #include +#include "absl/status/status.h" #include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -68,16 +73,16 @@ TEST(MakeFullyConnected, Smoke) { ASSERT_TRUE(graph.AddConsumer(conv1x1_node0->id, input->id).ok()); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, conv1x1_node2, &output).ok()); output->tensor.shape = BHWC(1, 1, 1, 32); - Value* link1; + Value* link1 = nullptr; ASSERT_TRUE( ConnectTwoNodes(&graph, conv1x1_node0, conv4x4_node1, &link1).ok()); link1->tensor.shape = BHWC(1, 4, 4, 16); - Value* link2; + Value* link2 = nullptr; ASSERT_TRUE( ConnectTwoNodes(&graph, conv4x4_node1, conv1x1_node2, &link2).ok()); link2->tensor.shape = BHWC(1, 1, 1, 16); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/make_padding.cc b/tensorflow/lite/delegates/gpu/common/transformations/make_padding.cc index 17aac83baf7..51335a83c38 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/make_padding.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/make_padding.cc @@ -15,11 +15,19 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/make_padding.h" +#include +#include +#include + #include "absl/memory/memory.h" +#include "absl/strings/string_view.h" #include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/make_padding_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/make_padding_test.cc index f8be3218239..8aafd75ba5b 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/make_padding_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/make_padding_test.cc @@ -15,12 +15,18 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/make_padding.h" -#include +#include +#include +#include + #include +#include "absl/status/status.h" #include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -38,7 +44,7 @@ TEST(MakePadding, Smoke) { attr.axis = Axis::HEIGHT; concat_node->operation.attributes = attr; - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, concat_node, &output).ok()); output->tensor.shape = BHWC(1, 7, 3, 5); @@ -50,7 +56,7 @@ TEST(MakePadding, Smoke) { std::vector(const_attr.tensor.shape.DimensionsProduct(), 0); const_node->operation.attributes = const_attr; - Value* const_link; + Value* const_link = nullptr; ASSERT_TRUE( ConnectTwoNodes(&graph, const_node, concat_node, &const_link).ok()); const_link->tensor.shape = const_attr.tensor.shape; diff --git a/tensorflow/lite/delegates/gpu/common/transformations/matching.h b/tensorflow/lite/delegates/gpu/common/transformations/matching.h index 0dfd21e50ba..b28c8b05fed 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/matching.h +++ b/tensorflow/lite/delegates/gpu/common/transformations/matching.h @@ -18,9 +18,10 @@ limitations under the License. // A file provides predicates to match subgraphs. +#include +#include #include - -#include "tensorflow/lite/delegates/gpu/common/model.h" +#include namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc b/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc index 6a4e24b5042..509d715f550 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc @@ -15,16 +15,22 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h" +#include #include +#include #include #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "absl/types/any.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/delegates/gpu/common/transformations/matching.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with_test.cc index 40029efbc65..826a9b82854 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with_test.cc @@ -15,13 +15,19 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.h" -#include +#include +#include +#include + #include +#include "absl/status/status.h" #include "absl/types/any.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -40,7 +46,7 @@ TEST(MergePaddingWith, Smoke) { pad_node->operation.attributes = attr; auto conv_node = graph.NewNode(); - Value* temp; + Value* temp = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node, conv_node, &temp).ok()); ASSERT_TRUE(AddOutput(&graph, conv_node, &temp).ok()); conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D); @@ -77,16 +83,17 @@ TEST(MergePaddingWith, MergeTwo) { pad_node1->operation.attributes = attr; auto pad_node2 = graph.NewNode(); - Value* temp; - ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node1, pad_node2, &temp).ok()); + Value* temp1 = nullptr; + ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node1, pad_node2, &temp1).ok()); pad_node2->operation.type = ToString(OperationType::PAD); attr.prepended = BHWC(0, 0, 0, 0); attr.appended = BHWC(0, 2, 2, 0); pad_node2->operation.attributes = attr; auto conv_node = graph.NewNode(); - ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node2, conv_node, &temp).ok()); - ASSERT_TRUE(AddOutput(&graph, conv_node, &temp).ok()); + Value* temp2 = nullptr; + ASSERT_TRUE(ConnectTwoNodes(&graph, pad_node2, conv_node, &temp2).ok()); + ASSERT_TRUE(AddOutput(&graph, conv_node, &temp2).ok()); conv_node->operation.type = ToString(OperationType::CONVOLUTION_2D); Convolution2DAttributes conv_attr; conv_attr.padding.appended = HW(0, 0); diff --git a/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc b/tensorflow/lite/delegates/gpu/common/transformations/model_transformations.cc similarity index 87% rename from tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc rename to tensorflow/lite/delegates/gpu/common/transformations/model_transformations.cc index f9ae7f41f8f..d1a6cf127f5 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/model_transformations.cc @@ -13,8 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h" +#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h" +#include + +#include "tensorflow/lite/delegates/gpu/common/custom_transformations.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/transformations/add_quant_adjustments.h" #include "tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.h" #include "tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.h" @@ -26,6 +30,8 @@ limitations under the License. 
namespace tflite { namespace gpu { +namespace { + bool ApplyGeneralTransformations(ModelTransformer* transformer) { // whenever any of these transforms return false, that means that a graph // is in the broken state and processing should not continue. @@ -57,5 +63,12 @@ bool ApplyGeneralTransformations(ModelTransformer* transformer) { NewMergeMulWithConvolution().get()); } +} // namespace + +bool ApplyModelTransformations(ModelTransformer* transformer) { + return ApplyCustomTransformations(transformer) && + ApplyGeneralTransformations(transformer); +} + } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h b/tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h similarity index 89% rename from tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h rename to tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h index ffc5bba4f1a..69592c9777b 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h +++ b/tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h @@ -21,8 +21,9 @@ limitations under the License. namespace tflite { namespace gpu { +// Applies custom and general transformations to the model in the proper order. // @return false when something went wrong that turned a graph in a broken state -bool ApplyGeneralTransformations(ModelTransformer* transformer); +bool ApplyModelTransformations(ModelTransformer* transformer); } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc b/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc index 6cc370899e4..a97d9185c71 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc @@ -15,14 +15,25 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/transformations/remove_noop.h" +#include +#include +#include +#include +#include #include +#include +#include #include #include "absl/memory/memory.h" +#include "absl/strings/string_view.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" +#include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -118,7 +129,7 @@ class RemoveIdentityReshape : public NodeTransformation { return {TransformStatus::SKIPPED, "Can not apply transformation when node output is graph output"}; } - absl::Status status = RemoveOneInputOneOutputNode(graph, node); + absl::Status status = RemoveSimpleNodeKeepInput(graph, node); if (!status.ok()) { return {TransformStatus::INVALID, "Unable to remove a node: " + std::string(status.message())}; diff --git a/tensorflow/lite/delegates/gpu/common/transformations/remove_noop_test.cc b/tensorflow/lite/delegates/gpu/common/transformations/remove_noop_test.cc index a6aafee4f06..b76962d3ecb 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/remove_noop_test.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/remove_noop_test.cc @@ -15,12 +15,20 @@ limitations under the License. 
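// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch: with the rename above,
// ApplyGeneralTransformations() becomes an internal detail and callers switch
// to ApplyModelTransformations(), which runs the custom transformations first
// and the general ones afterwards. The wrapper function below is hypothetical;
// the types and the error message mirror the gl_delegate.cc hunk later in
// this diff.
#include "tensorflow/lite/delegates/gpu/common/model.h"
#include "tensorflow/lite/delegates/gpu/common/model_transformer.h"
#include "tensorflow/lite/delegates/gpu/common/status.h"
#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h"

namespace tflite {
namespace gpu {

// Hypothetical helper showing the expected call pattern after the rename.
absl::Status OptimizeGraph(GraphFloat32* graph) {
  NullTransformationReporter reporter;
  ModelTransformer transformer(graph, &reporter);
  if (!ApplyModelTransformations(&transformer)) {
    return absl::InternalError("Graph transformations failed");
  }
  return absl::OkStatus();
}

}  // namespace gpu
}  // namespace tflite
// ---------------------------------------------------------------------------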
#include "tensorflow/lite/delegates/gpu/common/transformations/remove_noop.h" +#include +#include +#include +#include + #include #include +#include "absl/status/status.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" namespace tflite { namespace gpu { @@ -35,12 +43,12 @@ TEST(RemoveSingleInputAdd, Smoke) { ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok()); auto add_node = graph.NewNode(); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); add_node->operation.type = ToString(OperationType::ADD); add_node->operation.attributes = ElementwiseAttributes(); - Value* temp; + Value* temp = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, add_node, &temp).ok()); ASSERT_EQ(2, graph.nodes().size()); ASSERT_EQ(3, graph.values().size()); @@ -63,14 +71,14 @@ TEST(RemoveSingleInputAdd, DoNotTrigger_TensorHWC) { ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok()); auto add_node = graph.NewNode(); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); add_node->operation.type = ToString(OperationType::ADD); ElementwiseAttributes attr; attr.param = Tensor(); add_node->operation.attributes = attr; - Value* temp; + Value* temp = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, add_node, &temp).ok()); ASSERT_EQ(2, graph.nodes().size()); ASSERT_EQ(3, graph.values().size()); @@ -90,14 +98,14 @@ TEST(RemoveSingleInputAdd, DoNotTrigger_LinearTensor) { ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok()); auto add_node = graph.NewNode(); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); add_node->operation.type = ToString(OperationType::ADD); ElementwiseAttributes attr; attr.param = Tensor(); add_node->operation.attributes = attr; - Value* temp; + Value* temp = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, add_node, &temp).ok()); ASSERT_EQ(2, graph.nodes().size()); ASSERT_EQ(3, graph.values().size()); @@ -117,14 +125,14 @@ TEST(RemoveSingleInputAdd, DoNotTrigger_Scalar) { ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok()); auto add_node = graph.NewNode(); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); add_node->operation.type = ToString(OperationType::ADD); ElementwiseAttributes attr; attr.param = 0.5f; add_node->operation.attributes = attr; - Value* temp; + Value* temp = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, add_node, &temp).ok()); ASSERT_EQ(2, graph.nodes().size()); ASSERT_EQ(3, graph.values().size()); @@ -146,13 +154,14 @@ TEST(RemoveSingleInputAdd, DoNotTrigger_Multiple) { ASSERT_TRUE(graph.AddConsumer(node_b->id, input->id).ok()); auto add_node = graph.NewNode(); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, add_node, &output).ok()); add_node->operation.type = ToString(OperationType::ADD); - Value* temp; - ASSERT_TRUE(ConnectTwoNodes(&graph, node_a, add_node, &temp).ok()); - ASSERT_TRUE(ConnectTwoNodes(&graph, node_b, add_node, &temp).ok()); + Value* temp_a = nullptr; + Value* temp_b = nullptr; + ASSERT_TRUE(ConnectTwoNodes(&graph, node_a, add_node, &temp_a).ok()); + 
ASSERT_TRUE(ConnectTwoNodes(&graph, node_b, add_node, &temp_b).ok()); ASSERT_EQ(3, graph.nodes().size()); ASSERT_EQ(4, graph.values().size()); @@ -171,7 +180,7 @@ TEST(RemoveDegenerateUpsampling, Smoke) { ASSERT_TRUE(graph.AddConsumer(first_node->id, input->id).ok()); auto node_to_remove = graph.NewNode(); - Value* output; + Value* output = nullptr; ASSERT_TRUE(AddOutput(&graph, node_to_remove, &output).ok()); output->tensor.shape = BHWC(1, 5, 5, 1); node_to_remove->operation.type = ToString(OperationType::RESIZE); @@ -180,7 +189,7 @@ TEST(RemoveDegenerateUpsampling, Smoke) { attr.type = SamplingType::BILINEAR; node_to_remove->operation.attributes = attr; - Value* link; + Value* link = nullptr; ASSERT_TRUE(ConnectTwoNodes(&graph, first_node, node_to_remove, &link).ok()); link->tensor.shape = output->tensor.shape; ASSERT_EQ(2, graph.nodes().size()); diff --git a/tensorflow/lite/delegates/gpu/common/types.h b/tensorflow/lite/delegates/gpu/common/types.h index 8725b4234fe..4ddb46f305d 100644 --- a/tensorflow/lite/delegates/gpu/common/types.h +++ b/tensorflow/lite/delegates/gpu/common/types.h @@ -19,7 +19,6 @@ limitations under the License. #include #include #include -#include #include diff --git a/tensorflow/lite/delegates/gpu/common/winograd_util.cc b/tensorflow/lite/delegates/gpu/common/winograd_util.cc index 16be80eef41..4b9581d0f39 100644 --- a/tensorflow/lite/delegates/gpu/common/winograd_util.cc +++ b/tensorflow/lite/delegates/gpu/common/winograd_util.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/winograd_util.h" +#include +#include + #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" diff --git a/tensorflow/lite/delegates/gpu/common/winograd_util.h b/tensorflow/lite/delegates/gpu/common/winograd_util.h index 2e80a6ce121..e88ceacb490 100644 --- a/tensorflow/lite/delegates/gpu/common/winograd_util.h +++ b/tensorflow/lite/delegates/gpu/common/winograd_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_WINOGRAD_UTIL_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_WINOGRAD_UTIL_H_ +#include + #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" diff --git a/tensorflow/lite/delegates/gpu/common/workgroup_selection.cc b/tensorflow/lite/delegates/gpu/common/workgroup_selection.cc index 5ae2a53f449..439eb0ade90 100644 --- a/tensorflow/lite/delegates/gpu/common/workgroup_selection.cc +++ b/tensorflow/lite/delegates/gpu/common/workgroup_selection.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/workgroup_selection.h" +#include + #include +#include #include "tensorflow/lite/delegates/gpu/common/util.h" diff --git a/tensorflow/lite/delegates/gpu/common/workgroup_selection.h b/tensorflow/lite/delegates/gpu/common/workgroup_selection.h index a08bfce991a..67c51b45177 100644 --- a/tensorflow/lite/delegates/gpu/common/workgroup_selection.h +++ b/tensorflow/lite/delegates/gpu/common/workgroup_selection.h @@ -18,9 +18,6 @@ limitations under the License. 
#include -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" - namespace tflite { namespace gpu { diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc index bfc2b7f08c4..98303b51da8 100644 --- a/tensorflow/lite/delegates/gpu/delegate.cc +++ b/tensorflow/lite/delegates/gpu/delegate.cc @@ -448,7 +448,7 @@ TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default() { .inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION, .inference_priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO, .inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO, - .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE, + .experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT, .max_delegated_partitions = 1, }; return options; diff --git a/tensorflow/lite/delegates/gpu/delegate.h b/tensorflow/lite/delegates/gpu/delegate.h index 9af586bfd75..40a06bb4384 100644 --- a/tensorflow/lite/delegates/gpu/delegate.h +++ b/tensorflow/lite/delegates/gpu/delegate.h @@ -51,6 +51,7 @@ enum TfLiteGpuInferencePriority { enum TfLiteGpuExperimentalFlags { TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE = 0, // Enables inference on quantized models with the delegate. + // NOTE: This is enabled in TfLiteGpuDelegateOptionsV2Default. TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT = 1 << 0, // Enforces execution with the provided backend. TFLITE_GPU_EXPERIMENTAL_FLAGS_CL_ONLY = 1 << 1, @@ -108,6 +109,8 @@ typedef struct { // priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION // priority2 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO // priority3 = TFLITE_GPU_INFERENCE_PRIORITY_AUTO +// experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT +// max_delegated_partitions = 1 TFL_CAPI_EXPORT TfLiteGpuDelegateOptionsV2 TfLiteGpuDelegateOptionsV2Default(); // Creates a new delegate instance that need to be destroyed with diff --git a/tensorflow/lite/delegates/gpu/gl/compiler/BUILD b/tensorflow/lite/delegates/gpu/gl/compiler/BUILD index f62f48750bd..801e87fd775 100644 --- a/tensorflow/lite/delegates/gpu/gl/compiler/BUILD +++ b/tensorflow/lite/delegates/gpu/gl/compiler/BUILD @@ -38,7 +38,6 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/gl:object", - "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:variant", @@ -88,6 +87,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/gl:compiler_options", "//tensorflow/lite/delegates/gpu/gl:object", "//tensorflow/lite/delegates/gpu/gl:variable", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h b/tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h index 5c4de49c44b..318709fe7ff 100644 --- a/tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h +++ b/tensorflow/lite/delegates/gpu/gl/compiler/object_accessor.h @@ -16,10 +16,10 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OBJECT_ACCESSOR_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_GL_COMPILER_OBJECT_ACCESSOR_H_ +#include #include #include -#include "absl/container/flat_hash_map.h" #include "tensorflow/lite/delegates/gpu/gl/compiler/preprocessor.h" #include "tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.h" #include "tensorflow/lite/delegates/gpu/gl/object.h" @@ -85,7 +85,7 @@ class ObjectAccessor : public InlineRewrite { RewriteStatus RewriteWrite(absl::string_view location, absl::string_view value, std::string* output); - absl::flat_hash_map name_to_object_; + std::map name_to_object_; const bool is_mali_; const bool sampler_textures_; diff --git a/tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.cc b/tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.cc index e473f9e77ff..34c24edc5a3 100644 --- a/tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.cc +++ b/tensorflow/lite/delegates/gpu/gl/compiler/shader_codegen.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "tensorflow/lite/delegates/gpu/common/gpu_info.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -48,6 +49,11 @@ absl::Status ShaderCodegen::Build(CompiledNodeAttributes attr, const auto add_uniform_parameter = [&](Variable&& variable) { const std::string name = variable.name; + const Variable& const_ref = variable; + if (variable_accessor.IsEmptyVariableLength(const_ref)) { + return absl::InvalidArgumentError( + absl::StrCat("Empty uniform vector value \"", name, "\"")); + } if (!variable_accessor.AddUniformParameter(std::move(variable))) { return absl::AlreadyExistsError( absl::StrCat("Uniform parameter \"", name, "\"")); diff --git a/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.cc b/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.cc index 96461f26ab8..2bb4a73c0ae 100644 --- a/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.cc +++ b/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.cc @@ -73,6 +73,21 @@ std::string GetVariableType(const Variable::ValueType& value) { return absl::visit(VariableTypeGetter(), value); } +struct LengthGetter { + template + int operator()(const T& param) const { + return 1; + } + template + int operator()(const std::vector& param) const { + return param.size(); + } +}; + +int GetLength(const Variable::ValueType& value) { + return absl::visit(LengthGetter(), value); +} + template void FormatValue(std::string* result, T t) { absl::StrAppend(result, t); @@ -459,6 +474,11 @@ bool VariableAccessor::AddUniformParameter(Variable&& variable) { return true; } +bool VariableAccessor::IsEmptyVariableLength(const Variable& variable) const { + const auto& value = variable.value; + return IsVariableLength(value) && GetLength(value) == 0; +} + std::string VariableAccessor::GetConstDeclarations() const { // Variable length variables are declared as const and accessed via variable // with index. diff --git a/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.h b/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.h index db4b031548b..f6d5344d3b3 100644 --- a/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.h +++ b/tensorflow/lite/delegates/gpu/gl/compiler/variable_accessor.h @@ -57,6 +57,9 @@ class VariableAccessor : public InlineRewrite { // Returns true if variable was successfully added. 
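// ---------------------------------------------------------------------------
// Editorial note with a hedged reconstruction, not part of the patch: this
// rendering of the diff has dropped the angle-bracket template arguments from
// the new LengthGetter visitor in variable_accessor.cc. Its intended shape is
// presumably the following (template parameter names are assumed); GetLength()
// feeds IsEmptyVariableLength(), which ShaderCodegen::Build now uses to reject
// empty uniform vectors before calling AddUniformParameter().
#include <vector>

#include "absl/types/variant.h"
#include "tensorflow/lite/delegates/gpu/gl/variable.h"

namespace tflite {
namespace gpu {
namespace gl {
namespace {

struct LengthGetter {
  template <typename T>
  int operator()(const T& param) const {
    return 1;  // scalar-valued variables count as length 1
  }
  template <typename T>
  int operator()(const std::vector<T>& param) const {
    return static_cast<int>(param.size());  // variable-length values
  }
};

int GetLength(const Variable::ValueType& value) {
  return absl::visit(LengthGetter(), value);
}

}  // namespace
}  // namespace gl
}  // namespace gpu
}  // namespace tflite
// ---------------------------------------------------------------------------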
bool AddUniformParameter(Variable&& variable); + // Returns true if variable value is an empty vector. + bool IsEmptyVariableLength(const Variable& variable) const; + // Returns const variables that need to be inlined in the a shader's code. std::string GetConstDeclarations() const; diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD index 8e13b58051b..a5d49b2c394 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD @@ -673,6 +673,8 @@ cc_library( "//tensorflow/lite/delegates/gpu/gl:request_gpu_info", "//tensorflow/lite/delegates/gpu/gl:runtime_options", "//tensorflow/lite/delegates/gpu/gl/workgroups:default_calculator", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_googletest//:gtest", "@com_google_googletest//:gtest_main", ], diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc index 71217a8e709..ceda5b68ca8 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc @@ -38,6 +38,10 @@ class DepthwiseConvolution : public NodeShader { public: absl::Status GenerateCode(const GenerationContext& ctx, GeneratedCode* generated_code) const final { + if (ctx.input_shapes.size() != 1) { + return absl::UnimplementedError( + "DepthWise Convolution does not support more than 1 runtime tensor"); + } const auto& attr = absl::any_cast(ctx.op_attr); auto weights = attr.weights.shape; diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc index 5d50fcc0118..9c874864bb1 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc @@ -69,6 +69,9 @@ class ElementwiseOneArgument : public NodeShader { value_0.w = value_0.w > 0.0 ? 
log(value_0.w) : nan; )"; break; + case OperationType::NEG: + source = "value_0 = -(value_0);"; + break; case OperationType::RSQRT: source = R"( const float nan = normalize(vec4(0, 0, 0, 0)).x; @@ -222,12 +225,13 @@ std::unique_ptr NewElementwiseNodeShader( OperationType operation_type) { switch (operation_type) { case OperationType::ABS: - case OperationType::COPY: case OperationType::COS: + case OperationType::COPY: case OperationType::ELU: case OperationType::EXP: - case OperationType::LOG: case OperationType::HARD_SWISH: + case OperationType::LOG: + case OperationType::NEG: case OperationType::RSQRT: case OperationType::SIGMOID: case OperationType::SIN: diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc index a32a4ea9f76..5ff7bfc9ed7 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise_test.cc @@ -129,6 +129,18 @@ TEST(ElementwiseOneArgumentTest, Log) { Pointwise(FloatNear(1e-6), {0.0, 1.14473, 0.0, 0.0})); } +TEST(ElementwiseOneArgumentTest, Neg) { + OperationType op_type = OperationType::NEG; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + ASSERT_TRUE(model.PopulateTensor(0, {1.0, -3.1415926, 0.0, 1.0})); + ASSERT_OK(model.Invoke(*NewElementwiseNodeShader(op_type))); + EXPECT_THAT(model.GetOutput(0), + Pointwise(FloatNear(1e-6), {-1.0, 3.1415926, 0.0, -1.0})); +} + TEST(ElementwiseOneArgumentTest, Rsqrt) { OperationType op_type = OperationType::RSQRT; const BHWC shape(1, 2, 2, 1); diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc index 645e5b6c728..efab4dd2274 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/registry.cc @@ -103,6 +103,7 @@ class Registry : public NodeShader { insert_elementwise_op(Type::EXP); insert_elementwise_op(Type::HARD_SWISH); insert_elementwise_op(Type::LOG); + insert_elementwise_op(Type::NEG); insert_elementwise_op(Type::MAXIMUM); insert_elementwise_op(Type::MINIMUM); insert_elementwise_op(Type::POW); @@ -125,17 +126,20 @@ class Registry : public NodeShader { absl::Status GenerateCode(const GenerationContext& ctx, GeneratedCode* generated_code) const final { - std::vector errors; auto it = shaders_.find(ctx.op_type); - if (it != shaders_.end()) { - for (auto& shader : it->second) { - const auto status = shader->GenerateCode(ctx, generated_code); - if (status.ok()) return status; - errors.push_back(std::string(status.message())); - } + if (it == shaders_.end()) { + return absl::NotFoundError( + absl::StrCat("No shader implementation for ", ctx.op_type)); } - return absl::NotFoundError(absl::StrCat( - "Suitable node shader is not found: ", absl::StrJoin(errors, ", "))); + std::vector errors; + for (const auto& shader : it->second) { + const auto status = shader->GenerateCode(ctx, generated_code); + // Return the first suitable shader. + if (status.ok()) return absl::OkStatus(); + errors.push_back(std::string(status.message())); + } + return errors.empty() ? 
absl::OkStatus() + : absl::UnknownError(absl::StrJoin(errors, ", ")); } private: diff --git a/tensorflow/lite/delegates/gpu/gl/variable.h b/tensorflow/lite/delegates/gpu/gl/variable.h index 1c5bb26db62..5237481f96e 100644 --- a/tensorflow/lite/delegates/gpu/gl/variable.h +++ b/tensorflow/lite/delegates/gpu/gl/variable.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include "absl/types/variant.h" diff --git a/tensorflow/lite/delegates/gpu/gl_delegate.cc b/tensorflow/lite/delegates/gpu/gl_delegate.cc index 2f25539802a..8b049d483b1 100644 --- a/tensorflow/lite/delegates/gpu/gl_delegate.cc +++ b/tensorflow/lite/delegates/gpu/gl_delegate.cc @@ -35,7 +35,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/shape.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/tensor.h" -#include "tensorflow/lite/delegates/gpu/common/transformations/general_transformations.h" +#include "tensorflow/lite/delegates/gpu/common/transformations/model_transformations.h" #include "tensorflow/lite/delegates/gpu/gl/api.h" #include "tensorflow/lite/delegates/gpu/gl/command_queue.h" #include "tensorflow/lite/delegates/gpu/gl/compiler.h" @@ -138,8 +138,8 @@ class Delegate { // Apply general transformations on the graph. NullTransformationReporter reporter; ModelTransformer transformer(&graph, &reporter); - if (!ApplyGeneralTransformations(&transformer)) { - return absl::InternalError("Graph general transformations failed"); + if (!ApplyModelTransformations(&transformer)) { + return absl::InternalError("Graph transformations failed"); } if (!env_) RETURN_IF_ERROR(EglEnvironment::NewEglEnvironment(&env_)); diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java index 78cab0d2cbf..5eb6881be88 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java +++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java @@ -68,7 +68,7 @@ public class GpuDelegate implements Delegate, Closeable { * *